src/operator/tensor/indexing_op.cc

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

/*!
 * \file indexing_op.cc
 * \brief CPU implementation of indexing operator
 * \author Siyi Li, Chi Zhang
 */

#include "./indexing_op.h"
namespace mxnet {
namespace op {

template <bool clip = true>
struct TakeZeroAxisCPU {
  /*!
   * \brief Copy one row of in_data, selected by idx[i], into row i of out_data.
   *
   * idx is treated as a flat 1-D tensor (N,); out_data and in_data are treated as
   * flat 2-D tensors of shape (N, M) and (K, M) respectively.
   * \param i         row of out_data handled by this invocation
   * \param out_data  ptr to output buffer
   * \param in_data   ptr to input buffer
   * \param idx       ptr to indices buffer
   * \param M         number of columns of in_data and out_data
   * \param K         number of rows of in_data
   */
  template <typename DType, typename IType>
  MSHADOW_XINLINE static void Map(index_t i,
                                  DType* out_data,
                                  const DType* in_data,
                                  const IType* idx,
                                  const size_t M,
                                  const int64_t K) {
    int64_t row = static_cast<int64_t>(idx[i]);
    if (clip) {
      // clip mode: saturate the row index at both ends of [0, K - 1]
      if (row <= 0) {
        row = 0;
      } else if (row >= K) {
        row = K - 1;
      }
    } else {
      // wrap mode: reduce modulo K and fold a negative remainder back into [0, K)
      row %= K;
      if (row < 0) {
        row += K;
      }
    }
    DType* dst       = out_data + i * M;
    const DType* src = in_data + row * M;
#pragma GCC diagnostic push
#if __GNUC__ >= 8
#pragma GCC diagnostic ignored "-Wclass-memaccess"
#endif
    std::memcpy(dst, src, M * sizeof(DType));
#pragma GCC diagnostic pop
  }
};

template <bool clip = true>
struct TakeNonzeroAxisCPU {
  /*!
   * \brief Map function for take operator along an axis other than 0.
   *
   * Each invocation handles one outer slice i and, for every index j, copies a
   * contiguous run of axis_dim_stride elements from
   * in_data[i * outer_dim_stride + index * axis_dim_stride] to
   * out_data[(i * idx_size + j) * axis_dim_stride].
   * \param i                 global thread id (outer slice handled by this call)
   * \param out_data          ptr to output buffer
   * \param in_data           ptr to input buffer
   * \param indices           ptr to indices buffer
   * \param outer_dim_stride  stride of dimension before axis
   * \param axis_dim_stride   stride of axis dimension
   * \param idx_size          size of the indices tensor
   * \param axis_dim          dim size of the axis dimension
   * \param axis              axis id (not used by the body)
   */
  template <typename DType, typename IType>
  MSHADOW_XINLINE static void Map(index_t i,
                                  DType* out_data,
                                  const DType* in_data,
                                  const IType* indices,
                                  const index_t outer_dim_stride,
                                  const index_t axis_dim_stride,
                                  const int idx_size,
                                  const int axis_dim,
                                  const int axis) {
    const int64_t dim = axis_dim;
    for (index_t j = 0; j < static_cast<index_t>(idx_size); ++j) {
      // Widen to 64 bits BEFORE clipping/wrapping (as TakeZeroAxisCPU does): narrowing the raw
      // index to int first would truncate large 64-bit indices and select the wrong element.
      int64_t index = static_cast<int64_t>(indices[j]);
      if (clip) {
        // clip mode: saturate into [0, axis_dim - 1]
        if (index < 0)
          index = 0;
        if (index > dim - 1)
          index = dim - 1;
      } else {
        // wrap mode: reduce modulo axis_dim and fold a negative remainder back into range
        index %= dim;
        index += (index < 0) ? dim : 0;
      }
      size_t in_offset  = i * outer_dim_stride + index * axis_dim_stride;
      size_t out_offset = (i * idx_size + j) * axis_dim_stride;
#pragma GCC diagnostic push
#if __GNUC__ >= 8
#pragma GCC diagnostic ignored "-Wclass-memaccess"
#endif
      std::memcpy(out_data + out_offset, in_data + in_offset, axis_dim_stride * sizeof(DType));
#pragma GCC diagnostic pop
    }
  }
};

/*!
 * \brief returns true if all indices are between [min, max]
 *
 * Note that, despite the name, a true result means the indices are VALID.
 * Values that do not compare as ordered (floating-point NaN) are treated as
 * out of bound.
 * \param data_ptr the indices to check
 * \param data_size the number of indices to examine
 * \param min the expected min value for indices
 * \param max the expected max value for indices
 * \return true when every examined index lies in [min, max] (also when
 *         data_size is 0), false as soon as one index falls outside
 */
template <typename DType>
bool CheckIndexOutOfBound(const DType* data_ptr,
                          size_t data_size,
                          const DType min,
                          const DType max) {
  for (size_t i = 0; i < data_size; ++i) {
    // Test positively for "inside the range": a NaN fails both comparisons and is
    // therefore rejected, whereas `x > max || x < min` would silently accept it.
    const bool in_range = (data_ptr[i] >= min) && (data_ptr[i] <= max);
    if (!in_range) {
      return false;
    }
  }
  return true;
}

// Embedding forward pass with a dense weight matrix (CPU specialization).
// The indices in `data` are flattened to 1-D, `output` is viewed as a 2-D
// (rows, embedding width) matrix, and each output row is gathered from `weight`
// with out-of-range indices clipped. `req` is not consulted by this function.
template <>
void EmbeddingOpForwardDnsImpl<cpu>(mshadow::Stream<cpu>* s,
                                    const TBlob& data,
                                    const TBlob& weight,
                                    const OpReqType req,
                                    const TBlob& output) {
  using namespace mxnet_op;
  const mxnet::TShape& ishape = data.shape_;
  const mxnet::TShape& oshape = output.shape_;

  MSHADOW_TYPE_SWITCH(output.type_flag_, DType, {
    MSHADOW_TYPE_SWITCH(data.type_flag_, IType, {
      // flat view of the indices
      Tensor<cpu, 1, IType> indices =
          data.get_with_shape<cpu, 1, IType>(Shape1(ishape.ProdShape(0, ishape.ndim())), s);
      // weight is used as a 2-D matrix as-is
      Tensor<cpu, 2, DType> weight_mat = weight.get<cpu, 2, DType>(s);
      // collapse every output axis but the last into a single row axis
      const int last_axis = oshape.ndim() - 1;
      Tensor<cpu, 2, DType> out_mat = output.get_with_shape<cpu, 2, DType>(
          Shape2(oshape.ProdShape(0, last_axis), oshape[last_axis]), s);
      // one kernel invocation per output row
      Kernel<TakeZeroAxisCPU<true>, cpu>::Launch(s,
                                                 oshape.Size() / weight_mat.shape_[1],
                                                 out_mat.dptr_,
                                                 weight_mat.dptr_,
                                                 indices.dptr_,
                                                 weight_mat.shape_[1],
                                                 weight_mat.shape_[0]);
    });
  });
}

// SparseEmbedding forward pass with a row-sparse weight (CPU specialization).
// Does nothing for kNullOp; writes zeros when the weight has no initialized
// storage and req is kWriteTo; otherwise validates the indices against
// [0, weight.shape()[0] - 1] and dispatches to the dense or row-sparse kernel.
template <>
void SparseEmbeddingOpForwardRspImpl<cpu>(const OpContext& ctx,
                                          const TBlob& data,
                                          const NDArray& weight,
                                          const OpReqType req,
                                          const TBlob& output) {
  if (req == kNullOp)
    return;
  using namespace rowsparse;
  using namespace mxnet_op;
  mshadow::Stream<cpu>* s = ctx.get_stream<cpu>();
  // weight without initialized storage: the output is filled with zeros
  if (req == kWriteTo && !weight.storage_initialized()) {
    MSHADOW_TYPE_SWITCH(output.type_flag_, DType, {
      const size_t out_size = output.shape_.Size();
      Fill<false>(
          s, TBlob(output.dptr<DType>(), mshadow::Shape1(out_size), cpu::kDevMask), kWriteTo, 0);
    })
    return;
  }
  // reject indices outside [0, number of weight rows - 1]
  MSHADOW_TYPE_SWITCH(data.type_flag_, IType, {
    // a single-threaded scan is used here since the index data is small
    const IType* indices     = data.dptr<IType>();
    const size_t num_indices = data.shape_.Size();
    const IType min          = 0;
    const IType max          = static_cast<IType>(weight.shape()[0] - 1);
    bool is_valid            = CheckIndexOutOfBound(indices, num_indices, min, max);
    CHECK(is_valid) << "SparseEmbedding input contains data out of bound";
  })
  // when every row is stored, the weight is actually dense
  const bool all_rows_stored = weight.aux_shape(kIdx)[0] == weight.shape()[0];
  if (all_rows_stored) {
    EmbeddingOpForwardDnsImpl<cpu>(s, data, weight.data(), req, output);
  } else {
    EmbeddingOpForwardRspImpl<cpu>(s, data, weight, req, output);
  }
}

template <bool clip>
struct CsrTakeDataKernel {
  /*!
   * \brief Map function copying the stored entries of one selected CSR row to the output
   * \param tid           global thread id (output row)
   * \param out_idx       ptr to out idx
   * \param out_data      ptr to out data
   * \param out_indptr    ptr to out indptr
   * \param src_idx       ptr to original csr idx
   * \param src_data      ptr to original csr data
   * \param src_indptr    ptr to original csr indptr
   * \param idx_ptr       ptr to indices
   * \param num_rows      maximum number of rows in src array
   */
  template <typename IType, typename DType, typename RType>
  MSHADOW_XINLINE static void Map(int tid,
                                  RType* out_idx,
                                  DType* out_data,
                                  const RType* out_indptr,
                                  const RType* src_idx,
                                  const DType* src_data,
                                  const RType* src_indptr,
                                  const IType* idx_ptr,
                                  const nnvm::dim_t num_rows) {
    nnvm::dim_t row = static_cast<nnvm::dim_t>(idx_ptr[tid]);
    if (clip) {
      // clip mode: saturate into [0, num_rows - 1]
      if (row < 0) {
        row = 0;
      }
      if (row >= num_rows) {
        row = num_rows - 1;
      }
    } else {
      // wrap mode: reduce modulo num_rows and fold a negative remainder back into range
      row %= num_rows;
      if (row < 0) {
        row += num_rows;
      }
    }
    // first stored entry of the source row and of the destination row
    const RType src_begin = src_indptr[row];
    const RType dst_begin = out_indptr[tid];
    const int row_nnz     = src_indptr[row + 1] - src_begin;
    for (int k = 0; k < row_nnz; ++k) {
      out_data[dst_begin + k] = src_data[src_begin + k];
      out_idx[dst_begin + k]  = src_idx[src_begin + k];
    }
  }
};

template <bool clip>
struct CsrTakeRowCountKernel {
  /*!
   * \brief Map function writing, for each output row, the number of stored entries of the
   *        source row it selects. Slot 0 is set to 0 and slot tid (tid > 0) receives the
   *        count for idx_ptr[tid - 1].
   * \param tid           global thread id
   * \param out_indptr    ptr to out indptr
   * \param src_indptr    ptr to original csr indptr
   * \param idx_ptr       ptr to indices
   * \param num_rows      maximum number of rows in src array
   */
  template <typename IType, typename RType>
  MSHADOW_XINLINE static void Map(int tid,
                                  RType* out_indptr,
                                  const RType* src_indptr,
                                  const IType* idx_ptr,
                                  const nnvm::dim_t num_rows) {
    if (tid != 0) {
      nnvm::dim_t row = static_cast<nnvm::dim_t>(idx_ptr[tid - 1]);
      if (clip) {
        // clip mode: saturate into [0, num_rows - 1]
        if (row < 0) {
          row = 0;
        }
        if (row >= num_rows) {
          row = num_rows - 1;
        }
      } else {
        // wrap mode: reduce modulo num_rows and fold a negative remainder back into range
        row %= num_rows;
        if (row < 0) {
          row += num_rows;
        }
      }
      out_indptr[tid] = src_indptr[row + 1] - src_indptr[row];
    } else {
      // leading slot of the row-pointer array
      out_indptr[0] = 0;
    }
  }
};

/*!
 * \brief CPU forward pass of take() on a CSR array: gathers whole rows of arr, selected by
 *        idx, into the CSR output out.
 *
 * Only one-dimensional indices, req == kWriteTo, axis == 0 and the clip / wrap modes are
 * accepted (enforced by the CHECKs below). The output row pointer is built first (per-row
 * counts followed by a serial prefix sum), then the column indices and values are copied.
 * \param params  take parameters (axis and mode are read)
 * \param ctx     operator context providing the cpu stream
 * \param idx     indices of the rows to take
 * \param arr     source CSR array
 * \param req     write request type
 * \param out     destination CSR array
 */
template <>
void TakeOpForwardCsrImpl<cpu>(const TakeParam& params,
                               const OpContext& ctx,
                               const TBlob& idx,
                               const NDArray& arr,
                               OpReqType req,
                               const NDArray& out) {
  using namespace csr;
  using namespace mxnet_op;
  using nnvm::dim_t;
  Stream<cpu>* s = ctx.get_stream<cpu>();
  if (req == kNullOp)
    return;
  // source without initialized storage: the result is an all-zero CSR array
  if (!arr.storage_initialized()) {
    FillZerosCsrImpl(s, out);
    return;
  }
  CHECK_EQ(idx.shape_.ndim(), 1U) << "Take with CSR array only supports one-dimensional indices. "
                                  << idx.shape_.ndim() << " dimensional input is given instead";
  CHECK_EQ(req, kWriteTo) << "req = " << req << " is not supported for take(csr)";
  auto axis = params.axis;
  CHECK_EQ(axis, 0) << "axis = " << axis << " is not supported for take(csr)";
  CHECK(params.mode == take_::kClip || params.mode == take_::kWrap)
      << "mode = " << params.mode << " is not supported";
  // num_rows: rows of the output; max_num_rows: rows of the source (bound for clip / wrap)
  const dim_t num_rows     = out.shape()[0];
  const dim_t max_num_rows = arr.shape()[0];
  // the row pointer array needs one entry more than there are output rows
  out.CheckAndAllocAuxData(kIndPtr, {Shape1(num_rows + 1)});

  MSHADOW_TYPE_SWITCH(idx.type_flag_, IType, {
    MSHADOW_TYPE_SWITCH(arr.dtype(), DType, {
      MSHADOW_IDX_TYPE_SWITCH(out.aux_type(kIdx), RType, {
        RType* out_indptr       = out.aux_data(kIndPtr).dptr<RType>();
        const RType* src_indptr = arr.aux_data(kIndPtr).dptr<RType>();
        const IType* idx_ptr    = idx.dptr<IType>();
        // gather per row nnz information for output
        // (slot 0 is set to 0, slot t > 0 receives the count for the row chosen by idx[t - 1])
        bool clip = params.mode == take_::kClip;
        if (clip) {
          Kernel<CsrTakeRowCountKernel<true>, cpu>::Launch(
              s, num_rows + 1, out_indptr, src_indptr, idx_ptr, max_num_rows);
        } else {
          Kernel<CsrTakeRowCountKernel<false>, cpu>::Launch(
              s, num_rows + 1, out_indptr, src_indptr, idx_ptr, max_num_rows);
        }
        // calculate prefix sum with single thread
        // (turns the per-row counts into row start offsets, in place)
        for (dim_t i = 0; i < num_rows; i++) {
          out_indptr[i + 1] += out_indptr[i];
        }
        // total number of stored (non-zero) entries in the output
        const dim_t nnz = out_indptr[num_rows];
        if (nnz == 0) {
          FillZerosCsrImpl(s, out);
          return;
        }
        // allocate the column-index and value arrays now that nnz is known
        out.CheckAndAllocAuxData(kIdx, {Shape1(nnz)});
        out.CheckAndAllocData(Shape1(nnz));
        RType* out_idx        = out.aux_data(kIdx).dptr<RType>();
        DType* out_data       = out.data().dptr<DType>();
        const RType* src_idx  = arr.aux_data(kIdx).dptr<RType>();
        const DType* src_data = arr.data().dptr<DType>();
        // copy indices and data for output
        // (one kernel invocation per output row)
        if (clip) {
          Kernel<CsrTakeDataKernel<true>, cpu>::Launch(s,
                                                       num_rows,
                                                       out_idx,
                                                       out_data,
                                                       out_indptr,
                                                       src_idx,
                                                       src_data,
                                                       src_indptr,
                                                       idx_ptr,
                                                       max_num_rows);
        } else {
          Kernel<CsrTakeDataKernel<false>, cpu>::Launch(s,
                                                        num_rows,
                                                        out_idx,
                                                        out_data,
                                                        out_indptr,
                                                        src_idx,
                                                        src_data,
                                                        src_indptr,
                                                        idx_ptr,
                                                        max_num_rows);
        }
      });
    });
  });
}

/*!
 * \brief CPU forward pass of the take operator on dense inputs.
 *
 * Returns immediately for kNullOp or when there are no indices. In raise mode
 * the indices are first validated against [0, arrshape[axis] - 1]; the gather
 * itself then uses the clipping kernel for clip mode and the wrapping kernel
 * for every other mode. Axis 0 is handled by TakeZeroAxisCPU, any other axis
 * by TakeNonzeroAxisCPU.
 * \param attrs    node attributes holding the parsed TakeParam
 * \param ctx      operator context providing the cpu stream
 * \param inputs   input blobs, addressed via take_::kArr and take_::kIdx
 * \param req      write request types, addressed via take_::kOut
 * \param outputs  output blobs, addressed via take_::kOut
 */
template <>
void TakeOpForward<cpu>(const nnvm::NodeAttrs& attrs,
                        const OpContext& ctx,
                        const std::vector<TBlob>& inputs,
                        const std::vector<OpReqType>& req,
                        const std::vector<TBlob>& outputs) {
  using namespace mxnet_op;

  if (req[take_::kOut] == kNullOp)
    return;
  const TakeParam& param = nnvm::get<TakeParam>(attrs.parsed);
  CHECK_EQ(inputs.size(), 2U);
  CHECK_EQ(outputs.size(), 1U);

  const TBlob& idx_blob = inputs[take_::kIdx];
  const TBlob& arr_blob = inputs[take_::kArr];
  const TBlob& out_blob = outputs[take_::kOut];

  const mxnet::TShape& idxshape = idx_blob.shape_;
  const mxnet::TShape& arrshape = arr_blob.shape_;
  const mxnet::TShape& oshape   = out_blob.shape_;

  // no indices, nothing to gather
  if (idxshape.Size() == 0) {
    return;
  }

  Stream<cpu>* s = ctx.get_stream<cpu>();
  // translate a negative axis into its non-negative equivalent
  const int actual_axis = (param.axis < 0) ? param.axis + arrshape.ndim() : param.axis;
  const bool clip_mode  = (param.mode == take_::kClip);

  MSHADOW_TYPE_SWITCH_WITH_BOOL(out_blob.type_flag_, DType, {    // output data type
    MSHADOW_TYPE_SWITCH_WITH_BOOL(idx_blob.type_flag_, IType, {  // index data type
      if (param.mode == take_::kRaise) {
        // check with single thread is faster since data is small
        const IType min = 0;
        const IType max = static_cast<IType>(arrshape[actual_axis] - 1);
        bool is_valid =
            CheckIndexOutOfBound(idx_blob.dptr<IType>(), idxshape.Size(), min, max);
        CHECK(is_valid) << "take operator contains indices out of bound";
      }
      DType* out_ptr = out_blob.dptr<DType>();
      DType* arr_ptr = arr_blob.dptr<DType>();
      IType* idx_ptr = idx_blob.dptr<IType>();
      if (actual_axis == 0) {
        // one kernel invocation per index; each copies a full row
        const auto num_idx = idxshape.Size();
        const auto row_len = oshape.Size() / idxshape.Size();
        if (clip_mode) {
          Kernel<TakeZeroAxisCPU<true>, cpu>::Launch(
              s, num_idx, out_ptr, arr_ptr, idx_ptr, row_len, arrshape[0]);
        } else {
          Kernel<TakeZeroAxisCPU<false>, cpu>::Launch(
              s, num_idx, out_ptr, arr_ptr, idx_ptr, row_len, arrshape[0]);
        }
      } else {
        // element strides of the source array, computed from the innermost axis outwards
        mshadow::Shape<10> in_strides;
        index_t running = 1;
        for (int d = arrshape.ndim() - 1; d >= 0; --d) {
          in_strides[d] = running;
          running *= arrshape[d];
        }
        // number of slices in front of the axis; one kernel invocation per slice
        int outer_dimensions = 1;
        for (int d = 0; d < actual_axis; ++d) {
          outer_dimensions *= oshape[d];
        }
        if (clip_mode) {
          Kernel<TakeNonzeroAxisCPU<true>, cpu>::Launch(s,
                                                        outer_dimensions,
                                                        out_ptr,
                                                        arr_ptr,
                                                        idx_ptr,
                                                        in_strides[actual_axis - 1],
                                                        in_strides[actual_axis],
                                                        idxshape.Size(),
                                                        arrshape[actual_axis],
                                                        actual_axis);
        } else {
          Kernel<TakeNonzeroAxisCPU<false>, cpu>::Launch(s,
                                                         outer_dimensions,
                                                         out_ptr,
                                                         arr_ptr,
                                                         idx_ptr,
                                                         in_strides[actual_axis - 1],
                                                         in_strides[actual_axis],
                                                         idxshape.Size(),
                                                         arrshape[actual_axis],
                                                         actual_axis);
        }
      }
    });
  });
}

/*!
 * \brief CPU backward pass of SparseEmbedding producing a row-sparse weight gradient.
 *
 * Only req == kWriteTo is supported (kNullOp returns immediately). The indices are
 * validated against [0, output.shape()[0] - 1], the referenced rows are flagged, an
 * inclusive prefix sum over the flags gives the number of non-zero rows, and the
 * gradient storage is allocated, zero-filled and accumulated into.
 * \param deterministic  not read by this specialization
 * \param ctx            operator context (stream and temp-space resource)
 * \param ograd          gradient with respect to the embedding output
 * \param data           the indices used in the forward pass
 * \param req            write request type
 * \param output         row-sparse gradient of the weight
 */
template <>
inline void SparseEmbeddingOpBackwardRspImpl<cpu>(const bool deterministic,
                                                  const OpContext& ctx,
                                                  const TBlob& ograd,
                                                  const TBlob& data,
                                                  const OpReqType req,
                                                  const NDArray& output) {
  using namespace mshadow;
  using namespace mxnet_op;
  using namespace mshadow::expr;
  using namespace rowsparse;
  using nnvm::dim_t;
  if (req == kNullOp)
    return;
  CHECK_EQ(req, kWriteTo) << "SparseEmbedding layer doesn't support "
                          << "weight gradient calculation with req != write";

  // Request temporary storage for marking non-zero rows and prefix sum
  // (one dim_t per row of the weight gradient)
  Stream<cpu>* s        = ctx.get_stream<cpu>();
  dim_t num_rows        = output.shape()[0];
  dim_t row_length      = output.shape()[1];
  size_t workspace_size = num_rows * sizeof(dim_t);
  Tensor<cpu, 1, char> workspace =
      ctx.requested[embedding::kTempSpace].get_space_typed<cpu, 1, char>(Shape1(workspace_size), s);
  dim_t* row_flg = reinterpret_cast<dim_t*>(workspace.dptr_);
  // prefix sum array re-uses the row_flg array temp space
  // (the flags are therefore overwritten by the in-place prefix sum below)
  dim_t* prefix_sum = row_flg;
  dim_t data_size   = static_cast<dim_t>(data.shape_.Size());

  MSHADOW_TYPE_SWITCH(data.type_flag_, IType, {
    MSHADOW_SGL_DBL_TYPE_SWITCH(ograd.type_flag_, DType, {
      MSHADOW_IDX_TYPE_SWITCH(output.aux_type(kIdx), RType, {
        // check out of bound indices
        {
          IType min = 0;
          IType max = static_cast<IType>(output.shape()[0] - 1);
          // check with single thread is faster since data is small
          IType* data_ptr = data.dptr<IType>();
          bool is_valid   = CheckIndexOutOfBound(data_ptr, data.shape_.Size(), min, max);
          CHECK(is_valid) << "Embedding input contains data out of bound";
        }
        // mark row flags
        // (flags are cleared first; MarkRowFlgKernel is defined elsewhere and presumably
        //  sets the flag of every row referenced by an index -- TODO confirm)
        Fill<false>(s, TBlob(row_flg, Shape1(num_rows), cpu::kDevMask), kWriteTo, 0);
        Kernel<MarkRowFlgKernel, cpu>::Launch(s, data_size, row_flg, data.dptr<IType>());
        // calculate inclusive prefix sum
        // TODO(haibin) ideally this should be done in parallel
        prefix_sum[0] = row_flg[0];
        for (dim_t i = 1; i < num_rows; i++) {
          prefix_sum[i] = prefix_sum[i - 1] + row_flg[i];
        }
        // total number of non-zero rows
        dim_t nnr = prefix_sum[num_rows - 1];
        if (nnr == 0) {
          FillZerosRspImpl(s, output);
          return;
        }
        // allocate storage for nnr rows of the gradient
        output.CheckAndAlloc({Shape1(nnr)});
        RType* grad_row_idx = output.aux_data(kIdx).dptr<RType>();
        // fill row_idx array of output matrix, using the row_flg values
        Kernel<FillRspRowIdxKernel, cpu>::Launch(s, num_rows, grad_row_idx, prefix_sum, num_rows);
        // prefill with zeros
        DType* grad_data = output.data().dptr<DType>();
        Fill<false>(s, TBlob(grad_data, Shape1(nnr * row_length), cpu::kDevMask), kWriteTo, 0);
        // add the final gradients
        // (the nnr rows are divided into num_threads segments of segment_len rows, rounded up;
        //  the kernel is launched with one invocation per thread)
        const int num_threads = engine::OpenMP::Get()->GetRecommendedOMPThreadCount();
        dim_t segment_len     = (nnr + num_threads - 1) / num_threads;
        Kernel<AddTakeGradRspKernel, cpu>::Launch(s,
                                                  num_threads,
                                                  grad_data,
                                                  prefix_sum,
                                                  ograd.dptr<DType>(),
                                                  row_length,
                                                  data.dptr<IType>(),
                                                  data_size,
                                                  segment_len,
                                                  num_rows);
      });
    });
  });
}

/*
 * \brief check if any of the indices is out of bound and abort with an IndexError if so
 *
 * The workspace is zeroed, filled by the is_valid_check_gather_nd kernel (defined
 * elsewhere), and then every axis entry is compared against [-mshape[m], mshape[m] - 1].
 * \param s the stream
 * \param idx_ptr the indices on the stream
 * \param N the number of indices in an axis
 * \param M the number of axes to examine
 * \param mshape the array that stores shape for each dimension
 * \param is_valid_dim_ptr the temporary workspace that contains out-of-bound indices
 */
template <typename DType>
void GatherNDCheckBoundCPU(mshadow::Stream<cpu>* s,
                           const DType* idx_ptr,
                           index_t N,
                           index_t M,
                           const mshadow::Shape<10> mshape,
                           DType* is_valid_dim_ptr) {
  using namespace mxnet_op;
  // reset the workspace, then let the kernel record one value per axis
  Kernel<set_zero, cpu>::Launch(s, M, is_valid_dim_ptr);
  Kernel<is_valid_check_gather_nd, cpu>::Launch(s, M, is_valid_dim_ptr, idx_ptr, N, mshape);
  for (index_t axis = 0; axis < M; ++axis) {
    const bool too_large = is_valid_dim_ptr[axis] > mshape[axis] - 1;
    const bool too_small = is_valid_dim_ptr[axis] < -mshape[axis];
    if (too_large || too_small) {
      LOG(FATAL) << "IndexError: index " << is_valid_dim_ptr[axis] << " is out of bounds for axis "
                 << axis << " with size " << mshape[axis];
    }
  }
}

/*!
 * \brief CPU forward pass of gather_nd.
 *
 * inputs[0] is the data and inputs[1] the indices, whose leading dimension M
 * is the number of indexed axes. N is the number of index tuples and K the
 * number of elements in the trailing, non-indexed part of the data. The
 * indices are bounds-checked before the gather kernel is launched.
 * \param attrs    node attributes (not read here)
 * \param ctx      operator context (stream and temp-space resource)
 * \param inputs   data and indices blobs
 * \param req      write request types
 * \param outputs  output blob
 */
void GatherNDForwardCPU(const nnvm::NodeAttrs& attrs,
                        const OpContext& ctx,
                        const std::vector<TBlob>& inputs,
                        const std::vector<OpReqType>& req,
                        const std::vector<TBlob>& outputs) {
  using namespace mxnet_op;
  using namespace mshadow;
  CHECK_EQ(inputs.size(), 2U);
  CHECK_EQ(outputs.size(), 1U);
  if (req[0] == kNullOp)
    return;
  mshadow::Stream<cpu>* s     = ctx.get_stream<cpu>();
  const mxnet::TShape& dshape = inputs[0].shape_;
  const mxnet::TShape& ishape = inputs[1].shape_;
  index_t M                   = ishape[0];
  index_t N                   = ishape.Size() / M;
  index_t K                   = dshape.ProdShape(M, dshape.ndim());
  // strides and extents of the M indexed axes, built from the innermost indexed axis outwards
  mshadow::Shape<10> strides;
  mshadow::Shape<10> mshape;
  index_t running_stride = K;
  for (index_t d = M - 1; d >= 0; --d) {
    strides[d] = running_stride;
    mshape[d]  = dshape[d];
    running_stride *= dshape[d];
  }
  MSHADOW_TYPE_SWITCH_WITH_BOOL(inputs[0].type_flag_, DType, {  // data type switch
    MSHADOW_TYPE_SWITCH(inputs[1].type_flag_, IType, {          // indices data type switch
      IType* idx_ptr = inputs[1].dptr<IType>();
      // scratch space with one entry per indexed axis for the bound check
      Tensor<cpu, 1, IType> workspace =
          ctx.requested[0].get_space_typed<cpu, 1, IType>(Shape1(M), s);
      IType* is_valid_dim_ptr = workspace.dptr_;
      // check whether indices are out of bound
      GatherNDCheckBoundCPU(s, idx_ptr, N, M, mshape, is_valid_dim_ptr);
      // one kernel invocation per index tuple
      Kernel<gather_nd, cpu>::Launch(s,
                                     N,
                                     req[0],
                                     N,
                                     M,
                                     K,
                                     strides,
                                     mshape,
                                     outputs[0].dptr<DType>(),
                                     inputs[0].dptr<DType>(),
                                     idx_ptr);
    });
  });
}

/*!
 * \brief Backward pass of gather_nd for every DType except half_t: accumulates
 *        each K-element slice of data into out at the offset addressed by the
 *        corresponding index tuple. The outer loop is an OpenMP parallel loop
 *        and every element update is an OpenMP atomic add.
 * \param N        number of index tuples
 * \param M        number of indexed axes
 * \param K        number of elements per slice
 * \param strides  element stride of each indexed axis in out
 * \param out      ptr to the buffer being accumulated into
 * \param data     ptr to the values to accumulate, laid out as (N, K)
 * \param indices  ptr to the indices, laid out as (M, N)
 * \param s        the stream (not used by the body)
 */
template <typename DType, typename IType>
inline typename std::enable_if<(!std::is_same<DType, mshadow::half::half_t>::value), void>::type
GatherNDBackwardImpl(index_t N,
                     index_t M,
                     index_t K,
                     const mshadow::Shape<10> strides,
                     DType* out,
                     const DType* data,
                     const IType* indices,
                     mshadow::Stream<cpu>* s) {
#pragma omp parallel for
  for (index_t i = 0; i < N; i++) {
    // flat offset of the destination slice addressed by index tuple i
    index_t offset = 0;
    for (index_t axis = 0; axis < M; ++axis) {
      offset += strides[axis] * static_cast<index_t>(indices[axis * N + i]);
    }
    DType* dst       = out + offset;
    const DType* src = data + i * K;
    for (index_t k = 0; k < K; ++k) {
#pragma omp atomic
      dst[k] += src[k];
    }
  }
}

/*!
 * \brief CPU backward of gather_nd specialised for DType == mshadow::half::half_t.
 *
 * Performs the same accumulation as the generic overload: for each i in [0, N) the
 * K values data[i * K + 0 .. i * K + K - 1] are added to `out` starting at the offset
 *   sum over j in [0, M) of strides[j] * indices[j * N + i].
 *
 * Unlike the generic overload this one carries no OpenMP pragmas and therefore runs
 * serially (presumably because `omp atomic` cannot be applied to half_t -- confirm).
 *
 * \param N number of index tuples (outer loop trip count)
 * \param M number of components in each index tuple
 * \param K number of consecutive elements accumulated per index tuple
 * \param strides per-component multipliers used to flatten an index tuple
 * \param out destination buffer; values are added to it, never overwritten
 * \param data source buffer, read as N consecutive groups of K elements
 * \param indices index buffer, read as indices[j * N + i]
 * \param s stream handle; not used by this implementation
 */
template <typename DType, typename IType>
inline typename std::enable_if<std::is_same<DType, mshadow::half::half_t>::value, void>::type
GatherNDBackwardImpl(index_t N,
                     index_t M,
                     index_t K,
                     const mshadow::Shape<10> strides,
                     DType* out,
                     const DType* data,
                     const IType* indices,
                     mshadow::Stream<cpu>* s) {
  for (index_t sample = 0; sample < N; ++sample) {
    // Flatten the M index components of this sample into one element offset of `out`.
    index_t dst_begin = 0;
    for (index_t axis = 0; axis < M; ++axis) {
      dst_begin += strides[axis] * static_cast<index_t>(indices[axis * N + sample]);
    }
    DType* dst       = out + dst_begin;
    const DType* src = data + sample * K;
    for (index_t k = 0; k < K; ++k) {
      dst[k] += src[k];
    }
  }
}

// Register the parameter structs referenced by the operator registrations below
// (via ParamParser<...> and <Param>::__FIELDS__()). The structs themselves are not
// defined in this part of the file.
DMLC_REGISTER_PARAMETER(EmbeddingParam);
DMLC_REGISTER_PARAMETER(TakeParam);
DMLC_REGISTER_PARAMETER(OneHotParam);
DMLC_REGISTER_PARAMETER(ScatterNDParam);

// Registration of the `Embedding` operator (alias `_npx_embedding`).
// Two inputs named "data" and "weight", one output; attributes are parsed as
// EmbeddingParam. The CPU kernel is EmbeddingOpForward<cpu> and the gradient is
// delegated to a `_backward_Embedding` node.
NNVM_REGISTER_OP(Embedding)
// NOTE(review): macro defined elsewhere; presumably adds the sparse-namespace alias -- confirm.
MXNET_ADD_SPARSE_OP_ALIAS(Embedding)
    .add_alias("_npx_embedding")
    .describe(R"code(Maps integer indices to vector representations (embeddings).

This operator maps words to real-valued vectors in a high-dimensional space,
called word embeddings. These embeddings can capture semantic and syntactic properties of the words.
For example, it has been noted that in the learned embedding spaces, similar words tend
to be close to each other and dissimilar words far apart.

For an input array of shape (d1, ..., dK),
the shape of an output array is (d1, ..., dK, output_dim).
All the input values should be integers in the range [0, input_dim).

If the input_dim is ip0 and output_dim is op0, then shape of the embedding weight matrix must be
(ip0, op0).

When "sparse_grad" is False, if any index mentioned is too large, it is replaced by the index that
addresses the last vector in an embedding matrix.
When "sparse_grad" is True, an error will be raised if invalid indices are found.

Examples::

  input_dim = 4
  output_dim = 5

  // Each row in weight matrix y represents a word. So, y = (w0,w1,w2,w3)
  y = [[  0.,   1.,   2.,   3.,   4.],
       [  5.,   6.,   7.,   8.,   9.],
       [ 10.,  11.,  12.,  13.,  14.],
       [ 15.,  16.,  17.,  18.,  19.]]

  // Input array x represents n-grams(2-gram). So, x = [(w1,w3), (w0,w2)]
  x = [[ 1.,  3.],
       [ 0.,  2.]]

  // Mapped input x to its vector representation y.
  Embedding(x, y, 4, 5) = [[[  5.,   6.,   7.,   8.,   9.],
                            [ 15.,  16.,  17.,  18.,  19.]],

                           [[  0.,   1.,   2.,   3.,   4.],
                            [ 10.,  11.,  12.,  13.,  14.]]]


The storage type of weight can be either row_sparse or default.

.. Note::

    If "sparse_grad" is set to True, the storage type of gradient w.r.t weights will be
    "row_sparse". Only a subset of optimizers support sparse gradients, including SGD, AdaGrad
    and Adam. Note that by default lazy updates is turned on, which may perform differently
    from standard updates. For more details, please check the Optimization API at:
    https://mxnet.apache.org/versions/master/api/python/docs/api/optimizer/index.html

)code" ADD_FILELINE)
    .set_num_inputs(2)
    .set_num_outputs(1)
    .set_attr_parser(ParamParser<EmbeddingParam>)
    .set_attr<nnvm::FListInputNames>("FListInputNames",
                                     [](const NodeAttrs& attrs) {
                                       return std::vector<std::string>{"data", "weight"};
                                     })
    .set_attr<mxnet::FInferShape>("FInferShape", EmbeddingOpShape<EmbeddingParam>)
    .set_attr<nnvm::FInferType>("FInferType", EmbeddingOpType<EmbeddingParam>)
    // The operator asks for a single temporary-workspace resource.
    .set_attr<FResourceRequest>("FResourceRequest",
                                [](const NodeAttrs& attrs) {
                                  return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
                                })
    .set_attr<THasDeterministicOutput>("THasDeterministicOutput", true)
    .set_attr<FCompute>("FCompute<cpu>", EmbeddingOpForward<cpu>)
    // Gradient: hand the output gradients, the first forward input (n->inputs[0], i.e. "data")
    // and the forward node's attribute dict to MakeNonlossGradNode for `_backward_Embedding`.
    .set_attr<nnvm::FGradient>(
        "FGradient",
        [](const nnvm::ObjectPtr& n, const std::vector<nnvm::NodeEntry>& ograds) {
          return MakeNonlossGradNode(
              "_backward_Embedding", n, ograds, {n->inputs[0]}, n->attrs.dict);
        })
    .add_argument("data", "NDArray-or-Symbol", "The input array to the embedding operator.")
    .add_argument("weight", "NDArray-or-Symbol", "The embedding weight matrix.")
    .add_arguments(EmbeddingParam::__FIELDS__());

// Registration of `_backward_Embedding`, the node named by Embedding's FGradient.
// Two inputs, two outputs; attributes are parsed as EmbeddingParam. Both a dense
// (FCompute) and an FComputeEx CPU implementation are provided, with the storage
// type chosen by EmbeddingOpBackwardStorageType.
NNVM_REGISTER_OP(_backward_Embedding)
    .set_num_inputs(2)
    .set_num_outputs(2)
    .set_attr_parser(ParamParser<EmbeddingParam>)
    // The operator asks for a single temporary-workspace resource.
    .set_attr<FResourceRequest>("FResourceRequest",
                                [](const NodeAttrs& attrs) {
                                  return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
                                })
    .set_attr<FInferStorageType>("FInferStorageType", EmbeddingOpBackwardStorageType)
    .set_attr<nnvm::TIsBackward>("TIsBackward", true)
    .set_attr<FCompute>("FCompute<cpu>", EmbeddingOpBackward<cpu>)
    .set_attr<FComputeEx>("FComputeEx<cpu>", EmbeddingOpBackwardEx<cpu>);

// Registration of the `take` operator (alias `_npi_take`).
// Two inputs named "a" and "indices", one output; attributes are parsed as TakeParam.
// CPU implementations: TakeOpForward<cpu> (FCompute) and TakeOpForwardEx<cpu> (FComputeEx).
// The gradient is delegated to a `_backward_take` node.
//
// NOTE(review): in the description below, the last example uses axis=1 and mode='wrap',
// yet its explanatory text still reads "we will get rows 0 and 1, then 1 and 2"; the
// wording appears to be carried over from the axis=0 example above it -- confirm and
// reword the user-facing doc if so.
NNVM_REGISTER_OP(take)
    .add_alias("_npi_take")
    .describe(R"code(Takes elements from an input array along the given axis.

This function slices the input array along a particular axis with the provided indices.

Given data tensor of rank r >= 1, and indices tensor of rank q, gather entries of the axis
dimension of data (by default outer-most one as axis=0) indexed by indices, and concatenates them
in an output tensor of rank q + (r - 1).

Examples::

  x = [4.  5.  6.]

  // Trivial case, take the second element along the first axis.

  take(x, [1]) = [ 5. ]

  // The other trivial case, axis=-1, take the third element along the first axis

  take(x, [3], axis=-1, mode='clip') = [ 6. ]

  x = [[ 1.,  2.],
       [ 3.,  4.],
       [ 5.,  6.]]

  // In this case we will get rows 0 and 1, then 1 and 2. Along axis 0

  take(x, [[0,1],[1,2]]) = [[[ 1.,  2.],
                             [ 3.,  4.]],

                            [[ 3.,  4.],
                             [ 5.,  6.]]]

  // In this case we will get rows 0 and 1, then 1 and 2 (calculated by wrapping around).
  // Along axis 1

  take(x, [[0, 3], [-1, -2]], axis=1, mode='wrap') = [[[ 1.  2.]
                                                       [ 2.  1.]]

                                                      [[ 3.  4.]
                                                       [ 4.  3.]]

                                                      [[ 5.  6.]
                                                       [ 6.  5.]]]

The storage type of ``take`` output depends upon the input storage type:

   - take(default, default) = default
   - take(csr, default, axis=0) = csr

)code" ADD_FILELINE)
    .set_num_inputs(2)
    .set_num_outputs(1)
    .set_attr_parser(ParamParser<TakeParam>)
    .set_attr<nnvm::FListInputNames>("FListInputNames",
                                     [](const NodeAttrs& attrs) {
                                       return std::vector<std::string>{"a", "indices"};
                                     })
    .set_attr<mxnet::FInferShape>("FInferShape", TakeOpShape)
    .set_attr<nnvm::FInferType>("FInferType", TakeOpType)
    .set_attr<FInferStorageType>("FInferStorageType", TakeOpForwardStorageType)
    // The operator asks for a single temporary-workspace resource.
    .set_attr<FResourceRequest>("FResourceRequest",
                                [](const NodeAttrs& attrs) {
                                  return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
                                })
    .set_attr<THasDeterministicOutput>("THasDeterministicOutput", true)
    .set_attr<FCompute>("FCompute<cpu>", TakeOpForward<cpu>)
    .set_attr<FComputeEx>("FComputeEx<cpu>", TakeOpForwardEx<cpu>)
    // Gradient: hand the output gradients, the second forward input (n->inputs[1], i.e.
    // "indices") and the forward node's attribute dict to MakeNonlossGradNode for
    // `_backward_take`.
    .set_attr<nnvm::FGradient>("FGradient",
                               [](const nnvm::ObjectPtr& n,
                                  const std::vector<nnvm::NodeEntry>& ograds) {
                                 return MakeNonlossGradNode(
                                     "_backward_take", n, ograds, {n->inputs[1]}, n->attrs.dict);
                               })
    .add_argument("a", "NDArray-or-Symbol", "The input array.")
    .add_argument("indices", "NDArray-or-Symbol", "The indices of the values to be extracted.")
    .add_arguments(TakeParam::__FIELDS__());

// Registration of `_backward_take`, the node named by take's FGradient.
// Two inputs, two outputs; attributes are parsed as TakeParam. Only a dense CPU
// implementation (TakeOpBackward<cpu>) is registered in this chain.
NNVM_REGISTER_OP(_backward_take)
    .set_num_inputs(2)
    .set_num_outputs(2)
    .set_attr_parser(ParamParser<TakeParam>)
    // The operator asks for a single temporary-workspace resource.
    .set_attr<FResourceRequest>("FResourceRequest",
                                [](const NodeAttrs& attrs) {
                                  return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
                                })
    .set_attr<nnvm::TIsBackward>("TIsBackward", true)
    .set_attr<FCompute>("FCompute<cpu>", TakeOpBackward<cpu>);

// Registration of the `batch_take` operator (its own description marks it as deprecated
// in favour of `pick`). Two inputs named "a" and "indices", one output.
// This chain sets no attribute parser and no FGradient attribute.
NNVM_REGISTER_OP(batch_take)
    .describe(R"code(Takes elements from a data batch.

.. note::
  `batch_take` is deprecated. Use `pick` instead.

Given an input array of shape ``(d0, d1)`` and indices of shape ``(i0,)``, the result will be
an output array of shape ``(i0,)`` with::

  output[i] = input[i, indices[i]]

Examples::

  x = [[ 1.,  2.],
       [ 3.,  4.],
       [ 5.,  6.]]

  // takes elements with specified indices
  batch_take(x, [0,1,0]) = [ 1.  4.  5.]

)code" ADD_FILELINE)
    .set_num_outputs(1)
    .set_num_inputs(2)
    .set_attr<nnvm::FListInputNames>("FListInputNames",
                                     [](const NodeAttrs& attrs) {
                                       return std::vector<std::string>{"a", "indices"};
                                     })
    .set_attr<mxnet::FInferShape>("FInferShape", BatchTakeOpShape)
    .set_attr<nnvm::FInferType>("FInferType", BatchTakeOpType)
    .set_attr<FCompute>("FCompute<cpu>", BatchTakeOpForward<cpu>)
    .add_argument("a", "NDArray-or-Symbol", "The input array")
    .add_argument("indices", "NDArray-or-Symbol", "The index array");

// Registration of the `one_hot` operator (alias `_npx_one_hot`).
// One input named "indices", one output; attributes are parsed as OneHotParam.
// The CPU kernel is OneHotOpForward<cpu>; the gradient is supplied by MakeZeroGradNodes.
NNVM_REGISTER_OP(one_hot)
    .add_alias("_npx_one_hot")
    .describe(R"code(Returns a one-hot array.

The locations represented by `indices` take value `on_value`, while all
other locations take value `off_value`.

`one_hot` operation with `indices` of shape ``(i0, i1)`` and `depth`  of ``d`` would result
in an output array of shape ``(i0, i1, d)`` with::

  output[i,j,:] = off_value
  output[i,j,indices[i,j]] = on_value

Examples::

  one_hot([1,0,2,0], 3) = [[ 0.  1.  0.]
                           [ 1.  0.  0.]
                           [ 0.  0.  1.]
                           [ 1.  0.  0.]]

  one_hot([1,0,2,0], 3, on_value=8, off_value=1,
          dtype='int32') = [[1 8 1]
                            [8 1 1]
                            [1 1 8]
                            [8 1 1]]

  one_hot([[1,0],[1,0],[2,0]], 3) = [[[ 0.  1.  0.]
                                      [ 1.  0.  0.]]

                                     [[ 0.  1.  0.]
                                      [ 1.  0.  0.]]

                                     [[ 0.  0.  1.]
                                      [ 1.  0.  0.]]]
)code" ADD_FILELINE)
    .set_num_outputs(1)
    .set_num_inputs(1)
    .set_attr_parser(ParamParser<OneHotParam>)
    .set_attr<nnvm::FListInputNames>("FListInputNames",
                                     [](const NodeAttrs& attrs) {
                                       return std::vector<std::string>{"indices"};
                                     })
    .set_attr<mxnet::FInferShape>("FInferShape", OneHotOpShape)
    .set_attr<nnvm::FInferType>("FInferType", OneHotOpType)
    .set_attr<FCompute>("FCompute<cpu>", OneHotOpForward<cpu>)
    .set_attr<nnvm::FGradient>("FGradient", MakeZeroGradNodes)
    .add_argument("indices", "NDArray-or-Symbol", "array of locations where to set on_value")
    .add_arguments(OneHotParam::__FIELDS__());

// Registration of the `gather_nd` operator (aliases `_npi_gather_nd`, `_npx_gather_nd`).
// Two inputs named "data" and "indices", one output; no attribute parser is set.
// The CPU kernel is GatherNDForwardCPU. Unlike the registrations that pass ADD_FILELINE,
// the description here is given as a plain string.
NNVM_REGISTER_OP(gather_nd)
    .add_alias("_npi_gather_nd")
    .add_alias("_npx_gather_nd")
    .describe(R"code(Gather elements or slices from `data` and store to a tensor whose
shape is defined by `indices`.

Given `data` with shape `(X_0, X_1, ..., X_{N-1})` and indices with shape
`(M, Y_0, ..., Y_{K-1})`, the output will have shape `(Y_0, ..., Y_{K-1}, X_M, ..., X_{N-1})`,
where `M <= N`. If `M == N`, output shape will simply be `(Y_0, ..., Y_{K-1})`.

The elements in output is defined as follows::

  output[y_0, ..., y_{K-1}, x_M, ..., x_{N-1}] = data[indices[0, y_0, ..., y_{K-1}],
                                                      ...,
                                                      indices[M-1, y_0, ..., y_{K-1}],
                                                      x_M, ..., x_{N-1}]

Examples::

  data = [[0, 1], [2, 3]]
  indices = [[1, 1, 0], [0, 1, 0]]
  gather_nd(data, indices) = [2, 3, 0]

  data = [[[1, 2], [3, 4]], [[5, 6], [7, 8]]]
  indices = [[0, 1], [1, 0]]
  gather_nd(data, indices) = [[3, 4], [5, 6]]

)code")
    .set_num_outputs(1)
    .set_num_inputs(2)
    .set_attr<nnvm::FListInputNames>("FListInputNames",
                                     [](const NodeAttrs& attrs) {
                                       return std::vector<std::string>{"data", "indices"};
                                     })
    .set_attr<mxnet::FInferShape>("FInferShape", GatherNDShape)
    .set_attr<nnvm::FInferType>("FInferType", GatherNDType)
    // The operator asks for a single temporary-workspace resource.
    .set_attr<FResourceRequest>("FResourceRequest",
                                [](const NodeAttrs& attrs) {
                                  return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
                                })
    .set_attr<FCompute>("FCompute<cpu>", GatherNDForwardCPU)
    .set_attr<nnvm::FGradient>(
        "FGradient",
        [](const nnvm::ObjectPtr& n, const std::vector<nnvm::NodeEntry>& ograds) {
          // First returned entry: a `_backward_gather_nd` node named "<forward name>_backward",
          // fed with the output gradient and the forward `indices` input (n->inputs[1]), and
          // carrying a control dependency on the forward node.
          auto p        = nnvm::Node::Create();
          p->attrs.op   = nnvm::Op::Get("_backward_gather_nd");
          p->attrs.name = n->attrs.name + "_backward";
          p->inputs.push_back(ograds[0]);
          p->inputs.push_back(n->inputs[1]);
          p->control_deps.emplace_back(n);
          // Second returned entry: a `zeros_like` node built from the `indices` input.
          auto zero = MakeNode(
              "zeros_like", n->attrs.name + "_backward_indices", {n->inputs[1]}, nullptr, &n);

          std::vector<nnvm::NodeEntry> ret;
          ret.emplace_back(p);
          ret.emplace_back(zero);
          return ret;
        })
    // NOTE(review): TIsBackward is set even though this op is registered as a forward op;
    // presumably because `gather_nd` is also used as the gradient node of `scatter_nd` and
    // `_backward_gather_nd` (see their FGradient lambdas) -- confirm this is intended.
    .set_attr<nnvm::TIsBackward>("TIsBackward", true)
    .add_argument("data", "NDArray-or-Symbol", "data")
    .add_argument("indices", "NDArray-or-Symbol", "indices");

// Registration of the `scatter_nd` operator.
// Two inputs named "data" and "indices", one output; attributes are parsed as
// ScatterNDParam. The CPU kernel is ScatterNDForward<cpu>; the gradient w.r.t. the first
// input is computed by a `gather_nd` node.
NNVM_REGISTER_OP(scatter_nd)
    .describe(R"code(Scatters data into a new tensor according to indices.

Given `data` with shape `(Y_0, ..., Y_{K-1}, X_M, ..., X_{N-1})` and indices with shape
`(M, Y_0, ..., Y_{K-1})`, the output will have shape `(X_0, X_1, ..., X_{N-1})`,
where `M <= N`. If `M == N`, data shape should simply be `(Y_0, ..., Y_{K-1})`.

The elements in output is defined as follows::

  output[indices[0, y_0, ..., y_{K-1}],
         ...,
         indices[M-1, y_0, ..., y_{K-1}],
         x_M, ..., x_{N-1}] = data[y_0, ..., y_{K-1}, x_M, ..., x_{N-1}]

all other entries in output are 0.

.. warning::

    If the indices have duplicates, the result will be non-deterministic and
    the gradient of `scatter_nd` will not be correct!!


Examples::

  data = [2, 3, 0]
  indices = [[1, 1, 0], [0, 1, 0]]
  shape = (2, 2)
  scatter_nd(data, indices, shape) = [[0, 0], [2, 3]]

  data = [[[1, 2], [3, 4]], [[5, 6], [7, 8]]]
  indices = [[0, 1], [1, 1]]
  shape = (2, 2, 2, 2)
  scatter_nd(data, indices, shape) = [[[[0, 0],
                                        [0, 0]],

                                       [[1, 2],
                                        [3, 4]]],

                                      [[[0, 0],
                                        [0, 0]],

                                       [[5, 6],
                                        [7, 8]]]]

)code")
    .set_num_outputs(1)
    .set_num_inputs(2)
    .set_attr_parser(ParamParser<ScatterNDParam>)
    .set_attr<nnvm::FListInputNames>("FListInputNames",
                                     [](const NodeAttrs& attrs) {
                                       return std::vector<std::string>{"data", "indices"};
                                     })
    .set_attr<mxnet::FInferShape>("FInferShape", ScatterNDShape)
    .set_attr<nnvm::FInferType>("FInferType", ScatterNDType)
    .set_attr<FCompute>("FCompute<cpu>", ScatterNDForward<cpu>)
    .set_attr<nnvm::FGradient>(
        "FGradient",
        [](const nnvm::ObjectPtr& n, const std::vector<nnvm::NodeEntry>& ograds) {
          // First returned entry: a `gather_nd` node named "<forward name>_backward", fed
          // with the output gradient and the forward `indices` input (n->inputs[1]), and
          // carrying a control dependency on the forward node.
          auto p        = nnvm::Node::Create();
          p->attrs.op   = nnvm::Op::Get("gather_nd");
          p->attrs.name = n->attrs.name + "_backward";
          p->inputs.push_back(ograds[0]);
          p->inputs.push_back(n->inputs[1]);
          p->control_deps.emplace_back(n);
          // Second returned entry: a `zeros_like` node built from the `indices` input.
          auto zero = MakeNode(
              "zeros_like", n->attrs.name + "_backward_indices", {n->inputs[1]}, nullptr, &n);
          std::vector<nnvm::NodeEntry> ret;
          ret.emplace_back(p);
          ret.emplace_back(zero);
          return ret;
        })
    // NOTE(review): TIsBackward is set on an op that is registered as a forward op; the
    // reason is not visible in this file section -- confirm this is intended.
    .set_attr<nnvm::TIsBackward>("TIsBackward", true)
    .add_argument("data", "NDArray-or-Symbol", "data")
    .add_argument("indices", "NDArray-or-Symbol", "indices")
    .add_arguments(ScatterNDParam::__FIELDS__());

// Registration of `_backward_gather_nd`, the node created by gather_nd's FGradient.
// Two inputs named "data" and "indices", one output; attributes are parsed as
// ScatterNDParam and shape/type inference reuse ScatterNDShape / ScatterNDType.
// The CPU kernel is GatherNDBackward<cpu>.
//
// NOTE(review): the description below talks about `scatter_nd_acc`, which is not the
// name registered here -- confirm whether the text should name `_backward_gather_nd`.
NNVM_REGISTER_OP(_backward_gather_nd)
    .describe(R"code(Accumulates data according to indices and get the result. It's the backward of
`gather_nd`.

Given `data` with shape `(Y_0, ..., Y_{K-1}, X_M, ..., X_{N-1})` and indices with shape
`(M, Y_0, ..., Y_{K-1})`, the output will have shape `(X_0, X_1, ..., X_{N-1})`,
where `M <= N`. If `M == N`, data shape should simply be `(Y_0, ..., Y_{K-1})`.

The elements in output is defined as follows::

  output[indices[0, y_0, ..., y_{K-1}],
         ...,
         indices[M-1, y_0, ..., y_{K-1}],
         x_M, ..., x_{N-1}] += data[y_0, ..., y_{K-1}, x_M, ..., x_{N-1}]

all other entries in output are 0 or the original value if AddTo is triggered.

Examples::

  data = [2, 3, 0]
  indices = [[1, 1, 0], [0, 1, 0]]
  shape = (2, 2)
  _backward_gather_nd(data, indices, shape) = [[0, 0], [2, 3]] # Same as scatter_nd

  # The difference between scatter_nd and scatter_nd_acc is the latter will accumulate
  #  the values that point to the same index.

  data = [2, 3, 0]
  indices = [[1, 1, 0], [1, 1, 0]]
  shape = (2, 2)
  _backward_gather_nd(data, indices, shape) = [[0, 0], [0, 5]]

)code")
    .set_num_outputs(1)
    .set_num_inputs(2)
    .set_attr_parser(ParamParser<ScatterNDParam>)
    .set_attr<nnvm::FListInputNames>("FListInputNames",
                                     [](const NodeAttrs& attrs) {
                                       return std::vector<std::string>{"data", "indices"};
                                     })
    .set_attr<mxnet::FInferShape>("FInferShape", ScatterNDShape)
    .set_attr<nnvm::FInferType>("FInferType", ScatterNDType)
    .set_attr<FCompute>("FCompute<cpu>", GatherNDBackward<cpu>)
    .set_attr<nnvm::FGradient>(
        "FGradient",
        [](const nnvm::ObjectPtr& n, const std::vector<nnvm::NodeEntry>& ograds) {
          // First returned entry: a `gather_nd` node named "<this node's name>_backward", fed
          // with the output gradient and this node's `indices` input (n->inputs[1]), and
          // carrying a control dependency on this node.
          auto p        = nnvm::Node::Create();
          p->attrs.op   = nnvm::Op::Get("gather_nd");
          p->attrs.name = n->attrs.name + "_backward";
          p->inputs.push_back(ograds[0]);
          p->inputs.push_back(n->inputs[1]);
          p->control_deps.emplace_back(n);
          // Second returned entry: a `zeros_like` node built from the `indices` input.
          auto zero = MakeNode(
              "zeros_like", n->attrs.name + "_backward_indices", {n->inputs[1]}, nullptr, &n);
          std::vector<nnvm::NodeEntry> ret;
          ret.emplace_back(p);
          ret.emplace_back(zero);
          return ret;
        })
    .set_attr<nnvm::TIsBackward>("TIsBackward", true)
    .add_argument("data", "NDArray-or-Symbol", "data")
    .add_argument("indices", "NDArray-or-Symbol", "indices")
    .add_arguments(ScatterNDParam::__FIELDS__());

// Registration of the internal `_scatter_set_nd` operator (alias `_npi_scatter_set_nd`).
// Three inputs named "lhs", "rhs" and "indices", one output; attributes are parsed as
// ScatterNDParam. Shape and type inference wrap ScatterNDShape / ScatterNDType, which are
// the two-input inference functions registered for `scatter_nd`. The CPU kernel is
// ScatterSetNDForward<cpu>. No FGradient attribute is set in this chain.
NNVM_REGISTER_OP(_scatter_set_nd)
    .add_alias("_npi_scatter_set_nd")
    .describe(R"code(This operator has the same functionality as scatter_nd
except that it does not reset the elements not indexed by the input
index `NDArray` in the input data `NDArray`. output should be explicitly
given and be the same as lhs.

.. note:: This operator is for internal use only.

Examples::

  data = [2, 3, 0]
  indices = [[1, 1, 0], [0, 1, 0]]
  out = [[1, 1], [1, 1]]
  _scatter_set_nd(lhs=out, rhs=data, indices=indices, out=out)
  out = [[0, 1], [2, 3]]

)code")
    .set_num_outputs(1)
    .set_num_inputs(3)
    .set_attr_parser(ParamParser<ScatterNDParam>)
    .set_attr<nnvm::FListInputNames>("FListInputNames",
                                     [](const NodeAttrs& attrs) {
                                       return std::vector<std::string>{"lhs", "rhs", "indices"};
                                     })
    .set_attr<mxnet::FInferShape>(
        "FInferShape",
        [](const nnvm::NodeAttrs& attrs,
           mxnet::ShapeVector* in_attrs,
           mxnet::ShapeVector* out_attrs) {
          CHECK_EQ(in_attrs->size(), 3U);
          CHECK_EQ(out_attrs->size(), 1U);
          // lhs (input 0) and the output must agree on shape; propagate in both directions.
          SHAPE_ASSIGN_CHECK(*in_attrs, 0, out_attrs->at(0));
          SHAPE_ASSIGN_CHECK(*out_attrs, 0, in_attrs->at(0));
          // Run the two-input scatter_nd shape inference on (rhs, indices) only.
          mxnet::ShapeVector tmp_in_attrs = {in_attrs->at(1), in_attrs->at(2)};
          if (!ScatterNDShape(attrs, &tmp_in_attrs, out_attrs)) {
            return false;
          }
          // Copy the inferred shapes back to rhs and indices, then re-sync lhs with the
          // (possibly updated) output shape.
          SHAPE_ASSIGN_CHECK(*in_attrs, 1, tmp_in_attrs[0]);
          SHAPE_ASSIGN_CHECK(*in_attrs, 2, tmp_in_attrs[1]);
          SHAPE_ASSIGN_CHECK(*in_attrs, 0, out_attrs->at(0));
          return true;
        })
    .set_attr<nnvm::FInferType>(
        "FInferType",
        [](const nnvm::NodeAttrs& attrs, std::vector<int>* in_attrs, std::vector<int>* out_attrs) {
          CHECK_EQ(in_attrs->size(), 3U);
          CHECK_EQ(out_attrs->size(), 1U);
          // lhs (input 0) and the output must agree on type; propagate in both directions.
          TYPE_ASSIGN_CHECK(*in_attrs, 0, out_attrs->at(0));
          TYPE_ASSIGN_CHECK(*out_attrs, 0, in_attrs->at(0));
          // Run the two-input scatter_nd type inference on (rhs, indices) only.
          std::vector<int> tmp_in_attrs = {in_attrs->at(1), in_attrs->at(2)};
          if (!ScatterNDType(attrs, &tmp_in_attrs, out_attrs)) {
            return false;
          }
          // Copy the inferred types back to rhs and indices, then re-sync lhs with the
          // (possibly updated) output type.
          TYPE_ASSIGN_CHECK(*in_attrs, 1, tmp_in_attrs[0]);
          TYPE_ASSIGN_CHECK(*in_attrs, 2, tmp_in_attrs[1]);
          TYPE_ASSIGN_CHECK(*in_attrs, 0, out_attrs->at(0));
          return true;
        })
    .set_attr<FCompute>("FCompute<cpu>", ScatterSetNDForward<cpu>)
    // Declares the in-place pair {0, 0} -- per the NNVM convention (input index, output
    // index), i.e. lhs may share storage with the output -- and flags that pair as identity.
    .set_attr<nnvm::FInplaceOption>("FInplaceOption",
                                    [](const NodeAttrs& attrs) {
                                      return std::vector<std::pair<int, int> >{{0, 0}};
                                    })
    .set_attr<nnvm::FInplaceIdentity>("FInplaceIdentity",
                                      [](const NodeAttrs& attrs) {
                                        return std::vector<bool>{true};
                                      })
    .add_argument("lhs", "NDArray-or-Symbol", "source input")
    .add_argument("rhs", "NDArray-or-Symbol", "value to assign")
    .add_argument("indices", "NDArray-or-Symbol", "indices")
    .add_arguments(ScatterNDParam::__FIELDS__());

}  // namespace op
}  // namespace mxnet