From d621e50862a96d259135fcfac0098f7709ee0f00 Mon Sep 17 00:00:00 2001 From: Ziyi Mu Date: Fri, 29 May 2020 14:51:17 -0700 Subject: [PATCH] Revert PR 17767 for fixing GPU memory usage regression (#18283) (#18309) * Revert "Fix and optimize handling of vectorized memory accesses (#17767)" This reverts commit 5542d03695b4a2589afb88acf128d4ba8ac94d0d. * add license to reverted file --- 3rdparty/mshadow/mshadow/base.h | 48 +++ 3rdparty/mshadow/mshadow/half2.h | 162 +++++++++ src/common/cuda_vectorization.cuh | 283 --------------- src/operator/mshadow_op.h | 67 ++++ src/operator/tensor/elemwise_binary_op.cuh | 322 ------------------ src/operator/tensor/elemwise_binary_op.h | 206 +++++------ .../tensor/elemwise_binary_op_basic.cu | 23 +- .../tensor/elemwise_binary_scalar_op.cuh | 207 ----------- .../tensor/elemwise_binary_scalar_op.h | 75 +--- .../tensor/elemwise_binary_scalar_op_basic.cu | 9 +- .../elemwise_binary_scalar_op_extended.cu | 15 +- src/operator/tensor/elemwise_sum.cu | 112 +----- src/operator/tensor/elemwise_sum.h | 12 + src/operator/tensor/elemwise_unary_op.cuh | 127 ------- src/operator/tensor/elemwise_unary_op.h | 56 ++- .../tensor/elemwise_unary_op_basic.cu | 1 - src/operator/tensor/elemwise_unary_op_pow.cu | 1 - src/operator/tensor/elemwise_unary_op_trig.cu | 1 - tests/python/unittest/test_operator.py | 81 +---- 19 files changed, 464 insertions(+), 1344 deletions(-) create mode 100755 3rdparty/mshadow/mshadow/half2.h delete mode 100644 src/common/cuda_vectorization.cuh delete mode 100644 src/operator/tensor/elemwise_binary_op.cuh delete mode 100644 src/operator/tensor/elemwise_binary_scalar_op.cuh delete mode 100644 src/operator/tensor/elemwise_unary_op.cuh diff --git a/3rdparty/mshadow/mshadow/base.h b/3rdparty/mshadow/mshadow/base.h index 6469bbc34f37..9f538574f093 100755 --- a/3rdparty/mshadow/mshadow/base.h +++ b/3rdparty/mshadow/mshadow/base.h @@ -295,6 +295,7 @@ extern "C" { } #include "./half.h" +#include "./half2.h" #include "./bfloat.h" #define MSHADOW_HALF_BF_OPERATOR(RTYPE, OP) \ MSHADOW_XINLINE RTYPE operator OP(mshadow::half::half_t a, mshadow::bfloat::bf16_t b) { \ @@ -409,6 +410,11 @@ struct DataType { #endif }; template<> +struct DataType { + static const int kFlag = kFloat16; + static const int kLanes = 2; +}; +template<> struct DataType { static const int kFlag = kBfloat16; static const int kLanes = 1; @@ -1161,6 +1167,48 @@ struct minimum { } #endif +#define MSHADOW_TYPE_SWITCH_WITH_HALF2(type, DType, ...) \ + switch (type) { \ + case mshadow::kFloat32: \ + { \ + typedef float DType; \ + {__VA_ARGS__} \ + } \ + break; \ + case mshadow::kFloat64: \ + { \ + typedef double DType; \ + {__VA_ARGS__} \ + } \ + break; \ + case mshadow::kFloat16: \ + { \ + typedef mshadow::half::half2_t DType; \ + {__VA_ARGS__} \ + } \ + break; \ + case mshadow::kUint8: \ + { \ + typedef uint8_t DType; \ + {__VA_ARGS__} \ + } \ + break; \ + case mshadow::kInt32: \ + { \ + typedef int32_t DType; \ + {__VA_ARGS__} \ + } \ + break; \ + case mshadow::kInt64: \ + { \ + typedef int64_t DType; \ + {__VA_ARGS__} \ + } \ + break; \ + default: \ + LOG(FATAL) << "Unknown type enum " << type; \ + } + #define MSHADOW_SGL_DBL_TYPE_SWITCH(type, DType, ...) 
\ switch (type) { \ case mshadow::kFloat32: \ diff --git a/3rdparty/mshadow/mshadow/half2.h b/3rdparty/mshadow/mshadow/half2.h new file mode 100755 index 000000000000..cecc5449383c --- /dev/null +++ b/3rdparty/mshadow/mshadow/half2.h @@ -0,0 +1,162 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2017 by Contributors + * \file half2.h + * \brief definition of vector float16, half2 type. + * + * \author Antti-Pekka Hynninen + */ +#ifndef MSHADOW_HALF2_H_ +#define MSHADOW_HALF2_H_ + +#if (defined(__CUDACC__) && __CUDA_ARCH__ >= 530 && MSHADOW_USE_CUDA && CUDA_VERSION >= 7050) + #define MSHADOW_CUDA_HALF2 1 + #include +#else + #define MSHADOW_CUDA_HALF2 0 +#endif + +#include + +/*! \brief namespace for mshadow */ +namespace mshadow { +/* \brief name space for host/device portable half-precision floats */ +namespace half { + +#define MSHADOW_HALF2_ASSIGNOP(AOP, OP) \ + template \ + MSHADOW_XINLINE half2_t operator AOP (const T& a) { \ + return *this = half2_t(*this OP a); /* NOLINT(*)*/ \ + } \ + +class MSHADOW_ALIGNED(4) half2_t { + public: +#if MSHADOW_CUDA_HALF2 + half2 half2_; +#else + half_t half_t2[2]; +#endif + + MSHADOW_XINLINE half2_t() {} + +#if MSHADOW_CUDA_HALF2 + MSHADOW_XINLINE explicit half2_t(half2 a) : half2_(a) {} +#else + MSHADOW_XINLINE explicit half2_t(half_t a, half_t b) { + half_t2[0] = a; + half_t2[1] = b; + } +#endif + + MSHADOW_XINLINE explicit half2_t(int a) { +#if MSHADOW_CUDA_HALF2 + half2_ = __half2half2(__int2half_rz(a)); +#else + half_t2[0] = (half_t)a; + half_t2[1] = (half_t)a; +#endif + } + + MSHADOW_XINLINE half2_t operator+() { + return *this; + } + + MSHADOW_XINLINE half2_t operator-() { +#if MSHADOW_CUDA_HALF2 + return half2_t(__hneg2(half2_)); +#else + return half2_t(-half_t2[0], -half_t2[1]); +#endif + } + + MSHADOW_XINLINE half2_t operator=(const half2_t& a) { +#if MSHADOW_CUDA_HALF2 + half2_ = a.half2_; +#else + half_t2[0] = a.half_t2[0]; + half_t2[1] = a.half_t2[1]; +#endif + return a; + } + + MSHADOW_HALF2_ASSIGNOP(+=, +) + MSHADOW_HALF2_ASSIGNOP(-=, -) + MSHADOW_HALF2_ASSIGNOP(*=, *) + MSHADOW_HALF2_ASSIGNOP(/=, /) +}; + +/*! \brief overloaded + operator for half2_t */ +MSHADOW_XINLINE half2_t operator+(half2_t a, half2_t b) { +#if MSHADOW_CUDA_HALF2 + return half2_t(__floats2half2_rn(__low2float(a.half2_) + __low2float(b.half2_), + __high2float(a.half2_) + __high2float(b.half2_))); +#else + return half2_t(a.half_t2[0] + b.half_t2[0], a.half_t2[1] + b.half_t2[1]); +#endif +} +/*! 
\brief overloaded - operator for half2_t */ +MSHADOW_XINLINE half2_t operator-(half2_t a, half2_t b) { +#if MSHADOW_CUDA_HALF2 + return half2_t(__floats2half2_rn(__low2float(a.half2_) - __low2float(b.half2_), + __high2float(a.half2_) - __high2float(b.half2_))); +#else + return half2_t(a.half_t2[0] - b.half_t2[0], a.half_t2[1] - b.half_t2[1]); +#endif +} +/*! \brief overloaded * operator for half2_t */ +MSHADOW_XINLINE half2_t operator*(half2_t a, half2_t b) { +#if MSHADOW_CUDA_HALF2 + return half2_t(__floats2half2_rn(__low2float(a.half2_) * __low2float(b.half2_), + __high2float(a.half2_) * __high2float(b.half2_))); +#else + return half2_t(a.half_t2[0] * b.half_t2[0], a.half_t2[1] * b.half_t2[1]); +#endif +} +/*! \brief overloaded / operator for half2_t */ +MSHADOW_XINLINE half2_t operator/(half2_t a, half2_t b) { +#if MSHADOW_CUDA_HALF2 + return half2_t(__floats2half2_rn(__low2float(a.half2_) / __low2float(b.half2_), + __high2float(a.half2_) / __high2float(b.half2_))); +#else + return half2_t(a.half_t2[0] / b.half_t2[0], a.half_t2[1] / b.half_t2[1]); +#endif +} +/*! \brief overloaded % operator for half2_t */ +MSHADOW_XINLINE half2_t operator%(half2_t a, half2_t b) { +#if MSHADOW_CUDA_HALF2 + return half2_t(__floats2half2_rn(::fmod(__low2float(a.half2_), __low2float(b.half2_)), + ::fmod(__high2float(a.half2_), __high2float(b.half2_)))); +#else + return half2_t(::fmod(a.half_t2[0], b.half_t2[0]), ::fmod(a.half_t2[1], b.half_t2[1])); +#endif +} +/*! \brief overloaded == operator for half2_t */ +MSHADOW_XINLINE bool operator==(half2_t a, half2_t b) { +#if MSHADOW_CUDA_HALF2 + return __hbeq2(a.half2_, b.half2_); +#else + return (a.half_t2[0] == b.half_t2[0] && a.half_t2[1] == b.half_t2[1]); +#endif +} + +} // namespace half +} // namespace mshadow +#endif // MSHADOW_HALF2_H_ diff --git a/src/common/cuda_vectorization.cuh b/src/common/cuda_vectorization.cuh deleted file mode 100644 index 7803afb901ab..000000000000 --- a/src/common/cuda_vectorization.cuh +++ /dev/null @@ -1,283 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * Copyright (c) 2020 by Contributors - * \file cuda_vectorization.cuh - * \brief GPU helpers for vectorized memory accesses - */ - -#ifndef MXNET_COMMON_CUDA_VECTORIZATION_CUH_ -#define MXNET_COMMON_CUDA_VECTORIZATION_CUH_ - -#if MXNET_USE_CUDA && __CUDACC__ - -#include -#include "cuda_utils.h" - - -namespace mxnet { -namespace common { -namespace cuda { - -/* \brief Helper class that enables storing multiple values of type DType - as 1 value of type LType. 
-*/ -template -class VectorizedStorage { - public: - constexpr static int nvec = sizeof(LType) / sizeof(DType); - union vectorized_storage { - LType aligned; - DType separate[nvec]; // NOLINT(*) - - MSHADOW_XINLINE vectorized_storage() {} - MSHADOW_XINLINE ~vectorized_storage() {} - } scratch_; -}; - -/* \brief Helper class that enables accessing multiple values of type DType - as 1 value of type LType. Additional aligned template argument - allows performance optimizations if the pointer and the size of - the allocation is aligned to sizeof(LType) / sizeof(DType) elements. -*/ -template -class VectorizedAccessor { - public: - using StorageType = VectorizedStorage::type, - typename std::remove_const::type>; - StorageType storage_; - - LType* aligned_ptr_; - DType* unaligned_ptr_; - int alignment_; - index_t n_elems_; - - MSHADOW_XINLINE VectorizedAccessor(DType* ptr, const index_t size) { - unaligned_ptr_ = ptr; - if (aligned) { - alignment_ = 0; - aligned_ptr_ = reinterpret_cast(ptr); - n_elems_ = (size + storage_.nvec - 1) / storage_.nvec; - } else { - size_t ptr_as_number = reinterpret_cast(ptr); - alignment_ = (ptr_as_number % sizeof(LType)) / sizeof(DType); - aligned_ptr_ = reinterpret_cast(ptr - alignment_); - n_elems_ = (size + alignment_ + storage_.nvec - 1) / storage_.nvec; - } - } - - /* \brief Alignment of the input pointer in elements. */ - MSHADOW_XINLINE int alignment() const { - return alignment_; - } - - /* \brief Access to separate elements. */ - MSHADOW_XINLINE DType* separate() { - return storage_.scratch_.separate; - } - - /* \brief Number of elements stored. */ - MSHADOW_XINLINE constexpr int nvec() const { - return storage_.nvec; - } - - /* \brief Number of aligned elements that span the entire input tensor. */ - MSHADOW_XINLINE index_t num_aligned_elements() const { - return n_elems_; - } - - /* \brief Load values from the input. - \param id Aligned index of the element. - \param N size of the tensor. - */ - MSHADOW_XINLINE void load(const index_t id, const index_t N) { - if (aligned) { - storage_.scratch_.aligned = aligned_ptr_[id]; - } else { - if (id > 0 && id < n_elems_ - 1) { - storage_.scratch_.aligned = aligned_ptr_[id]; - } else { -#pragma unroll - for (int j = 0; j < storage_.nvec; ++j) { - DType* ptr = reinterpret_cast(&(aligned_ptr_[id])) + j; - if (reinterpret_cast(ptr) >= reinterpret_cast(unaligned_ptr_) && - reinterpret_cast(ptr) < reinterpret_cast(unaligned_ptr_ + N)) { - storage_.scratch_.separate[j] = *ptr; - } - } - } - } - } -}; - -/* \brief Class used for vectorized read-only access. */ -template -class VectorizedLoader : public VectorizedAccessor { - public: - MSHADOW_XINLINE VectorizedLoader(const DType* ptr, const index_t N) : - VectorizedAccessor(ptr, N) { - } -}; - -/* \brief Class used for vectorized writable access. */ -template -class VectorizedStorer : public VectorizedAccessor { - public: - MSHADOW_XINLINE VectorizedStorer(DType* ptr, const index_t N) : - VectorizedAccessor(ptr, N) { - } - - /* \brief Store values to the output. - \param id Aligned index of the element. - \param N size of the tensor. 
- */ - MSHADOW_XINLINE void store(const index_t id, const index_t N) { - if (aligned) { - this->aligned_ptr_[id] = this->storage_.scratch_.aligned; - } else { - if (id > 0 && id < this->n_elems_ - 1) { - this->aligned_ptr_[id] = this->storage_.scratch_.aligned; - } else { -#pragma unroll - for (int j = 0; j < this->storage_.nvec; ++j) { - DType* ptr = reinterpret_cast(&(this->aligned_ptr_[id])) + j; - if (reinterpret_cast(ptr) >= reinterpret_cast(this->unaligned_ptr_) && - reinterpret_cast(ptr) < reinterpret_cast(this->unaligned_ptr_ + N)) { - *ptr = this->storage_.scratch_.separate[j]; - } - } - } - } - } -}; - -namespace { - -enum class Alignment { - SAME_ALIGNED, // All tensors aligned - SAME_UNALIGNED, // All tensors have the same misalignment - DIFFERENT // Tensors have different alignment -}; - -template -int CalcAlignment(const DType* ptr) { - size_t ptr_as_number = reinterpret_cast(ptr); - return ptr_as_number % sizeof(LType); -} - -/* \brief Check alignment of the inputs and outputs when cast to LType*. - \param params Structuce containing arrays with inputs' and outputs' pointers - \param lead_dim Leading dimension of the tensors. - \param other_dim The size of the other dimensions of the tensors. -*/ -template -Alignment CheckAlignment(const Params& params, const index_t lead_dim, const index_t other_dim) { - int align = -1; - constexpr int nvec = sizeof(LType) / sizeof(DType); - - for (const DType* ptr : params.inputs) { - int new_align = CalcAlignment(ptr); - if (align == -1) { - align = new_align; - } else { - if (align != new_align) { - return Alignment::DIFFERENT; - } - } - } - - for (const DType* ptr : params.outputs) { - int new_align = CalcAlignment(ptr); - if (align == -1) { - align = new_align; - } else { - if (align != new_align) { - return Alignment::DIFFERENT; - } - } - } - - if ((other_dim != 1) && - (lead_dim % nvec != 0)) { - return Alignment::DIFFERENT; - } - - if ((align == 0) && - (lead_dim % nvec == 0)) { - return Alignment::SAME_ALIGNED; - } else { - return Alignment::SAME_UNALIGNED; - } -} - -constexpr int vectorized_kernel_thread_num = 512; - -} // namespace - -/* \brief Helper launcher function for the vectorized kernels. Checks for alignment of the - input and output tensors and launches a proper template. - \param lead_dim Leading dimension of the tensors. - \param other_dim The size of the other dimensions. - \param s Stream which should be used for launching the kernel. - \param params Input parameters to the kernel. Needs to contain at least 2 arrays of DType*: - inputs and outputs, which contain input and output pointers. 
-*/ -template -void VectorizedKernelLauncher(const index_t lead_dim, - const index_t other_dim, - mshadow::Stream* s, - typename Kernel::ParamType params) { - static_assert(sizeof(LType) >= sizeof(DType), "Load type is smaller than operand type"); - if (lead_dim * other_dim != 0) { - cudaStream_t stream = mshadow::Stream::GetStream(s); - VectorizedLoader l(params.inputs[0], lead_dim); - size_t num_elements = other_dim * l.num_aligned_elements(); - constexpr int threads = vectorized_kernel_thread_num; - constexpr int max_blocks = 65535; - index_t blocks = std::min(static_cast((num_elements + threads - 1) / threads), - max_blocks); - auto align = CheckAlignment(params, lead_dim, other_dim); - switch (align) { - case Alignment::SAME_ALIGNED: - Kernel::template Launch(blocks, threads, stream, params, lead_dim, other_dim); - break; - case Alignment::SAME_UNALIGNED: - Kernel::template Launch(blocks, threads, stream, params, lead_dim, other_dim); - break; - case Alignment::DIFFERENT: { - const index_t size = lead_dim * other_dim; - index_t blocks = std::min(static_cast((size + threads - 1) / - threads), - max_blocks); - // If the pointers are aligned differently we cannot vectorize - Kernel::template Launch(blocks, threads, stream, params, lead_dim, other_dim); - break; - } - } - } -} - -} // namespace cuda -} // namespace common -} // namespace mxnet - -#endif // MXNET_USE_CUDA && __CUDACC__ - -#endif // MXNET_COMMON_CUDA_VECTORIZATION_CUH_ diff --git a/src/operator/mshadow_op.h b/src/operator/mshadow_op.h index e0bbb4e5c935..2d4d49254676 100644 --- a/src/operator/mshadow_op.h +++ b/src/operator/mshadow_op.h @@ -730,8 +730,22 @@ MXNET_BINARY_MATH_OP(rminus, b - a); MXNET_BINARY_MATH_OP(div_grad, 1.0f / math::id(b)); +template<> +MSHADOW_XINLINE mshadow::half::half2_t div_grad::Map + (mshadow::half::half2_t a, + mshadow::half::half2_t b) { + return mshadow::half::half2_t(1) / b; +} + MXNET_BINARY_MATH_OP(div_rgrad, -math::id(a) / math::sqr(b)); +template<> +MSHADOW_XINLINE mshadow::half::half2_t div_rgrad::Map + (mshadow::half::half2_t a, + mshadow::half::half2_t b) { + return -a / (b * b); +} + MXNET_BINARY_MATH_OP(rdiv, math::id(b) / math::id(a)); MXNET_BINARY_MATH_OP(rdiv_grad, -math::id(b) / math::sqr(a)); @@ -782,6 +796,13 @@ struct mod : public mxnet_op::tunable { }; +template<> +MSHADOW_XINLINE mshadow::half::half2_t mod::Map + (mshadow::half::half2_t a, + mshadow::half::half2_t b) { + return a%b; +} + struct mod_grad : public mxnet_op::tunable { template MSHADOW_XINLINE static DType Map(DType a, DType b) { @@ -803,6 +824,19 @@ MSHADOW_XINLINE mshadow::half::half_t mod_grad::Map mshadow::half::half_t b) { return mshadow::half::half_t(1.0f); } +template<> +MSHADOW_XINLINE mshadow::half::half2_t mod_grad::Map + (mshadow::half::half2_t a, + mshadow::half::half2_t b) { + mshadow::half::half2_t result = mshadow::half::half2_t(); +#if (defined(__CUDACC__) && MSHADOW_CUDA_HALF2) + result.half2_ = ::__float2half2_rn(1.0f); +#else + result.half_t2[0] = mshadow::half::half_t(0.0f); + result.half_t2[1] = mshadow::half::half_t(1.0f); +#endif + return result; +} struct mod_rgrad : public mxnet_op::tunable { template @@ -825,6 +859,19 @@ MSHADOW_XINLINE mshadow::half::half_t mod_rgrad::Map mshadow::half::half_t b) { return mshadow::half::half_t(-::floorf(static_cast(a/b))); } +template<> +MSHADOW_XINLINE mshadow::half::half2_t mod_rgrad::Map + (mshadow::half::half2_t a, + mshadow::half::half2_t b) { +#if (defined(__CUDACC__) && MSHADOW_CUDA_HALF2) + return 
mshadow::half::half2_t(__hneg2(::h2floor((a/b).half2_))); +#else + return mshadow::half::half2_t(mshadow::half::half_t(-::floorf( + static_cast(a.half_t2[0]/b.half_t2[0]))), + mshadow::half::half_t(-::floorf( + static_cast(a.half_t2[1]/b.half_t2[1])))); +#endif +} struct rmod : public mxnet_op::tunable { template @@ -861,6 +908,13 @@ struct rmod : public mxnet_op::tunable { } }; +template<> +MSHADOW_XINLINE mshadow::half::half2_t rmod::Map + (mshadow::half::half2_t a, + mshadow::half::half2_t b) { + return b%a; +} + struct rmod_grad { template MSHADOW_XINLINE static DType Map(DType a, DType b) { @@ -882,6 +936,19 @@ MSHADOW_XINLINE mshadow::half::half_t rmod_grad::Map mshadow::half::half_t b) { return mshadow::half::half_t(-::floorf(static_cast(b/a))); } +template<> +MSHADOW_XINLINE mshadow::half::half2_t rmod_grad::Map + (mshadow::half::half2_t a, + mshadow::half::half2_t b) { +#if (defined(__CUDACC__) && MSHADOW_CUDA_HALF2) + return mshadow::half::half2_t(::__hneg2(::h2floor((b/a).half2_))); +#else + return mshadow::half::half2_t(mshadow::half::half_t(-::floorf( + static_cast(b.half_t2[0]/a.half_t2[0]))), + mshadow::half::half_t(-::floorf( + static_cast(b.half_t2[1]/a.half_t2[1])))); +#endif +} struct clip : public mxnet_op::tunable { template diff --git a/src/operator/tensor/elemwise_binary_op.cuh b/src/operator/tensor/elemwise_binary_op.cuh deleted file mode 100644 index 0bb9fa636f45..000000000000 --- a/src/operator/tensor/elemwise_binary_op.cuh +++ /dev/null @@ -1,322 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! 
- * Copyright (c) 2020 by Contributors - * \file elemwise_binary_op.cuh - * \brief GPU helpers for elementwise operators - */ - -#ifndef MXNET_OPERATOR_TENSOR_ELEMWISE_BINARY_OP_CUH_ -#define MXNET_OPERATOR_TENSOR_ELEMWISE_BINARY_OP_CUH_ - -#include -#include "../operator_common.h" -#include "../../common/cuda_vectorization.cuh" - -#include - -#if MXNET_USE_CUDA - -namespace mxnet { -namespace op { - -namespace binary { - -using common::cuda::VectorizedKernelLauncher; -using common::cuda::VectorizedLoader; -using common::cuda::VectorizedStorer; - -template -struct VectorizedBinaryKernelParams { - const DType* inputs[NumInputs]; - DType* outputs[NumOutputs]; -}; - -template -__global__ void VectorizedBinaryKernelFwd(const VectorizedBinaryKernelParams params, - const index_t N) { - VectorizedLoader loader0(params.inputs[0], N); - VectorizedLoader loader1(params.inputs[1], N); - VectorizedStorer storer(params.outputs[0], N); - - const index_t M = loader0.num_aligned_elements(); - - for (index_t tid = blockIdx.x * blockDim.x + threadIdx.x; - tid < M; - tid += gridDim.x * blockDim.x) { - loader0.load(tid, N); - loader1.load(tid, N); - if (req == kAddTo) { - storer.load(tid, N); - } -#pragma unroll - for (int i = 0; i < loader0.nvec(); ++i) { - DType temp = OP::Map(loader0.separate()[i], - loader1.separate()[i]); - - if (req == kAddTo) { - storer.separate()[i] += temp; - } else { - storer.separate()[i] = temp; - } - } - storer.store(tid, N); - } -} - -template -__global__ void VectorizedBinaryKernelBwdUseNone( - const VectorizedBinaryKernelParams params, - const index_t N) { - VectorizedLoader loader(params.inputs[0], N); - VectorizedStorer lstorer(params.outputs[0], N); - VectorizedStorer rstorer(params.outputs[1], N); - - const index_t M = loader.num_aligned_elements(); - - for (index_t tid = blockIdx.x * blockDim.x + threadIdx.x; - tid < M; - tid += gridDim.x * blockDim.x) { - loader.load(tid, N); - if (lreq == kAddTo) { - lstorer.load(tid, N); - } - if (rreq == kAddTo) { - rstorer.load(tid, N); - } -#pragma unroll - for (int i = 0; i < loader.nvec(); ++i) { - DType inp = loader.separate()[i]; - if (!((std::is_same::value && lreq == kWriteInplace) || - lreq == kNullOp)) { - DType ltemp = LOP::Map(inp); - if (lreq == kAddTo) { - lstorer.separate()[i] += ltemp; - } else { - lstorer.separate()[i] = ltemp; - } - lstorer.store(tid, N); - } - if (!((std::is_same::value && rreq == kWriteInplace) || - rreq == kNullOp)) { - DType rtemp = ROP::Map(inp); - - if (rreq == kAddTo) { - rstorer.separate()[i] += rtemp; - } else { - rstorer.separate()[i] = rtemp; - } - rstorer.store(tid, N); - } - } - } -} - -template -__global__ void VectorizedBinaryKernelBwdUseIn( - const VectorizedBinaryKernelParams params, - const index_t N) { - VectorizedLoader ograd_loader(params.inputs[0], N); - VectorizedLoader linput_loader(params.inputs[1], N); - VectorizedLoader rinput_loader(params.inputs[2], N); - VectorizedStorer lstorer(params.outputs[0], N); - VectorizedStorer rstorer(params.outputs[1], N); - - const index_t M = ograd_loader.num_aligned_elements(); - - for (index_t tid = blockIdx.x * blockDim.x + threadIdx.x; - tid < M; - tid += gridDim.x * blockDim.x) { - ograd_loader.load(tid, N); - linput_loader.load(tid, N); - rinput_loader.load(tid, N); - if (lreq == kAddTo) { - lstorer.load(tid, N); - } - if (rreq == kAddTo) { - rstorer.load(tid, N); - } -#pragma unroll - for (int i = 0; i < ograd_loader.nvec(); ++i) { - DType ograd = ograd_loader.separate()[i]; - DType linput = linput_loader.separate()[i]; - DType rinput 
= rinput_loader.separate()[i]; - if (!(lreq == kNullOp)) { - DType ltemp = ograd * LOP::Map(linput, rinput); - if (lreq == kAddTo) { - lstorer.separate()[i] += ltemp; - } else { - lstorer.separate()[i] = ltemp; - } - lstorer.store(tid, N); - } - if (!(rreq == kNullOp)) { - DType rtemp = ograd * ROP::Map(linput, rinput); - - if (rreq == kAddTo) { - rstorer.separate()[i] += rtemp; - } else { - rstorer.separate()[i] = rtemp; - } - rstorer.store(tid, N); - } - } - } -} - -template -class VectorizedBinaryFwd { - public: - using ParamType = VectorizedBinaryKernelParams; - - template - static void Launch(const index_t blocks, const index_t threads, - cudaStream_t stream, - const ParamType params, const index_t lead_dim, - const index_t /* other_dim */) { - VectorizedBinaryKernelFwd - <<>>(params, lead_dim); - } -}; - -template -class VectorizedBinaryBwdUseNone { - public: - using ParamType = VectorizedBinaryKernelParams; - - template - static void Launch(const index_t blocks, const index_t threads, - cudaStream_t stream, - const ParamType params, const index_t lead_dim, - const index_t /* other_dim */) { - VectorizedBinaryKernelBwdUseNone - <<>>(params, lead_dim); - } -}; - -template -class VectorizedBinaryBwdUseIn { - public: - using ParamType = VectorizedBinaryKernelParams; - - template - static void Launch(const index_t blocks, const index_t threads, - cudaStream_t stream, - const ParamType params, const index_t lead_dim, - const index_t /* other_dim */) { - VectorizedBinaryKernelBwdUseIn - <<>>(params, lead_dim); - } -}; - -} // namespace binary - -template -void ElemwiseBinaryOp::Compute_(const nnvm::NodeAttrs &attrs, - mshadow::Stream *s, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { - using namespace binary; - if (req[0] == kNullOp) return; - CHECK_EQ(inputs.size(), 2U); - CHECK_EQ(outputs.size(), 1U); - MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { - MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { - using LType = uint4; - using Kernel = VectorizedBinaryFwd; - - const index_t size = outputs[0].Size(); - typename Kernel::ParamType params; - params.inputs[0] = inputs[0].dptr(); - params.inputs[1] = inputs[1].dptr(); - params.outputs[0] = outputs[0].dptr(); - - VectorizedKernelLauncher(size, 1, s, params); - }); - }); -} - -template -void ElemwiseBinaryOp::BackwardUseNone_(const nnvm::NodeAttrs &attrs, - mshadow::Stream* s, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { - using namespace binary; - cudaStream_t stream = mshadow::Stream::GetStream(s); - - MSHADOW_TYPE_SWITCH(inputs[0].type_flag_, DType, { - const index_t size = inputs[0].Size(); - if (req[0] != kNullOp || req[1] != kNullOp) { - MXNET_REQ_TYPE_SWITCH(req[0], lreq, { - MXNET_REQ_TYPE_SWITCH(req[1], rreq, { - using LType = uint4; - using Kernel = VectorizedBinaryBwdUseNone; - - typename Kernel::ParamType params; - params.inputs[0] = inputs[0].dptr(); - params.outputs[0] = outputs[0].dptr(); - params.outputs[1] = outputs[1].dptr(); - - VectorizedKernelLauncher(size, 1, s, params); - }); - }); - } - }); -} - -template -void ElemwiseBinaryOp::BackwardUseIn_(const nnvm::NodeAttrs &attrs, - mshadow::Stream* s, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { - using namespace binary; - if (req[0] != kNullOp || req[1] != kNullOp) { - MSHADOW_TYPE_SWITCH(inputs[0].type_flag_, DType, { - MXNET_REQ_TYPE_SWITCH(req[0], lreq, { - MXNET_REQ_TYPE_SWITCH(req[1], rreq, { - const index_t size = inputs[0].Size(); - // Using 64 bit 
loads to reduce register pressure - using LType = uint2; - using Kernel = VectorizedBinaryBwdUseIn; - - typename Kernel::ParamType params; - params.inputs[0] = inputs[0].dptr(); - params.inputs[1] = inputs[1].dptr(); - params.inputs[2] = inputs[2].dptr(); - params.outputs[0] = outputs[0].dptr(); - params.outputs[1] = outputs[1].dptr(); - - VectorizedKernelLauncher(size, 1, s, params); - }); - }); - }); - } -} - -} // namespace op -} // namespace mxnet - -#endif // MXNET_USE_CUDA -#endif // MXNET_OPERATOR_TENSOR_ELEMWISE_BINARY_OP_CUH_ diff --git a/src/operator/tensor/elemwise_binary_op.h b/src/operator/tensor/elemwise_binary_op.h index b9396aee204e..bc5140a5d75f 100644 --- a/src/operator/tensor/elemwise_binary_op.h +++ b/src/operator/tensor/elemwise_binary_op.h @@ -106,85 +106,62 @@ class ElemwiseBinaryOp : public OpBase { } private: - template + template static void BackwardUseNone_(const nnvm::NodeAttrs &attrs, - mshadow::Stream* s, + const OpContext &ctx, const std::vector &inputs, const std::vector &req, const std::vector &outputs) { - MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { - using namespace mxnet_op; - const int size = static_cast((outputs[0].Size() + DataType::kLanes - 1) - / DataType::kLanes); - const DType *ograd_dptr = inputs[0].dptr(); - if (std::is_same::value && req[0] == kWriteInplace) { - CHECK_EQ(ograd_dptr, outputs[0].dptr()); - } else if (req[0] != kNullOp) { - DType *lgrad_dptr = outputs[0].dptr(); - MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { - Kernel, cpu>::Launch(s, size, lgrad_dptr, ograd_dptr); - }); - } - if (std::is_same::value && req[1] == kWriteInplace) { - CHECK_EQ(ograd_dptr, outputs[1].dptr()); - } else if (req[1] != kNullOp) { - DType *rgrad_dptr = outputs[1].dptr(); - MXNET_ASSIGN_REQ_SWITCH(req[1], Req, { - Kernel, cpu>::Launch(s, size, rgrad_dptr, ograd_dptr); - }); - } - }); - } -#if MXNET_USE_CUDA - template - static void BackwardUseNone_(const nnvm::NodeAttrs &attrs, - mshadow::Stream* s, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs); -#endif - - template - static void BackwardUseIn_(const nnvm::NodeAttrs &attrs, - mshadow::Stream* s, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { - MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { - DCHECK_EQ(outputs.size(), 2U); - DCHECK_EQ(inputs.size(), 3U); - const DType *ograd_dptr = inputs[0].dptr(); - const DType *lhs_dptr = inputs[1].dptr(); - const DType *rhs_dptr = inputs[2].dptr(); + using namespace mxnet_op; + Stream *s = ctx.get_stream(); + const int size = static_cast((outputs[0].Size() + DataType::kLanes - 1) + / DataType::kLanes); + const DType *ograd_dptr = inputs[0].dptr(); + if (std::is_same::value && req[0] == kWriteInplace) { + CHECK_EQ(ograd_dptr, outputs[0].dptr()); + } else if (req[0] != kNullOp) { + DType *lgrad_dptr = outputs[0].dptr(); MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { - const int size = static_cast( - (outputs[0].Size() + mxnet_op::DataType::kLanes - 1) - / mxnet_op::DataType::kLanes); - DType * lgrad_dptr = outputs[0].dptr(); - mxnet_op::Kernel< - mxnet_op::op_with_req, Req>, cpu>::Launch( - s, size, lgrad_dptr, ograd_dptr, lhs_dptr, rhs_dptr); + Kernel, xpu>::Launch(s, size, lgrad_dptr, ograd_dptr); }); + } + if (std::is_same::value && req[1] == kWriteInplace) { + CHECK_EQ(ograd_dptr, outputs[1].dptr()); + } else if (req[1] != kNullOp) { + DType *rgrad_dptr = outputs[1].dptr(); MXNET_ASSIGN_REQ_SWITCH(req[1], Req, { - const int size = static_cast( - (outputs[1].Size() + mxnet_op::DataType::kLanes 
- 1) - / mxnet_op::DataType::kLanes); - DType * rgrad_dptr = outputs[1].dptr(); - mxnet_op::Kernel< - mxnet_op::op_with_req, Req>, cpu>::Launch( - s, size, rgrad_dptr, ograd_dptr, lhs_dptr, rhs_dptr); + Kernel, xpu>::Launch(s, size, rgrad_dptr, ograd_dptr); }); - }); + } } -#if MXNET_USE_CUDA - template + template static void BackwardUseIn_(const nnvm::NodeAttrs &attrs, - mshadow::Stream* s, + const OpContext &ctx, const std::vector &inputs, const std::vector &req, - const std::vector &outputs); -#endif + const std::vector &outputs) { + DCHECK_EQ(outputs.size(), 2U); + DCHECK_EQ(inputs.size(), 3U); + mxnet_op::Stream *s = ctx.get_stream(); + const DType *ograd_dptr = inputs[0].dptr(); + const DType *lhs_dptr = inputs[1].dptr(); + const DType *rhs_dptr = inputs[2].dptr(); + MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { + const int size = static_cast( + (outputs[0].Size() + mxnet_op::DataType::kLanes - 1) + / mxnet_op::DataType::kLanes); + DType * lgrad_dptr = outputs[0].dptr(); + mxnet_op::Kernel, Req>, xpu>::Launch( + s, size, lgrad_dptr, ograd_dptr, lhs_dptr, rhs_dptr);}); + MXNET_ASSIGN_REQ_SWITCH(req[1], Req, { + const int size = static_cast( + (outputs[1].Size() + mxnet_op::DataType::kLanes - 1) + / mxnet_op::DataType::kLanes); + DType * rgrad_dptr = outputs[1].dptr(); + mxnet_op::Kernel, Req>, xpu>::Launch( + s, size, rgrad_dptr, ograd_dptr, lhs_dptr, rhs_dptr);}); + } template< typename xpu, @@ -521,13 +498,15 @@ class ElemwiseBinaryOp : public OpBase { }); } - template - static void Compute_(const nnvm::NodeAttrs &attrs, - mshadow::Stream *s, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { + template + static void Compute(const nnvm::NodeAttrs &attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { using namespace mxnet_op; + if (req[0] == kNullOp) return; + Stream *s = ctx.get_stream(); CHECK_EQ(inputs.size(), 2U); CHECK_EQ(outputs.size(), 1U); if (outputs[0].type_flag_ == mshadow::kBool) { @@ -538,7 +517,7 @@ class ElemwiseBinaryOp : public OpBase { const size_t size = (minthree(outputs[0].Size(), inputs[0].Size(), inputs[1].Size()) + DataType::kLanes - 1) / DataType::kLanes; if (size != 0) { - Kernel, cpu>::Launch(s, size, + Kernel, xpu>::Launch(s, size, outputs[0].dptr(), inputs[0].dptr(), inputs[1].dptr()); } @@ -546,26 +525,6 @@ class ElemwiseBinaryOp : public OpBase { }); } -#if MXNET_USE_CUDA - template - static void Compute_(const nnvm::NodeAttrs &attrs, - mshadow::Stream *s, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs); -#endif - - template - static void Compute(const nnvm::NodeAttrs &attrs, - const OpContext &ctx, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { - if (req[0] == kNullOp) return; - mshadow::Stream *s = ctx.get_stream(); - Compute_(attrs, s, inputs, req, outputs); - } - template static void ComputeWithBool(const nnvm::NodeAttrs &attrs, const OpContext &ctx, @@ -615,6 +574,30 @@ class ElemwiseBinaryOp : public OpBase { }); } + template + static void ComputeWithHalf2(const nnvm::NodeAttrs &attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + using namespace mxnet_op; + if (req[0] == kNullOp) return; + Stream *s = ctx.get_stream(); + CHECK_EQ(inputs.size(), 2U); + CHECK_EQ(outputs.size(), 1U); + MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { + MSHADOW_TYPE_SWITCH_WITH_HALF2(outputs[0].type_flag_, DType, { + const size_t size = 
(minthree(outputs[0].Size(), inputs[0].Size(), inputs[1].Size()) + + DataType::kLanes - 1) / DataType::kLanes; + if (size != 0) { + Kernel, xpu>::Launch(s, size, + outputs[0].dptr(), + inputs[0].dptr(), inputs[1].dptr()); + } + }); + }); + } + template static void ComputeEx(const nnvm::NodeAttrs &attrs, const OpContext &ctx, @@ -711,8 +694,20 @@ class ElemwiseBinaryOp : public OpBase { const std::vector &inputs, const std::vector &req, const std::vector &outputs) { - mshadow::Stream *s = ctx.get_stream(); - BackwardUseNone_(attrs, s, inputs, req, outputs); + MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { + BackwardUseNone_(attrs, ctx, inputs, req, outputs); + }); + } + + template + static inline void BackwardUseNoneWithHalf2(const nnvm::NodeAttrs &attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + MSHADOW_TYPE_SWITCH_WITH_HALF2(outputs[0].type_flag_, DType, { + BackwardUseNone_(attrs, ctx, inputs, req, outputs); + }); } template @@ -756,8 +751,20 @@ class ElemwiseBinaryOp : public OpBase { const std::vector &inputs, const std::vector &req, const std::vector &outputs) { - mshadow::Stream *s = ctx.get_stream(); - BackwardUseIn_(attrs, s, inputs, req, outputs); + MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { + BackwardUseIn_(attrs, ctx, inputs, req, outputs); + }); + } + + template + static inline void BackwardUseInWithHalf2(const nnvm::NodeAttrs &attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + MSHADOW_TYPE_SWITCH_WITH_HALF2(outputs[0].type_flag_, DType, { + BackwardUseIn_(attrs, ctx, inputs, req, outputs); + }); } template< @@ -856,9 +863,4 @@ class ElemwiseBinaryOp : public OpBase { } // namespace op } // namespace mxnet - -#ifdef __CUDACC__ -#include "elemwise_binary_op.cuh" -#endif // __CUDACC__ - #endif // MXNET_OPERATOR_TENSOR_ELEMWISE_BINARY_OP_H_ diff --git a/src/operator/tensor/elemwise_binary_op_basic.cu b/src/operator/tensor/elemwise_binary_op_basic.cu index b21b08d03217..16d7fc1ad72b 100644 --- a/src/operator/tensor/elemwise_binary_op_basic.cu +++ b/src/operator/tensor/elemwise_binary_op_basic.cu @@ -218,51 +218,52 @@ void ElemwiseBinaryOp::DnsCsrDnsOp(mshadow::Stream *s, } NNVM_REGISTER_OP(elemwise_add) -.set_attr("FCompute", ElemwiseBinaryOp::Compute) +.set_attr("FCompute", ElemwiseBinaryOp::ComputeWithHalf2) .set_attr("FComputeEx", ElemwiseBinaryOp::ComputeEx); NNVM_REGISTER_OP(_grad_add) -.set_attr("FCompute", ElemwiseBinaryOp::Compute); +.set_attr("FCompute", ElemwiseBinaryOp::ComputeWithHalf2); NNVM_REGISTER_OP(_backward_add) .set_attr("FCompute", - ElemwiseBinaryOp::BackwardUseNone); NNVM_REGISTER_OP(elemwise_sub) -.set_attr("FCompute", ElemwiseBinaryOp::Compute) +.set_attr("FCompute", ElemwiseBinaryOp::ComputeWithHalf2< + gpu, op::mshadow_op::minus>) .set_attr("FComputeEx", ElemwiseBinaryOp::ComputeEx); NNVM_REGISTER_OP(_backward_sub) .set_attr("FCompute", - ElemwiseBinaryOp::BackwardUseNone); NNVM_REGISTER_OP(elemwise_mul) -.set_attr("FCompute", ElemwiseBinaryOp::Compute) +.set_attr("FCompute", ElemwiseBinaryOp::ComputeWithHalf2) .set_attr("FComputeEx", ElemwiseBinaryOp::ComputeDnsLRValueEx); NNVM_REGISTER_OP(_backward_mul) .set_attr("FCompute", - ElemwiseBinaryOp::BackwardUseIn); NNVM_REGISTER_OP(elemwise_div) .set_attr("FCompute", - ElemwiseBinaryOp::Compute); + ElemwiseBinaryOp::ElemwiseBinaryOp::ComputeWithHalf2); NNVM_REGISTER_OP(_backward_div) .set_attr("FCompute", - ElemwiseBinaryOp::BackwardUseIn); 
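[Editor's note, not part of the patch] A minimal usage sketch of the half2 path that the registrations above switch these GPU ops back to. It mirrors the core of the ComputeWithHalf2 body restored earlier in elemwise_binary_op.h; the template arguments, which this rendering of the patch drops inside angle brackets, are reconstructed here and should be read as an assumption. For kFloat16 the macro binds DType to mshadow::half::half2_t, DataType<DType>::kLanes is 2, so the launch count is halved and each kernel invocation handles one packed pair of halves.

  MXNET_ASSIGN_REQ_SWITCH(req[0], Req, {
    MSHADOW_TYPE_SWITCH_WITH_HALF2(outputs[0].type_flag_, DType, {
      // For fp16 tensors DType is half2_t, so `size` counts packed pairs
      // rather than scalar elements.
      const size_t size = (minthree(outputs[0].Size(), inputs[0].Size(), inputs[1].Size())
                           + DataType<DType>::kLanes - 1) / DataType<DType>::kLanes;
      if (size != 0) {
        Kernel<mxnet_op::op_with_req<OP, Req>, xpu>::Launch(
            s, size, outputs[0].dptr<DType>(),
            inputs[0].dptr<DType>(), inputs[1].dptr<DType>());
      }
    });
  });
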
NNVM_REGISTER_OP(_mod) -.set_attr("FCompute", ElemwiseBinaryOp::Compute); +.set_attr("FCompute", ElemwiseBinaryOp::ComputeWithHalf2); NNVM_REGISTER_OP(_backward_mod) .set_attr("FCompute", - ElemwiseBinaryOp::BackwardUseIn); + ElemwiseBinaryOp::BackwardUseInWithHalf2); } // namespace op } // namespace mxnet diff --git a/src/operator/tensor/elemwise_binary_scalar_op.cuh b/src/operator/tensor/elemwise_binary_scalar_op.cuh deleted file mode 100644 index 062c18767ac6..000000000000 --- a/src/operator/tensor/elemwise_binary_scalar_op.cuh +++ /dev/null @@ -1,207 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * Copyright (c) 2020 by Contributors - * \file elemwise_binary_scalar_op.cuh - * \brief GPU helpers for binary elementwise operators with scalar - */ - -#ifndef MXNET_OPERATOR_TENSOR_ELEMWISE_BINARY_SCALAR_OP_CUH_ -#define MXNET_OPERATOR_TENSOR_ELEMWISE_BINARY_SCALAR_OP_CUH_ - -#include -#include "../operator_common.h" -#include "../../common/cuda_vectorization.cuh" - -#include - -#if MXNET_USE_CUDA - -namespace mxnet { -namespace op { - -namespace binary_scalar { - -using common::cuda::VectorizedKernelLauncher; -using common::cuda::VectorizedLoader; -using common::cuda::VectorizedStorer; - -template -struct VectorizedKernelParams { - const DType* inputs[NumInputs]; - DType* outputs[NumOutputs]; - DType scalar; -}; - -template -__global__ void VectorizedBinaryScalarKernelFwd(const VectorizedKernelParams params, - const index_t N) { - VectorizedLoader loader0(params.inputs[0], N); - VectorizedStorer storer(params.outputs[0], N); - - const index_t M = loader0.num_aligned_elements(); - - for (index_t tid = blockIdx.x * blockDim.x + threadIdx.x; - tid < M; - tid += gridDim.x * blockDim.x) { - loader0.load(tid, N); - if (req == kAddTo) { - storer.load(tid, N); - } -#pragma unroll - for (int i = 0; i < loader0.nvec(); ++i) { - DType temp = OP::Map(loader0.separate()[i], - params.scalar); - - if (req == kAddTo) { - storer.separate()[i] += temp; - } else { - storer.separate()[i] = temp; - } - } - storer.store(tid, N); - } -} - -template -__global__ void VectorizedBinaryScalarKernelBwd(const VectorizedKernelParams params, - const index_t N) { - VectorizedLoader ograd_loader(params.inputs[0], N); - VectorizedLoader input_loader(params.inputs[1], N); - VectorizedStorer storer(params.outputs[0], N); - - const index_t M = ograd_loader.num_aligned_elements(); - - for (index_t tid = blockIdx.x * blockDim.x + threadIdx.x; - tid < M; - tid += gridDim.x * blockDim.x) { - ograd_loader.load(tid, N); - input_loader.load(tid, N); - if (req == kAddTo) { - storer.load(tid, N); - } -#pragma unroll - for (int i = 0; i < ograd_loader.nvec(); ++i) { - DType ograd = ograd_loader.separate()[i]; - DType temp = ograd * 
OP::Map(input_loader.separate()[i], - params.scalar); - - if (req == kAddTo) { - storer.separate()[i] += temp; - } else { - storer.separate()[i] = temp; - } - } - storer.store(tid, N); - } -} - -template -class VectorizedBinaryScalarFwd { - public: - using ParamType = VectorizedKernelParams; - - template - static void Launch(const index_t blocks, const index_t threads, - cudaStream_t stream, - const ParamType params, const index_t lead_dim, - const index_t /* other_dim */) { - VectorizedBinaryScalarKernelFwd - <<>>(params, lead_dim); - } -}; - -template -class VectorizedBinaryScalarBwd { - public: - using ParamType = VectorizedKernelParams; - - template - static void Launch(const index_t blocks, const index_t threads, - cudaStream_t stream, - const ParamType params, const index_t lead_dim, - const index_t /* other_dim */) { - VectorizedBinaryScalarKernelBwd - <<>>(params, lead_dim); - } -}; - -} // namespace binary_scalar - -template -void BinaryScalarOp::Compute_(const nnvm::NodeAttrs &attrs, - mshadow::Stream* s, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { - using namespace binary_scalar; - if (req[0] == kNullOp) return; - CHECK_EQ(inputs.size(), 1U); - CHECK_EQ(outputs.size(), 1U); - const double alpha = nnvm::get(attrs.parsed); - MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { - MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { - using LType = uint4; - using Kernel = VectorizedBinaryScalarFwd; - - const index_t size = outputs[0].Size(); - typename Kernel::ParamType params; - params.inputs[0] = inputs[0].dptr(); - params.outputs[0] = outputs[0].dptr(); - params.scalar = (DType)alpha; - - VectorizedKernelLauncher(size, 1, s, params); - }); - }); -} - -template -void BinaryScalarOp::Backward_(const nnvm::NodeAttrs &attrs, - mshadow::Stream* s, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { - using namespace binary_scalar; - if (req[0] == kNullOp) return; - CHECK_EQ(inputs.size(), 2U); - CHECK_EQ(outputs.size(), 1U); - const double alpha = nnvm::get(attrs.parsed); - MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { - MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { - using LType = uint4; - using Kernel = VectorizedBinaryScalarBwd; - - const index_t size = outputs[0].Size(); - typename Kernel::ParamType params; - params.inputs[0] = inputs[0].dptr(); - params.inputs[1] = inputs[1].dptr(); - params.outputs[0] = outputs[0].dptr(); - params.scalar = (DType)alpha; - - VectorizedKernelLauncher(size, 1, s, params); - }); - }); -} - -} // namespace op -} // namespace mxnet - -#endif // MXNET_USE_CUDA -#endif // MXNET_OPERATOR_TENSOR_ELEMWISE_BINARY_SCALAR_OP_CUH_ diff --git a/src/operator/tensor/elemwise_binary_scalar_op.h b/src/operator/tensor/elemwise_binary_scalar_op.h index 53161ee2354f..4eaaff09d83d 100644 --- a/src/operator/tensor/elemwise_binary_scalar_op.h +++ b/src/operator/tensor/elemwise_binary_scalar_op.h @@ -225,44 +225,26 @@ class BinaryScalarOp : public UnaryOp { } public: - template - static void Compute_(const nnvm::NodeAttrs &attrs, - mshadow::Stream* s, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { + template + static void Compute(const nnvm::NodeAttrs &attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { DCHECK_EQ(inputs.size(), 1); DCHECK_EQ(outputs.size(), 1); using namespace mshadow; using namespace mshadow::expr; + Stream *s = ctx.get_stream(); const double alpha = nnvm::get(attrs.parsed); 
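+  // [Editor's annotation, not part of the original patch] `attrs.parsed`
+  // carries the operator's scalar argument as a double; the
+  // MSHADOW_TYPE_SWITCH just below re-casts it to the tensor's element type
+  // via DType(alpha) before the element-wise kernel launch. For example
+  // (illustrative, assuming the usual registration), _plus_scalar with
+  // scalar=3 on a float16 input uses OP = mshadow_op::plus and computes
+  // out[i] = OP::Map(in[i], DType(3.0)) for every element.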
MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { - mxnet_op::Kernel, cpu>::Launch( + mxnet_op::Kernel, xpu>::Launch( s, inputs[0].Size(), outputs[0].dptr(), inputs[0].dptr(), DType(alpha)); }); }); } -#if MXNET_USE_CUDA - template - static void Compute_(const nnvm::NodeAttrs &attrs, - mshadow::Stream* s, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs); -#endif - - template - static void Compute(const nnvm::NodeAttrs &attrs, - const OpContext &ctx, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { - mshadow::Stream *s = ctx.get_stream(); - Compute_(attrs, s, inputs, req, outputs); - } - template static void ComputeInt(const nnvm::NodeAttrs &attrs, const OpContext &ctx, @@ -354,46 +336,26 @@ class BinaryScalarOp : public UnaryOp { } } - template - static void Backward_(const nnvm::NodeAttrs &attrs, - mshadow::Stream* s, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { + template + static void Backward(const nnvm::NodeAttrs &attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { using namespace mshadow; using namespace mshadow::expr; + Stream *s = ctx.get_stream(); const double alpha = nnvm::get(attrs.parsed); MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { mxnet::op::mxnet_op::Kernel, Req>, cpu>:: + mxnet::op::mxnet_op::backward_grad_tuned, Req>, xpu>:: Launch(s, inputs[0].Size(), outputs[0].dptr(), inputs[0].dptr(), inputs[1].dptr(), DType(alpha)); }); }); } - -#if MXNET_USE_CUDA - template - static void Backward_(const nnvm::NodeAttrs &attrs, - mshadow::Stream* s, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs); -#endif - - template - static void Backward(const nnvm::NodeAttrs &attrs, - const OpContext &ctx, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { - using namespace mshadow; - using namespace mshadow::expr; - Stream *s = ctx.get_stream(); - Backward_(attrs, s, inputs, req, outputs); - } }; #define MXNET_OPERATOR_REGISTER_BINARY_SCALAR(name) \ @@ -414,9 +376,4 @@ class BinaryScalarOp : public UnaryOp { } // namespace op } // namespace mxnet - -#ifdef __CUDACC__ -#include "elemwise_binary_scalar_op.cuh" -#endif - #endif // MXNET_OPERATOR_TENSOR_ELEMWISE_BINARY_SCALAR_OP_H_ diff --git a/src/operator/tensor/elemwise_binary_scalar_op_basic.cu b/src/operator/tensor/elemwise_binary_scalar_op_basic.cu index 3fd017f09ec7..3c839205683a 100644 --- a/src/operator/tensor/elemwise_binary_scalar_op_basic.cu +++ b/src/operator/tensor/elemwise_binary_scalar_op_basic.cu @@ -57,19 +57,22 @@ NNVM_REGISTER_OP(_rdiv_scalar) .set_attr("FCompute", BinaryScalarOp::Compute); NNVM_REGISTER_OP(_backward_rdiv_scalar) -.set_attr("FCompute", BinaryScalarOp::Backward); +.set_attr("FCompute", BinaryScalarOp::Backward); NNVM_REGISTER_OP(_mod_scalar) .set_attr("FCompute", BinaryScalarOp::Compute); NNVM_REGISTER_OP(_backward_mod_scalar) -.set_attr("FCompute", BinaryScalarOp::Backward); +.set_attr("FCompute", BinaryScalarOp::Backward< + gpu, mshadow_op::mod_grad>); NNVM_REGISTER_OP(_rmod_scalar) .set_attr("FCompute", BinaryScalarOp::Compute); NNVM_REGISTER_OP(_backward_rmod_scalar) -.set_attr("FCompute", BinaryScalarOp::Backward); +.set_attr("FCompute", BinaryScalarOp::Backward< + gpu, mshadow_op::rmod_grad>); } // namespace op } // namespace mxnet diff --git 
a/src/operator/tensor/elemwise_binary_scalar_op_extended.cu b/src/operator/tensor/elemwise_binary_scalar_op_extended.cu index f09e40a2eee7..2bd52d7b9d7c 100644 --- a/src/operator/tensor/elemwise_binary_scalar_op_extended.cu +++ b/src/operator/tensor/elemwise_binary_scalar_op_extended.cu @@ -44,25 +44,30 @@ NNVM_REGISTER_OP(_power_scalar) .set_attr("FCompute", BinaryScalarOp::Compute); NNVM_REGISTER_OP(_backward_power_scalar) -.set_attr("FCompute", BinaryScalarOp::Backward); +.set_attr("FCompute", BinaryScalarOp::Backward< + gpu, mshadow_op::power_grad>); NNVM_REGISTER_OP(_rpower_scalar) .set_attr("FCompute", BinaryScalarOp::Compute); NNVM_REGISTER_OP(_backward_rpower_scalar) -.set_attr("FCompute", BinaryScalarOp::Backward); +.set_attr("FCompute", BinaryScalarOp::Backward< + gpu, mshadow_op::rpower_grad>); NNVM_REGISTER_OP(_hypot_scalar) .set_attr("FCompute", BinaryScalarOp::Compute); NNVM_REGISTER_OP(_backward_hypot_scalar) -.set_attr("FCompute", BinaryScalarOp::Backward); +.set_attr("FCompute", BinaryScalarOp::Backward< + gpu, mshadow_op::hypot_grad_left>); NNVM_REGISTER_OP(smooth_l1) -.set_attr("FCompute", BinaryScalarOp::Compute); +.set_attr("FCompute", BinaryScalarOp::Compute< + gpu, mshadow_op::smooth_l1_loss>); NNVM_REGISTER_OP(_backward_smooth_l1) -.set_attr("FCompute", BinaryScalarOp::Backward); +.set_attr("FCompute", BinaryScalarOp::Backward< + gpu, mshadow_op::smooth_l1_gradient>); } // namespace op } // namespace mxnet diff --git a/src/operator/tensor/elemwise_sum.cu b/src/operator/tensor/elemwise_sum.cu index 352c74ea9445..f9a248214e85 100644 --- a/src/operator/tensor/elemwise_sum.cu +++ b/src/operator/tensor/elemwise_sum.cu @@ -24,118 +24,10 @@ */ #include "./elemwise_sum.h" #include "../../ndarray/ndarray_function.h" -#include "../../common/cuda_vectorization.cuh" namespace mxnet { namespace op { -using common::cuda::VectorizedKernelLauncher; -using common::cuda::VectorizedLoader; -using common::cuda::VectorizedStorer; - -namespace { - -constexpr size_t num_inputs_per_kernel = 4; - -template -struct VectorizedElementwiseSumKernelParams { - int num_inputs; - const DType* inputs[NumInputs]; - DType* outputs[1]; -}; - -template -__launch_bounds__(mxnet::common::cuda::vectorized_kernel_thread_num) -__global__ void VectorizedElementwiseSumKernel( - const VectorizedElementwiseSumKernelParams params, - const index_t N) { - VectorizedStorer storer(params.outputs[0], N); - - const index_t M = storer.num_aligned_elements(); - - for (index_t tid = blockIdx.x * blockDim.x + threadIdx.x; - tid < M; - tid += gridDim.x * blockDim.x) { - if (req == kAddTo) { - storer.load(tid, N); - } else { -#pragma unroll - for (int i = 0; i < storer.nvec(); ++i) { - storer.separate()[i] = 0; - } - } -#pragma unroll - for (int i = 0; i < num_inputs_per_kernel; ++i) { - if (i < params.num_inputs) { - VectorizedLoader loader(params.inputs[i], N); - loader.load(tid, N); -#pragma unroll - for (int i = 0; i < loader.nvec(); ++i) { - storer.separate()[i] += loader.separate()[i]; - } - } - } - - storer.store(tid, N); - } -} - - -template -class VectorizedElementwiseSumFwd { - public: - using ParamType = VectorizedElementwiseSumKernelParams; - - template - static void Launch(const index_t blocks, const index_t threads, - cudaStream_t stream, - const ParamType params, const index_t lead_dim, - const index_t /* other_dim */) { - VectorizedElementwiseSumKernel - <<>>(params, lead_dim); - } -}; - -void VectorizedElementwiseSum(const nnvm::NodeAttrs &attrs, - const OpContext &ctx, - const std::vector &inputs, - const 
std::vector &req, - const std::vector &outputs) { - mshadow::Stream *s = ctx.get_stream(); - if (req[0] == kNullOp) return; - CHECK_EQ(outputs.size(), 1U); - MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { - MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { - using LType = uint2; - const index_t size = inputs[0].Size(); - for (size_t i = 0; i < inputs.size(); i += num_inputs_per_kernel) { - if (i == 0) { - using Kernel = VectorizedElementwiseSumFwd; - typename Kernel::ParamType params; - params.num_inputs = std::min(num_inputs_per_kernel, inputs.size() - i); - for (int j = 0; j < params.num_inputs; ++j) { - params.inputs[j] = inputs[i + j].dptr(); - } - params.outputs[0] = outputs[0].dptr(); - VectorizedKernelLauncher(size, 1, s, params); - } else { - /* During subsequent launches we need to - accumulate into the previous outputs - */ - using Kernel = VectorizedElementwiseSumFwd; - typename Kernel::ParamType params; - params.num_inputs = std::min(num_inputs_per_kernel, inputs.size() - i); - for (int j = 0; j < params.num_inputs; ++j) { - params.inputs[j] = inputs[i + j].dptr(); - } - params.outputs[0] = outputs[0].dptr(); - VectorizedKernelLauncher(size, 1, s, params); - } - } - }); - }); -} - void ElementWiseSumComputeExGPU(const nnvm::NodeAttrs& attrs, const OpContext& ctx, const std::vector& inputs, @@ -159,10 +51,8 @@ void ElementWiseSumComputeExGPU(const nnvm::NodeAttrs& attrs, } } -} // namespace - NNVM_REGISTER_OP(add_n) -.set_attr("FCompute", VectorizedElementwiseSum) +.set_attr("FCompute", ElementWiseSumComputeWithHalf2) .set_attr("FComputeEx", ElementWiseSumComputeExGPU); } // namespace op diff --git a/src/operator/tensor/elemwise_sum.h b/src/operator/tensor/elemwise_sum.h index d40ab4de0f0f..259c80ddddac 100644 --- a/src/operator/tensor/elemwise_sum.h +++ b/src/operator/tensor/elemwise_sum.h @@ -113,6 +113,18 @@ void ElementWiseSumCompute(const nnvm::NodeAttrs& attrs, }); } +template +void ElementWiseSumComputeWithHalf2(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + CHECK_EQ(outputs.size(), 1U); + MSHADOW_TYPE_SWITCH_WITH_HALF2(outputs[0].type_flag_, DType, { + ElementWiseSumCompute_(attrs, ctx, inputs, req, outputs); + }); +} + } // namespace op } // namespace mxnet #endif // MXNET_OPERATOR_TENSOR_ELEMWISE_SUM_H_ diff --git a/src/operator/tensor/elemwise_unary_op.cuh b/src/operator/tensor/elemwise_unary_op.cuh deleted file mode 100644 index 8688a8b8ac66..000000000000 --- a/src/operator/tensor/elemwise_unary_op.cuh +++ /dev/null @@ -1,127 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! 
- * Copyright (c) 2020 by Contributors - * \file elemwise_unary_op.cuh - * \brief GPU helpers for unary elementwise operators - */ - -#ifndef MXNET_OPERATOR_TENSOR_ELEMWISE_UNARY_OP_CUH_ -#define MXNET_OPERATOR_TENSOR_ELEMWISE_UNARY_OP_CUH_ - -#include -#include "../operator_common.h" -#include "../../common/cuda_vectorization.cuh" - -#include - -#if MXNET_USE_CUDA - -namespace mxnet { -namespace op { - -namespace unary { - -using common::cuda::VectorizedKernelLauncher; -using common::cuda::VectorizedLoader; -using common::cuda::VectorizedStorer; - -template -struct VectorizedKernelParams { - const DType* inputs[NumInputs]; - DType* outputs[NumOutputs]; -}; - -template -__global__ void VectorizedUnaryScalarKernelFwd(const VectorizedKernelParams params, - const index_t N) { - VectorizedLoader loader(params.inputs[0], N); - VectorizedStorer storer(params.outputs[0], N); - - const index_t M = loader.num_aligned_elements(); - - for (index_t tid = blockIdx.x * blockDim.x + threadIdx.x; - tid < M; - tid += gridDim.x * blockDim.x) { - loader.load(tid, N); - if (req == kAddTo) { - storer.load(tid, N); - } -#pragma unroll - for (int i = 0; i < loader.nvec(); ++i) { - DType temp = OP::Map(loader.separate()[i]); - - if (req == kAddTo) { - storer.separate()[i] += temp; - } else { - storer.separate()[i] = temp; - } - } - storer.store(tid, N); - } -} - -template -class VectorizedUnaryScalarFwd { - public: - using ParamType = VectorizedKernelParams; - - template - static void Launch(const index_t blocks, const index_t threads, - cudaStream_t stream, - const ParamType params, const index_t lead_dim, - const index_t /* other_dim */) { - VectorizedUnaryScalarKernelFwd - <<>>(params, lead_dim); - } -}; - -} // namespace unary - -template -void UnaryOp::Compute_(const nnvm::NodeAttrs& attrs, - mshadow::Stream* s, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - using namespace unary; - if (req[0] == kNullOp) return; - CHECK_EQ(inputs.size(), 1U); - CHECK_EQ(outputs.size(), 1U); - MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { - MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { - using LType = uint4; - using Kernel = VectorizedUnaryScalarFwd; - - const index_t size = outputs[0].Size(); - typename Kernel::ParamType params; - params.inputs[0] = inputs[0].dptr(); - params.outputs[0] = outputs[0].dptr(); - - VectorizedKernelLauncher(size, 1, s, params); - }); - }); -} - -} // namespace op -} // namespace mxnet - -#endif // MXNET_USE_CUDA -#endif // MXNET_OPERATOR_TENSOR_ELEMWISE_UNARY_OP_CUH_ diff --git a/src/operator/tensor/elemwise_unary_op.h b/src/operator/tensor/elemwise_unary_op.h index 86686c6f1278..dcbd53aac69b 100644 --- a/src/operator/tensor/elemwise_unary_op.h +++ b/src/operator/tensor/elemwise_unary_op.h @@ -235,32 +235,6 @@ class UnaryOp : public OpBase { } } - template - static void Compute_(const nnvm::NodeAttrs& attrs, - mshadow::Stream* s, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { - MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { - if (inputs[0].Size() != 0) { - mxnet_op::Kernel, cpu>::Launch( - s, inputs[0].Size(), outputs[0].dptr(), inputs[0].dptr()); - } - }); - }); - } - -#if MXNET_USE_CUDA - template - static void Compute_(const nnvm::NodeAttrs& attrs, - mshadow::Stream* s, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs); - -#endif - template static void Compute(const nnvm::NodeAttrs& attrs, const OpContext& ctx, @@ -268,7 +242,14 @@ 
class UnaryOp : public OpBase { const std::vector& req, const std::vector& outputs) { mshadow::Stream *s = ctx.get_stream(); - Compute_(attrs, s, inputs, req, outputs); + MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { + MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { + if (inputs[0].Size() != 0) { + mxnet_op::Kernel, xpu>::Launch( + s, inputs[0].Size(), outputs[0].dptr(), inputs[0].dptr()); + } + }); + }); } template @@ -363,6 +344,23 @@ class UnaryOp : public OpBase { } #endif + template + static void ComputeWithHalf2(const nnvm::NodeAttrs &attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + using namespace mshadow; + using namespace mxnet_op; + Stream *s = ctx.get_stream(); + CHECK_EQ(inputs.size(), 1U); + CHECK_EQ(outputs.size(), 1U); + MSHADOW_TYPE_SWITCH_WITH_HALF2(outputs[0].type_flag_, DType, { + Kernel::Launch(s, outputs[0].Size(), + outputs[0].dptr(), inputs[0].dptr()); + }); + } + template static void IdentityCompute(const nnvm::NodeAttrs& attrs, const OpContext& ctx, @@ -879,8 +877,4 @@ void NumpyNanToNumOpBackward(const nnvm::NodeAttrs& attrs, } // namespace op } // namespace mxnet -#ifdef __CUDACC__ -#include "elemwise_unary_op.cuh" -#endif - #endif // MXNET_OPERATOR_TENSOR_ELEMWISE_UNARY_OP_H_ diff --git a/src/operator/tensor/elemwise_unary_op_basic.cu b/src/operator/tensor/elemwise_unary_op_basic.cu index 7c0550735519..e5b60b1726e6 100644 --- a/src/operator/tensor/elemwise_unary_op_basic.cu +++ b/src/operator/tensor/elemwise_unary_op_basic.cu @@ -22,7 +22,6 @@ * \brief GPU Implementation of unary functions. */ #include "./elemwise_binary_op.h" -#include "./elemwise_unary_op.h" namespace mxnet { namespace op { diff --git a/src/operator/tensor/elemwise_unary_op_pow.cu b/src/operator/tensor/elemwise_unary_op_pow.cu index 287a2e87be73..4dbdf349cdb0 100644 --- a/src/operator/tensor/elemwise_unary_op_pow.cu +++ b/src/operator/tensor/elemwise_unary_op_pow.cu @@ -22,7 +22,6 @@ * \brief GPU Implementation of power (x^k for fixed k) functions. */ #include "./elemwise_binary_op.h" -#include "./elemwise_unary_op.h" namespace mxnet { namespace op { diff --git a/src/operator/tensor/elemwise_unary_op_trig.cu b/src/operator/tensor/elemwise_unary_op_trig.cu index f5e9d1ccbd6c..8e28b9c609fa 100644 --- a/src/operator/tensor/elemwise_unary_op_trig.cu +++ b/src/operator/tensor/elemwise_unary_op_trig.cu @@ -22,7 +22,6 @@ * \brief GPU Implementation of unary trigonometric function. 
*/ #include "./elemwise_binary_op.h" -#include "./elemwise_unary_op.h" namespace mxnet { namespace op { diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index e22d529eeb41..c73b8456240b 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -9895,85 +9895,6 @@ def test_elemwise_sum_for_gradient_accumulation(): assert stored_grad['write'] == stored_grad['add'] assert stored_grad['write'] == 2 * nrepeat -@with_seed() -def test_elementwise_ops_on_misaligned_input(): - a = mx.nd.array([1,2,3,4], dtype='float16') - b = mx.nd.array([1,2,3,4], dtype='float16') - - c = a[1:3] - d = b[1:3] - # Note: testing just elemwise_add since all elemwise_ops - # share the implementation - mx.nd.elemwise_add(c, d, out=c) - mx.nd.waitall() - - a = mx.nd.array([1,2,3,4], dtype='float16') - b = mx.nd.array([1,2,3,4], dtype='float16') - - c = a[0:3] - d = b[0:3] - mx.nd.elemwise_add(c, d, out=c) - mx.nd.waitall() - assert a[3].asscalar() == 4.0 - -@with_seed() -def test_broadcast_ops_on_misaligned_input(): - dtypes = ['float16', 'float32', 'float64'] - lead_dims = [2,3,4,6,10] - - for dtype in dtypes: - for lead_dim in lead_dims: - for both_ways in [False, True]: - shape = list(rand_shape_2d()) + [lead_dim] - small_shape = [shape[0], 1, lead_dim] - if both_ways: - # Broadcast in both ways [1, K, L] x [M, 1, L] - big_shape = [1, shape[1], lead_dim] - else: - big_shape = shape - size = np.product(shape) - small_size = np.product(small_shape) - big_size = np.product(big_shape) - a = mx.nd.arange(5000) - b = mx.nd.arange(5000) - e = mx.nd.arange(5000) - c = a[1:big_size + 1].reshape(big_shape) - d = b[1:small_size + 1].reshape(small_shape) - f = e[1:size + 1].reshape(shape) - mx.nd.broadcast_add(c, d, out=f) - expected = c.asnumpy() + d.asnumpy() - mx.nd.waitall() - assert_almost_equal(f, expected) - -@with_seed() -def test_broadcast_ops_on_misaligned_input_oneside(): - dtypes = ['float16', 'float32', 'float64'] - lead_dims = [2,3,4,6,10] - - for dtype in dtypes: - for lead_dim in lead_dims: - for both_ways in [False, True]: - shape = list(rand_shape_2d()) + [lead_dim] - small_shape = [shape[0], shape[1], 1] - if both_ways: - # Broadcast in both ways [1, K, L] x [M, 1, 1] - big_shape = [1, shape[1], lead_dim] - else: - big_shape = shape - size = np.product(shape) - small_size = np.product(small_shape) - big_size = np.product(big_shape) - a = mx.nd.arange(5000) - b = mx.nd.arange(5000) - e = mx.nd.arange(5000) - c = a[1:big_size + 1].reshape(big_shape) - d = b[1:small_size + 1].reshape(small_shape) - f = e[1:size + 1].reshape(shape) - mx.nd.broadcast_add(c, d, out=f) - expected = c.asnumpy() + d.asnumpy() - mx.nd.waitall() - assert_almost_equal(f, expected) - def test_scalarop_locale_invariance(): arr = mx.nd.zeros((1,)) prev = locale.getlocale(locale.LC_NUMERIC) @@ -9993,7 +9914,7 @@ def test_scalarop_locale_invariance(): break except locale.Error as e: print("Couldn't enable locale", loc, ": ", str(e)) - + if locale_set: scalar = 0.3 assert "," in locale.str(scalar)
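For reference on the code path restored above: with the vectorized kernels reverted, fp16 element-wise operators again dispatch through MSHADOW_TYPE_SWITCH_WITH_HALF2, which maps kFloat16 to mshadow::half::half2_t so each loop iteration carries two half values (kLanes = 2). The standalone sketch below only illustrates that two-per-lane idea and is not the mshadow implementation; half2_like and elemwise_add_2lane are hypothetical names, and plain float stands in for half_t so the example builds without CUDA or the mshadow headers.

// Minimal sketch of the "two halves per lane" layout behind half2_t.
// Assumptions: half2_like and elemwise_add_2lane are illustrative only;
// float replaces half_t; odd-length handling is omitted for brevity.
#include <cstddef>
#include <iostream>
#include <vector>

struct half2_like {
  float v[2];  // mirrors the CPU fallback in half2.h: half_t half_t2[2]
  half2_like operator+(const half2_like& o) const {
    return {{v[0] + o.v[0], v[1] + o.v[1]}};
  }
};

// One unit of work covers two packed values, so a length-N fp16
// tensor is traversed as N/2 packets.
void elemwise_add_2lane(const half2_like* a, const half2_like* b,
                        half2_like* out, std::size_t n_packets) {
  for (std::size_t i = 0; i < n_packets; ++i) {
    out[i] = a[i] + b[i];
  }
}

int main() {
  std::vector<half2_like> a = {{{1.f, 2.f}}, {{3.f, 4.f}}};
  std::vector<half2_like> b = {{{10.f, 20.f}}, {{30.f, 40.f}}};
  std::vector<half2_like> out(a.size());
  elemwise_add_2lane(a.data(), b.data(), out.data(), a.size());
  for (const auto& p : out)
    std::cout << p.v[0] << ' ' << p.v[1] << '\n';  // prints 11 22, then 33 44
  return 0;
}

The deleted Python tests above exercised misaligned inputs for the vectorized kernels introduced in #17767; since this revert removes those kernels, the tests are removed along with them.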