Skip to content
This repository was archived by the owner on Nov 17, 2023. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 27 additions & 1 deletion src/common/cuda_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,16 @@ inline const char* CurandGetErrorString(curandStatus_t status) {
return "Unknown cuRAND status";
}

/*!
 * \brief Device-side maximum of two values.
 * \param a first value
 * \param b second value
 * \return the larger of \p a and \p b; when neither compares greater
 *         (e.g. a NaN operand for floating types), \p b is returned,
 *         matching the original `a > b ? a : b` form.
 */
template <typename DType>
inline DType __device__ CudaMax(DType a, DType b) {
  if (a > b) {
    return a;
  }
  return b;
}

/*!
 * \brief Device-side minimum of two values.
 * \param a first value
 * \param b second value
 * \return the smaller of \p a and \p b; when neither compares less
 *         (e.g. a NaN operand for floating types), \p b is returned,
 *         matching the original `a < b ? a : b` form.
 */
template <typename DType>
inline DType __device__ CudaMin(DType a, DType b) {
  if (a < b) {
    return a;
  }
  return b;
}

} // namespace cuda
} // namespace common
} // namespace mxnet
Expand Down Expand Up @@ -219,6 +229,14 @@ inline const char* CurandGetErrorString(curandStatus_t status) {
<< "cuRAND: " << common::cuda::CurandGetErrorString(e); \
}

/*!
 * \brief Loop-unrolling hint macros for device code.
 *
 * CUDA_UNROLL asks the compiler to unroll the following loop and
 * CUDA_NOUNROLL asks it not to. On MSVC the standard C99/C++11
 * `_Pragma` operator is not supported (MSVC historically only offers
 * its own `__pragma` extension), so both macros expand to nothing
 * there and the hint is simply dropped — NOTE(review): `__pragma`
 * could be used on MSVC instead; confirm whether the hint matters
 * for that toolchain.
 */
#if !defined(_MSC_VER)
#define CUDA_UNROLL _Pragma("unroll")
#define CUDA_NOUNROLL _Pragma("nounroll")
#else
#define CUDA_UNROLL
#define CUDA_NOUNROLL
#endif

/*!
* \brief Determine major version number of the gpu's cuda compute architecture.
* \param device_id The device index of the cuda-capable gpu of interest.
Expand Down Expand Up @@ -291,7 +309,6 @@ inline bool GetEnvAllowTensorCore() {
return dmlc::GetEnv("MXNET_CUDA_ALLOW_TENSOR_CORE",
dmlc::optional<bool>(default_value)).value();
}

#endif // MXNET_USE_CUDA

#if MXNET_USE_CUDNN
Expand Down Expand Up @@ -401,6 +418,15 @@ static inline __device__ void atomicAdd(mshadow::half::half_t *address,
old = atomicCAS(address_as_ui, assumed, old);
} while (assumed != old);
}

/*!
 * \brief Load a value from device memory via the read-only cache path
 *        when available.
 * \param address pointer to the value to load; `__ldg` requires the
 *        pointed-to data to be read-only for the duration of the kernel
 *        — TODO(review): confirm all callers honor this.
 * \return the loaded value.
 *
 * On devices of compute capability 3.5 or higher this routes the load
 * through `__ldg`; otherwise (including the host compilation pass,
 * where __CUDA_ARCH__ is undefined and evaluates to 0 here) it falls
 * back to a plain dereference.
 */
template <typename DType>
__device__ inline DType ldg(const DType* address) {
#if __CUDA_ARCH__ >= 350
  return __ldg(address);
#else
  return *address;
#endif
}
#endif

#endif // MXNET_COMMON_CUDA_UTILS_H_
14 changes: 14 additions & 0 deletions src/operator/convolution.cu
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@
#include "./cudnn_convolution-inl.h"
#endif // MXNET_USE_CUDNN

#include "./depthwise_convolution-inl.h"

namespace mxnet {
namespace op {

Expand All @@ -45,6 +47,18 @@ Operator* CreateOp<gpu>(ConvolutionParam param, int dtype,
})
return op;
}

// depth wise conv
if (param.num_filter == param.num_group &&
param.layout.value() == mshadow::kNCHW &&
param.num_filter == (*in_shape)[conv::kData][1] &&
param.kernel.ndim() == 2 &&
param.dilate == mshadow::Shape2(1, 1) &&
dtype == mshadow::kFloat32) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

any reason for limiting to float32?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We are not processing in the cuda kernel when dtype==mshadow::kFloat16

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there plan to support dilation with depthwise kernel? It is used in mobilenet v2 + deeplabv3 for segmentation. Tensorflow has efficient implementation. mxnet is much slower in this case.

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@crazy-cat Will you please implement support for dilation rate > 1?

op = new DepthwiseConvolutionOp<float>(param, *in_shape, *out_shape);
return op;
}

#if MXNET_USE_CUDNN == 1
// The NVIDIA Pascal architecture was the first to include 16-bit ALUs.
// Thus, when the framework is compiled with MSHADOW_USE_PASCAL == 1, we
Expand Down
Loading