[CI] run operator tests with naive engine (#18252)

* run operator tests with naive engine
* fix take tests
* update skip mark
* fix cuda error reset
* adjust tests
* disable parallel testing and naive engine for mkl/mkldnn #18244
apache · May 17, 2020 · 10b6b48 · 10b6b48
1 parent 9482728
commit 10b6b48
Show file tree

Hide file tree

Showing 45 changed files with 496 additions and 448 deletions.
diff --git a/.github/workflows/os_x_staticbuild.yml b/.github/workflows/os_x_staticbuild.yml
@@ -22,4 +22,6 @@ jobs:
           python3 -m pip install --user -e python
       - name: Test project
         run: |
-          python3 -m pytest --durations=50 --verbose tests/python/unittest/ -k 'not (test_subgraph or test_custom_op or test_recordimage_dataset_with_data_loader_multiworker or test_multi_worker or test_multi_worker_shape or test_multi_worker_forked_data_loader or test_multi_worker_dataloader_release_pool)'
+          python3 -m pytest -n 4 --durations=50 --verbose tests/python/unittest/ -k 'not test_operator and not (test_subgraph or test_custom_op or test_recordimage_dataset_with_data_loader_multiworker or test_multi_worker or test_multi_worker_shape or test_multi_worker_forked_data_loader or test_multi_worker_dataloader_release_pool)' -m 'not serial'
+          MXNET_ENGINE_TYPE=NaiveEngine python3 -m pytest -n 4 --durations=50 --verbose tests/python/unittest/ -k 'test_operator and not (test_subgraph or test_custom_op or test_recordimage_dataset_with_data_loader_multiworker or test_multi_worker or test_multi_worker_shape or test_multi_worker_forked_data_loader or test_multi_worker_dataloader_release_pool)' -m 'not serial'
+          python3 -m pytest --durations=50 --verbose tests/python/unittest/ -k 'not (test_subgraph or test_custom_op or test_recordimage_dataset_with_data_loader_multiworker or test_multi_worker or test_multi_worker_shape or test_multi_worker_forked_data_loader or test_multi_worker_dataloader_release_pool)' -m 'serial'
diff --git a/3rdparty/mshadow/mshadow/base.h b/3rdparty/mshadow/mshadow/base.h
@@ -251,7 +251,7 @@ extern "C" {
     if (e == cudaErrorCudartUnloading) {                           \
       throw dmlc::Error(cudaGetErrorString(e));                    \
     }                                                              \
-    CHECK(e == cudaSuccess)                                        \
+    CHECK_EQ(e, cudaSuccess)                                       \
         << "CUDA: " << cudaGetErrorString(e);                      \
   }
 

diff --git a/3rdparty/mshadow/mshadow/cuda/tensor_gpu-inl.cuh b/3rdparty/mshadow/mshadow/cuda/tensor_gpu-inl.cuh
@@ -35,7 +35,7 @@
 #define MSHADOW_CUDA_POST_KERNEL_CHECK(x) \
   /* Code block avoids redefinition of cudaError_t err */ \
   do { \
-    cudaError err = cudaPeekAtLastError(); \
+    cudaError err = cudaGetLastError(); \
     CHECK_EQ(err, cudaSuccess) << "Name: " << #x << " ErrStr:" << cudaGetErrorString(err); \
   } while (0)
 namespace mshadow {

diff --git a/3rdparty/mshadow/mshadow/half2.h b/3rdparty/mshadow/mshadow/half2.h
diff --git a/3rdparty/mshadow/mshadow/tensor.h b/3rdparty/mshadow/mshadow/tensor.h
diff --git a/3rdparty/mshadow/mshadow/tensor_cpu-inl.h b/3rdparty/mshadow/mshadow/tensor_cpu-inl.h
diff --git a/3rdparty/mshadow/mshadow/tensor_gpu-inl.h b/3rdparty/mshadow/mshadow/tensor_gpu-inl.h
diff --git a/tests/python/unittest/test_metric_perf.py → benchmark/python/metric/benchmark_metric.py b/tests/python/unittest/test_metric_perf.py → benchmark/python/metric/benchmark_metric.py
diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
@@ -944,11 +944,12 @@ cd_unittest_ubuntu() {
     export MXNET_ENABLE_CYTHON=0
     export CD_JOB=1 # signal this is a CD run so any unnecessary tests can be skipped
     export DMLC_LOG_STACK_TRACE_DEPTH=10
-    export MXNET_GPU_MEM_POOL_TYPE=Unpooled
 
     local mxnet_variant=${1:?"This function requires a mxnet variant as the first argument"}
 
-    pytest -m 'not serial' -n 4 --durations=50 --verbose tests/python/unittest
+    pytest -m 'not serial' -k 'not test_operator' -n 4 --durations=50 --verbose tests/python/unittest
+    MXNET_ENGINE_TYPE=NaiveEngine \
+        pytest -m 'not serial' -k 'test_operator' -n 4 --durations=50 --verbose tests/python/unittest
     pytest -m 'serial' --durations=50 --verbose tests/python/unittest
     pytest -n 4 --durations=50 --verbose tests/python/quantization
 
@@ -958,11 +959,16 @@ cd_unittest_ubuntu() {
     # fi
 
     if [[ ${mxnet_variant} = cu* ]]; then
-        pytest -m 'not serial' -n 4 --durations=50 --verbose tests/python/gpu
+        MXNET_GPU_MEM_POOL_TYPE=Unpooled \
+        MXNET_ENGINE_TYPE=NaiveEngine \
+            pytest -m 'not serial' -k 'test_operator' -n 4 --durations=50 --verbose tests/python/gpu
+        MXNET_GPU_MEM_POOL_TYPE=Unpooled \
+            pytest -m 'not serial' -k 'not test_operator' -n 4 --durations=50 --verbose tests/python/gpu
         pytest -m 'serial' --durations=50 --verbose tests/python/gpu
 
         # Adding these here as CI doesn't test all CUDA environments
-        pytest -n 4 example/image-classification/test_score.py
+        MXNET_GPU_MEM_POOL_TYPE=Unpooled \
+            pytest -n 4 example/image-classification/test_score.py
         # TODO(szha): fix and reenable the hanging issue. tracked in #18098
         # integrationtest_ubuntu_gpu_dist_kvstore
     fi
@@ -980,11 +986,26 @@ unittest_ubuntu_python3_cpu() {
     export MXNET_SUBGRAPH_VERBOSE=0
     export MXNET_ENABLE_CYTHON=0
     export DMLC_LOG_STACK_TRACE_DEPTH=10
-    pytest -m 'not serial' -n 4 --durations=50 --cov-report xml:tests_unittest.xml --verbose tests/python/unittest
+    pytest -m 'not serial' -k 'not test_operator' -n 4 --durations=50 --cov-report xml:tests_unittest.xml --verbose tests/python/unittest
+    MXNET_ENGINE_TYPE=NaiveEngine \
+        pytest -m 'not serial' -k 'test_operator' -n 4 --durations=50 --cov-report xml:tests_unittest.xml --cov-append --verbose tests/python/unittest
     pytest -m 'serial' --durations=50 --cov-report xml:tests_unittest.xml --cov-append --verbose tests/python/unittest
     pytest -n 4 --durations=50 --cov-report xml:tests_quantization.xml --verbose tests/python/quantization
 }
 
+unittest_ubuntu_python3_cpu_serial() {
+    # TODO(szha): delete this and switch to unittest_ubuntu_python3_cpu once #18244 is fixed
+    set -ex
+    export PYTHONPATH=./python/
+    export MXNET_MKLDNN_DEBUG=0  # Ignored if not present
+    export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
+    export MXNET_SUBGRAPH_VERBOSE=0
+    export MXNET_ENABLE_CYTHON=0
+    export DMLC_LOG_STACK_TRACE_DEPTH=10
+    pytest --durations=50 --cov-report xml:tests_unittest.xml --verbose tests/python/unittest
+    pytest --durations=50 --cov-report xml:tests_quantization.xml --verbose tests/python/quantization
+}
+
 unittest_ubuntu_python3_cpu_mkldnn() {
     set -ex
     export PYTHONPATH=./python/
@@ -993,9 +1014,9 @@ unittest_ubuntu_python3_cpu_mkldnn() {
     export MXNET_SUBGRAPH_VERBOSE=0
     export MXNET_ENABLE_CYTHON=0
     export DMLC_LOG_STACK_TRACE_DEPTH=10
-    pytest -m 'not serial' -n 4 --durations=50 --cov-report xml:tests_unittest.xml --verbose tests/python/unittest
-    pytest -m 'serial' --durations=50 --cov-report xml:tests_unittest.xml --cov-append --verbose tests/python/unittest
-    pytest -n 4 --durations=50 --cov-report xml:tests_mkl.xml --verbose tests/python/mkl
+    # TODO(szha): enable parallel testing and naive engine for ops once #18244 is fixed
+    pytest --durations=50 --cov-report xml:tests_unittest.xml --verbose tests/python/unittest
+    pytest --durations=50 --cov-report xml:tests_mkl.xml --verbose tests/python/mkl
 }
 
 unittest_ubuntu_python3_gpu() {
@@ -1007,8 +1028,11 @@ unittest_ubuntu_python3_gpu() {
     export CUDNN_VERSION=${CUDNN_VERSION:-7.0.3}
     export MXNET_ENABLE_CYTHON=0
     export DMLC_LOG_STACK_TRACE_DEPTH=10
-    export MXNET_GPU_MEM_POOL_TYPE=Unpooled
-    pytest -m 'not serial' -n 4 --durations=50 --cov-report xml:tests_gpu.xml --verbose tests/python/gpu
+    MXNET_GPU_MEM_POOL_TYPE=Unpooled \
+        pytest -m 'not serial' -k 'not test_operator' -n 4 --durations=50 --cov-report xml:tests_gpu.xml --verbose tests/python/gpu
+    MXNET_GPU_MEM_POOL_TYPE=Unpooled \
+    MXNET_ENGINE_TYPE=NaiveEngine \
+        pytest -m 'not serial' -k 'test_operator' -n 4 --durations=50 --cov-report xml:tests_gpu.xml --cov-append --verbose tests/python/gpu
     pytest -m 'serial' --durations=50 --cov-report xml:tests_gpu.xml --cov-append --verbose tests/python/gpu
 }
 
@@ -1022,9 +1046,12 @@ unittest_ubuntu_python3_gpu_cython() {
     export MXNET_ENABLE_CYTHON=1
     export MXNET_ENFORCE_CYTHON=1
     export DMLC_LOG_STACK_TRACE_DEPTH=10
-    export MXNET_GPU_MEM_POOL_TYPE=Unpooled
     check_cython
-    pytest -m 'not serial' -n 4 --durations=50 --cov-report xml:tests_gpu.xml --verbose tests/python/gpu
+    MXNET_GPU_MEM_POOL_TYPE=Unpooled \
+        pytest -m 'not serial' -k 'not test_operator' -n 4 --durations=50 --cov-report xml:tests_gpu.xml --verbose tests/python/gpu
+    MXNET_GPU_MEM_POOL_TYPE=Unpooled \
+    MXNET_ENGINE_TYPE=NaiveEngine \
+        pytest -m 'not serial' -k 'test_operator' -n 4 --durations=50 --cov-report xml:tests_gpu.xml --cov-append --verbose tests/python/gpu
     pytest -m 'serial' --durations=50 --cov-report xml:tests_gpu.xml --cov-append --verbose tests/python/gpu
 }
 
@@ -1036,8 +1063,11 @@ unittest_ubuntu_python3_gpu_nocudnn() {
     export CUDNN_OFF_TEST_ONLY=true
     export MXNET_ENABLE_CYTHON=0
     export DMLC_LOG_STACK_TRACE_DEPTH=10
-    export MXNET_GPU_MEM_POOL_TYPE=Unpooled
-    pytest -m 'not serial' -n 4 --durations=50 --cov-report xml:tests_gpu.xml --verbose tests/python/gpu
+    MXNET_GPU_MEM_POOL_TYPE=Unpooled \
+        pytest -m 'not serial' -k 'not test_operator' -n 4 --durations=50 --cov-report xml:tests_gpu.xml --verbose tests/python/gpu
+    MXNET_GPU_MEM_POOL_TYPE=Unpooled \
+    MXNET_ENGINE_TYPE=NaiveEngine \
+        pytest -m 'not serial' -k 'test_operator' -n 4 --durations=50 --cov-report xml:tests_gpu.xml --cov-append --verbose tests/python/gpu
     pytest -m 'serial' --durations=50 --cov-report xml:tests_gpu.xml --cov-append --verbose tests/python/gpu
 }
 
@@ -1050,9 +1080,9 @@ unittest_ubuntu_tensorrt_gpu() {
     export CUDNN_VERSION=${CUDNN_VERSION:-7.0.3}
     export MXNET_ENABLE_CYTHON=0
     export DMLC_LOG_STACK_TRACE_DEPTH=10
-    export MXNET_GPU_MEM_POOL_TYPE=Unpooled
     python3 tests/python/tensorrt/lenet5_train.py
-    pytest -n 4 --durations=50 --cov-report xml:tests_trt_gpu.xml --verbose --capture=no tests/python/tensorrt/test_ops.py
+    MXNET_GPU_MEM_POOL_TYPE=Unpooled \
+        pytest -n 4 --durations=50 --cov-report xml:tests_trt_gpu.xml --verbose --capture=no tests/python/tensorrt/test_ops.py
     pytest -k 'not test_ops' --durations=50 --cov-report xml:tests_trt_gpu.xml --cov-append --verbose --capture=no tests/python/tensorrt/
 }
 
@@ -1070,8 +1100,8 @@ unittest_ubuntu_python3_quantization_gpu() {
     export CUDNN_VERSION=${CUDNN_VERSION:-7.0.3}
     export MXNET_ENABLE_CYTHON=0
     export DMLC_LOG_STACK_TRACE_DEPTH=10
-    export MXNET_GPU_MEM_POOL_TYPE=Unpooled
-    pytest -n 4 --durations=50 --cov-report xml:tests_quantization_gpu.xml --verbose tests/python/quantization_gpu
+    MXNET_GPU_MEM_POOL_TYPE=Unpooled \
+        pytest -n 4 --durations=50 --cov-report xml:tests_quantization_gpu.xml --verbose tests/python/quantization_gpu
 }
 
 unittest_centos7_cpu_scala() {
@@ -1213,7 +1243,9 @@ unittest_centos7_cpu() {
     set -ex
     source /opt/rh/rh-python36/enable
     cd /work/mxnet
-    python -m pytest -m 'not serial' -n 4 --durations=50 --cov-report xml:tests_unittest.xml --verbose tests/python/unittest
+    python -m pytest -m 'not serial' -k 'not test_operator' -n 4 --durations=50 --cov-report xml:tests_unittest.xml --verbose tests/python/unittest
+    MXNET_ENGINE_TYPE=NaiveEngine \
+        python -m pytest -m 'not serial' -k 'test_operator' -n 4 --durations=50 --cov-report xml:tests_unittest.xml --cov-append --verbose tests/python/unittest
     python -m pytest -m 'serial' --durations=50 --cov-report xml:tests_unittest.xml --cov-append --verbose tests/python/unittest
     python -m pytest -n 4 --durations=50 --cov-report xml:tests_train.xml --verbose tests/python/train
 }
@@ -1224,8 +1256,11 @@ unittest_centos7_gpu() {
     cd /work/mxnet
     export CUDNN_VERSION=${CUDNN_VERSION:-7.0.3}
     export DMLC_LOG_STACK_TRACE_DEPTH=10
-    export MXNET_GPU_MEM_POOL_TYPE=Unpooled
-    pytest -m 'not serial' -n 4 --durations=50 --cov-report xml:tests_gpu.xml --verbose tests/python/gpu
+    MXNET_GPU_MEM_POOL_TYPE=Unpooled \
+        pytest -m 'not serial' -k 'not test_operator' -n 4 --durations=50 --cov-report xml:tests_gpu.xml --cov-append --verbose tests/python/gpu
+    MXNET_GPU_MEM_POOL_TYPE=Unpooled \
+    MXNET_ENGINE_TYPE=NaiveEngine \
+        pytest -m 'not serial' -k 'test_operator' -n 4 --durations=50 --cov-report xml:tests_gpu.xml --cov-append --verbose tests/python/gpu
     pytest -m 'serial' --durations=50 --cov-report xml:tests_gpu.xml --cov-append --verbose tests/python/gpu
 }
 
@@ -1342,7 +1377,9 @@ test_ubuntu_cpu_python3() {
     cd /work/mxnet/python
     pip3 install -e .
     cd /work/mxnet
-    python3 -m pytest -m 'not serial' -n 4 --durations=50 --verbose tests/python/unittest
+    python3 -m pytest -m 'not serial' -k 'not test_operator' -n 4 --durations=50 --verbose tests/python/unittest
+    MXNET_ENGINE_TYPE=NaiveEngine \
+        python3 -m pytest -m 'not serial' -k 'test_operator' -n 4 --durations=50 --verbose tests/python/unittest
     python3 -m pytest -m 'serial' --durations=50 --verbose tests/python/unittest
 
     popd

diff --git a/ci/jenkins/Jenkins_steps.groovy b/ci/jenkins/Jenkins_steps.groovy
@@ -50,6 +50,12 @@ def python3_ut(docker_container_name) {
   }
 }
 
+def python3_ut_serial(docker_container_name) {
+  timeout(time: max_time, unit: 'MINUTES') {
+    utils.docker_run(docker_container_name, 'unittest_ubuntu_python3_cpu_serial', false)
+  }
+}
+
 def python3_ut_mkldnn(docker_container_name) {
   timeout(time: max_time, unit: 'MINUTES') {
     utils.docker_run(docker_container_name, 'unittest_ubuntu_python3_cpu_mkldnn', false)
@@ -803,7 +809,7 @@ def test_unix_python3_mkl_cpu(lib_name) {
         ws('workspace/ut-python3-cpu') {
           try {
             utils.unpack_and_init(lib_name, mx_lib, true)
-            python3_ut('ubuntu_cpu')
+            python3_ut_serial('ubuntu_cpu')
             utils.publish_test_coverage()
           } finally {
             utils.collect_test_results_unix('tests_unittest.xml', 'tests_python3_cpu_unittest.xml')

diff --git a/src/engine/naive_engine.cc b/src/engine/naive_engine.cc
@@ -183,6 +183,7 @@ class NaiveEngine final : public Engine {
     if (exec_ctx.dev_mask() == gpu::kDevMask) {
 #if MXNET_USE_CUDA
       size_t dev_id = static_cast<size_t>(exec_ctx.dev_id);
+      cudaGetLastError();  // reset cuda error
       MSHADOW_CATCH_ERROR(mshadow::SetDevice<gpu>(exec_ctx.dev_id));
       if (streams_.size() <= dev_id) {
         streams_.resize(dev_id + 1, nullptr);

diff --git a/src/operator/bilinear_sampler.cu b/src/operator/bilinear_sampler.cu
@@ -178,7 +178,7 @@ inline void BilinearSamplerForward(const Tensor<gpu, 4, DType> &output,
     cuda::BilinearSamplerForwardKernel<DType> << <num_blocks, threads_per_block, 0, stream >> >(
       i_c, i_h, i_w, data, grid, o_n, o_c, o_h, o_w, out);
     // post kernel check
-    cudaError err = cudaPeekAtLastError();
+    cudaError err = cudaGetLastError();
     CHECK_EQ(err, cudaSuccess) << cudaGetErrorString(err);
 }
 
@@ -217,7 +217,7 @@ inline void BilinearSamplerBackward(const Tensor<gpu, 4, DType> &input_grad,
     });
   });
   // post kernel check
-  cudaError err = cudaPeekAtLastError();
+  cudaError err = cudaGetLastError();
   CHECK_EQ(err, cudaSuccess) << cudaGetErrorString(err);
 }
 

diff --git a/src/operator/contrib/deformable_psroi_pooling.cu b/src/operator/contrib/deformable_psroi_pooling.cu
@@ -183,7 +183,7 @@ namespace cuda {
                    no_trans, trans_std, sample_per_part, output_dim,
                    group_size, part_size, num_classes,
                    channels_each_class, top_data, top_count_data);
-    DeformablePSROIPOOLING_CUDA_CHECK(cudaPeekAtLastError());
+    DeformablePSROIPOOLING_CUDA_CHECK(cudaGetLastError());
   }
 
 
@@ -354,7 +354,7 @@ namespace cuda {
                     bottom_data, bottom_rois, bottom_trans,
                     no_trans, trans_std, sample_per_part, group_size,
                     part_size, num_classes, channels_each_class);
-    DeformablePSROIPOOLING_CUDA_CHECK(cudaPeekAtLastError());
+    DeformablePSROIPOOLING_CUDA_CHECK(cudaGetLastError());
   }
 
 }  // namespace cuda

diff --git a/src/operator/contrib/multi_proposal.cu b/src/operator/contrib/multi_proposal.cu
@@ -348,7 +348,7 @@ void _nms(mshadow::Stream<gpu> *s,
                                   nms_overlap_thresh,
                                   boxes_dev,
                                   mask_dev);
-  FRCNN_CUDA_CHECK(cudaPeekAtLastError());
+  FRCNN_CUDA_CHECK(cudaGetLastError());
   std::vector<uint64_t> mask_host(boxes_num * col_blocks);
 
   cudaStream_t stream = mshadow::Stream<gpu>::GetStream(s);
@@ -494,7 +494,7 @@ class MultiProposalGPUOp : public Operator{
     ProposalGridKernel<<<dimGrid, dimBlock>>>(
       count, num_anchors, height, width, param_.feature_stride,
       scores.dptr_, workspace_proposals.dptr_);
-    FRCNN_CUDA_CHECK(cudaPeekAtLastError());
+    FRCNN_CUDA_CHECK(cudaGetLastError());
 
     // Transform anchors and bbox_deltas into bboxes
     CheckLaunchParam(dimGrid, dimBlock, "BBoxPred");
@@ -507,13 +507,13 @@ class MultiProposalGPUOp : public Operator{
         count, num_anchors, height, width, param_.feature_stride, im_info.dptr_,
         workspace_proposals.dptr_, bbox_deltas.dptr_, workspace_proposals.dptr_);
     }
-    FRCNN_CUDA_CHECK(cudaPeekAtLastError());
+    FRCNN_CUDA_CHECK(cudaGetLastError());
 
     // filter boxes with less than rpn_min_size
     CheckLaunchParam(dimGrid, dimBlock, "FilterBox");
     FilterBoxKernel<<<dimGrid, dimBlock>>>(
       count, count_anchors, param_.rpn_min_size, im_info.dptr_, workspace_proposals.dptr_);
-    FRCNN_CUDA_CHECK(cudaPeekAtLastError());
+    FRCNN_CUDA_CHECK(cudaGetLastError());
 
 
 
@@ -541,15 +541,15 @@ class MultiProposalGPUOp : public Operator{
       CopyScoreKernel << <dimGrid, dimBlock >> >(
           count_anchors, workspace_proposals.dptr_ + b * count_anchors * 5,
           score.dptr_, order.dptr_);
-      FRCNN_CUDA_CHECK(cudaPeekAtLastError());
+      FRCNN_CUDA_CHECK(cudaGetLastError());
 
       // argsort score, save order
       thrust::stable_sort_by_key(thrust::device,
           score.dptr_,
           score.dptr_ + score.size(0),
           order.dptr_,
           thrust::greater<real_t>());
-      FRCNN_CUDA_CHECK(cudaPeekAtLastError());
+      FRCNN_CUDA_CHECK(cudaGetLastError());
 
       // Reorder proposals according to order
 
@@ -558,7 +558,7 @@ class MultiProposalGPUOp : public Operator{
       ReorderProposalsKernel << <dimGrid, dimBlock >> >(
           rpn_pre_nms_top_n, workspace_proposals.dptr_ + b * count_anchors * 5,
           order.dptr_, workspace_ordered_proposals.dptr_);
-      FRCNN_CUDA_CHECK(cudaPeekAtLastError());
+      FRCNN_CUDA_CHECK(cudaGetLastError());
 
       // perform nms
       std::vector<int> _keep(workspace_ordered_proposals.size(0));
@@ -580,7 +580,7 @@ class MultiProposalGPUOp : public Operator{
           param_.rpn_post_nms_top_n, workspace_ordered_proposals.dptr_, keep, out_size, b,
           out.dptr_ + b * param_.rpn_post_nms_top_n * 5,
           out_score.dptr_ + b * param_.rpn_post_nms_top_n);
-      FRCNN_CUDA_CHECK(cudaPeekAtLastError());
+      FRCNN_CUDA_CHECK(cudaGetLastError());
     }
     // free temporary memory
     FRCNN_CUDA_CHECK(cudaFree(keep));

diff --git a/src/operator/contrib/multibox_detection.cu b/src/operator/contrib/multibox_detection.cu
@@ -230,7 +230,7 @@ inline void MultiBoxDetectionForward(const Tensor<gpu, 3, DType> &out,
     num_classes, num_anchors, threshold, clip,
     variances[0], variances[1], variances[2], variances[3],
     nms_threshold, force_suppress, nms_topk);
-  MULTIBOX_DETECTION_CUDA_CHECK(cudaPeekAtLastError());
+  MULTIBOX_DETECTION_CUDA_CHECK(cudaGetLastError());
 }
 }  // namespace mshadow
 

diff --git a/src/operator/contrib/multibox_prior.cu b/src/operator/contrib/multibox_prior.cu
@@ -90,7 +90,7 @@ inline void MultiBoxPriorForward(const Tensor<gpu, 2, DType> &out,
       sizes[i], ratio, in_width, in_height, step_x, step_y, offset_y, offset_x, stride, offset);
     ++offset;
   }
-  MULTIBOXPRIOR_CUDA_CHECK(cudaPeekAtLastError());
+  MULTIBOXPRIOR_CUDA_CHECK(cudaGetLastError());
 
   // size = sizes[0], various ratios
   for (int j = 1; j < num_ratios; ++j) {
@@ -99,7 +99,7 @@ inline void MultiBoxPriorForward(const Tensor<gpu, 2, DType> &out,
        offset_y, offset_x, stride, offset);
     ++offset;
   }
-  MULTIBOXPRIOR_CUDA_CHECK(cudaPeekAtLastError());
+  MULTIBOXPRIOR_CUDA_CHECK(cudaGetLastError());
 }
 }  // namespace mshadow