diff --git a/.github/workflows/os_x_staticbuild.yml b/.github/workflows/os_x_staticbuild.yml index 6727b22c8baf..7d4b479f08b4 100644 --- a/.github/workflows/os_x_staticbuild.yml +++ b/.github/workflows/os_x_staticbuild.yml @@ -22,4 +22,6 @@ jobs: python3 -m pip install --user -e python - name: Test project run: | - python3 -m pytest --durations=50 --verbose tests/python/unittest/ -k 'not (test_subgraph or test_custom_op or test_recordimage_dataset_with_data_loader_multiworker or test_multi_worker or test_multi_worker_shape or test_multi_worker_forked_data_loader or test_multi_worker_dataloader_release_pool)' + python3 -m pytest -n 4 --durations=50 --verbose tests/python/unittest/ -k 'not test_operator and not (test_subgraph or test_custom_op or test_recordimage_dataset_with_data_loader_multiworker or test_multi_worker or test_multi_worker_shape or test_multi_worker_forked_data_loader or test_multi_worker_dataloader_release_pool)' -m 'not serial' + MXNET_ENGINE_TYPE=NaiveEngine python3 -m pytest -n 4 --durations=50 --verbose tests/python/unittest/ -k 'test_operator and not (test_subgraph or test_custom_op or test_recordimage_dataset_with_data_loader_multiworker or test_multi_worker or test_multi_worker_shape or test_multi_worker_forked_data_loader or test_multi_worker_dataloader_release_pool)' -m 'not serial' + python3 -m pytest --durations=50 --verbose tests/python/unittest/ -k 'not (test_subgraph or test_custom_op or test_recordimage_dataset_with_data_loader_multiworker or test_multi_worker or test_multi_worker_shape or test_multi_worker_forked_data_loader or test_multi_worker_dataloader_release_pool)' -m 'serial' diff --git a/3rdparty/mshadow/mshadow/base.h b/3rdparty/mshadow/mshadow/base.h old mode 100755 new mode 100644 index b3d917d89c06..845ed35cf24f --- a/3rdparty/mshadow/mshadow/base.h +++ b/3rdparty/mshadow/mshadow/base.h @@ -251,7 +251,7 @@ extern "C" { if (e == cudaErrorCudartUnloading) { \ throw dmlc::Error(cudaGetErrorString(e)); \ } \ - CHECK(e == cudaSuccess) \ + CHECK_EQ(e, cudaSuccess) \ << "CUDA: " << cudaGetErrorString(e); \ } diff --git a/3rdparty/mshadow/mshadow/cuda/tensor_gpu-inl.cuh b/3rdparty/mshadow/mshadow/cuda/tensor_gpu-inl.cuh old mode 100755 new mode 100644 index a64d786f0a6d..02a74b2ad46f --- a/3rdparty/mshadow/mshadow/cuda/tensor_gpu-inl.cuh +++ b/3rdparty/mshadow/mshadow/cuda/tensor_gpu-inl.cuh @@ -35,7 +35,7 @@ #define MSHADOW_CUDA_POST_KERNEL_CHECK(x) \ /* Code block avoids redefinition of cudaError_t err */ \ do { \ - cudaError err = cudaPeekAtLastError(); \ + cudaError err = cudaGetLastError(); \ CHECK_EQ(err, cudaSuccess) << "Name: " << #x << " ErrStr:" << cudaGetErrorString(err); \ } while (0) namespace mshadow { diff --git a/3rdparty/mshadow/mshadow/half2.h b/3rdparty/mshadow/mshadow/half2.h old mode 100755 new mode 100644 diff --git a/3rdparty/mshadow/mshadow/tensor.h b/3rdparty/mshadow/mshadow/tensor.h old mode 100755 new mode 100644 diff --git a/3rdparty/mshadow/mshadow/tensor_cpu-inl.h b/3rdparty/mshadow/mshadow/tensor_cpu-inl.h old mode 100755 new mode 100644 diff --git a/3rdparty/mshadow/mshadow/tensor_gpu-inl.h b/3rdparty/mshadow/mshadow/tensor_gpu-inl.h old mode 100755 new mode 100644 diff --git a/tests/python/unittest/test_metric_perf.py b/benchmark/python/metric/benchmark_metric.py similarity index 100% rename from tests/python/unittest/test_metric_perf.py rename to benchmark/python/metric/benchmark_metric.py diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh index 0cbba5786993..37c9a42233ab 100755 --- 
a/ci/docker/runtime_functions.sh +++ b/ci/docker/runtime_functions.sh @@ -944,11 +944,12 @@ cd_unittest_ubuntu() { export MXNET_ENABLE_CYTHON=0 export CD_JOB=1 # signal this is a CD run so any unecessary tests can be skipped export DMLC_LOG_STACK_TRACE_DEPTH=10 - export MXNET_GPU_MEM_POOL_TYPE=Unpooled local mxnet_variant=${1:?"This function requires a mxnet variant as the first argument"} - pytest -m 'not serial' -n 4 --durations=50 --verbose tests/python/unittest + pytest -m 'not serial' -k 'not test_operator' -n 4 --durations=50 --verbose tests/python/unittest + MXNET_ENGINE_TYPE=NaiveEngine \ + pytest -m 'not serial' -k 'test_operator' -n 4 --durations=50 --verbose tests/python/unittest pytest -m 'serial' --durations=50 --verbose tests/python/unittest pytest -n 4 --durations=50 --verbose tests/python/quantization @@ -958,11 +959,16 @@ cd_unittest_ubuntu() { # fi if [[ ${mxnet_variant} = cu* ]]; then - pytest -m 'not serial' -n 4 --durations=50 --verbose tests/python/gpu + MXNET_GPU_MEM_POOL_TYPE=Unpooled \ + MXNET_ENGINE_TYPE=NaiveEngine \ + pytest -m 'not serial' -k 'test_operator' -n 4 --durations=50 --verbose tests/python/gpu + MXNET_GPU_MEM_POOL_TYPE=Unpooled \ + pytest -m 'not serial' -k 'not test_operator' -n 4 --durations=50 --verbose tests/python/gpu pytest -m 'serial' --durations=50 --verbose tests/python/gpu # Adding these here as CI doesn't test all CUDA environments - pytest -n 4 example/image-classification/test_score.py + MXNET_GPU_MEM_POOL_TYPE=Unpooled \ + pytest -n 4 example/image-classification/test_score.py # TODO(szha): fix and reenable the hanging issue. tracked in #18098 # integrationtest_ubuntu_gpu_dist_kvstore fi @@ -980,11 +986,26 @@ unittest_ubuntu_python3_cpu() { export MXNET_SUBGRAPH_VERBOSE=0 export MXNET_ENABLE_CYTHON=0 export DMLC_LOG_STACK_TRACE_DEPTH=10 - pytest -m 'not serial' -n 4 --durations=50 --cov-report xml:tests_unittest.xml --verbose tests/python/unittest + pytest -m 'not serial' -k 'not test_operator' -n 4 --durations=50 --cov-report xml:tests_unittest.xml --verbose tests/python/unittest + MXNET_ENGINE_TYPE=NaiveEngine \ + pytest -m 'not serial' -k 'test_operator' -n 4 --durations=50 --cov-report xml:tests_unittest.xml --cov-append --verbose tests/python/unittest pytest -m 'serial' --durations=50 --cov-report xml:tests_unittest.xml --cov-append --verbose tests/python/unittest pytest -n 4 --durations=50 --cov-report xml:tests_quantization.xml --verbose tests/python/quantization } +unittest_ubuntu_python3_cpu_serial() { + # TODO(szha): delete this and switch to unittest_ubuntu_python3_cpu once #18244 is fixed + set -ex + export PYTHONPATH=./python/ + export MXNET_MKLDNN_DEBUG=0 # Ignored if not present + export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 + export MXNET_SUBGRAPH_VERBOSE=0 + export MXNET_ENABLE_CYTHON=0 + export DMLC_LOG_STACK_TRACE_DEPTH=10 + pytest --durations=50 --cov-report xml:tests_unittest.xml --verbose tests/python/unittest + pytest --durations=50 --cov-report xml:tests_quantization.xml --verbose tests/python/quantization +} + unittest_ubuntu_python3_cpu_mkldnn() { set -ex export PYTHONPATH=./python/ @@ -993,9 +1014,9 @@ unittest_ubuntu_python3_cpu_mkldnn() { export MXNET_SUBGRAPH_VERBOSE=0 export MXNET_ENABLE_CYTHON=0 export DMLC_LOG_STACK_TRACE_DEPTH=10 - pytest -m 'not serial' -n 4 --durations=50 --cov-report xml:tests_unittest.xml --verbose tests/python/unittest - pytest -m 'serial' --durations=50 --cov-report xml:tests_unittest.xml --cov-append --verbose tests/python/unittest - pytest -n 4 --durations=50 --cov-report 
xml:tests_mkl.xml --verbose tests/python/mkl + # TODO(szha): enable parallel testing and naive engine for ops once #18244 is fixed + pytest --durations=50 --cov-report xml:tests_unittest.xml --verbose tests/python/unittest + pytest --durations=50 --cov-report xml:tests_mkl.xml --verbose tests/python/mkl } unittest_ubuntu_python3_gpu() { @@ -1007,8 +1028,11 @@ unittest_ubuntu_python3_gpu() { export CUDNN_VERSION=${CUDNN_VERSION:-7.0.3} export MXNET_ENABLE_CYTHON=0 export DMLC_LOG_STACK_TRACE_DEPTH=10 - export MXNET_GPU_MEM_POOL_TYPE=Unpooled - pytest -m 'not serial' -n 4 --durations=50 --cov-report xml:tests_gpu.xml --verbose tests/python/gpu + MXNET_GPU_MEM_POOL_TYPE=Unpooled \ + pytest -m 'not serial' -k 'not test_operator' -n 4 --durations=50 --cov-report xml:tests_gpu.xml --verbose tests/python/gpu + MXNET_GPU_MEM_POOL_TYPE=Unpooled \ + MXNET_ENGINE_TYPE=NaiveEngine \ + pytest -m 'not serial' -k 'test_operator' -n 4 --durations=50 --cov-report xml:tests_gpu.xml --cov-append --verbose tests/python/gpu pytest -m 'serial' --durations=50 --cov-report xml:tests_gpu.xml --cov-append --verbose tests/python/gpu } @@ -1022,9 +1046,12 @@ unittest_ubuntu_python3_gpu_cython() { export MXNET_ENABLE_CYTHON=1 export MXNET_ENFORCE_CYTHON=1 export DMLC_LOG_STACK_TRACE_DEPTH=10 - export MXNET_GPU_MEM_POOL_TYPE=Unpooled check_cython - pytest -m 'not serial' -n 4 --durations=50 --cov-report xml:tests_gpu.xml --verbose tests/python/gpu + MXNET_GPU_MEM_POOL_TYPE=Unpooled \ + pytest -m 'not serial' -k 'not test_operator' -n 4 --durations=50 --cov-report xml:tests_gpu.xml --verbose tests/python/gpu + MXNET_GPU_MEM_POOL_TYPE=Unpooled \ + MXNET_ENGINE_TYPE=NaiveEngine \ + pytest -m 'not serial' -k 'test_operator' -n 4 --durations=50 --cov-report xml:tests_gpu.xml --cov-append --verbose tests/python/gpu pytest -m 'serial' --durations=50 --cov-report xml:tests_gpu.xml --cov-append --verbose tests/python/gpu } @@ -1036,8 +1063,11 @@ unittest_ubuntu_python3_gpu_nocudnn() { export CUDNN_OFF_TEST_ONLY=true export MXNET_ENABLE_CYTHON=0 export DMLC_LOG_STACK_TRACE_DEPTH=10 - export MXNET_GPU_MEM_POOL_TYPE=Unpooled - pytest -m 'not serial' -n 4 --durations=50 --cov-report xml:tests_gpu.xml --verbose tests/python/gpu + MXNET_GPU_MEM_POOL_TYPE=Unpooled \ + pytest -m 'not serial' -k 'not test_operator' -n 4 --durations=50 --cov-report xml:tests_gpu.xml --verbose tests/python/gpu + MXNET_GPU_MEM_POOL_TYPE=Unpooled \ + MXNET_ENGINE_TYPE=NaiveEngine \ + pytest -m 'not serial' -k 'test_operator' -n 4 --durations=50 --cov-report xml:tests_gpu.xml --cov-append --verbose tests/python/gpu pytest -m 'serial' --durations=50 --cov-report xml:tests_gpu.xml --cov-append --verbose tests/python/gpu } @@ -1050,9 +1080,9 @@ unittest_ubuntu_tensorrt_gpu() { export CUDNN_VERSION=${CUDNN_VERSION:-7.0.3} export MXNET_ENABLE_CYTHON=0 export DMLC_LOG_STACK_TRACE_DEPTH=10 - export MXNET_GPU_MEM_POOL_TYPE=Unpooled python3 tests/python/tensorrt/lenet5_train.py - pytest -n 4 --durations=50 --cov-report xml:tests_trt_gpu.xml --verbose --capture=no tests/python/tensorrt/test_ops.py + MXNET_GPU_MEM_POOL_TYPE=Unpooled \ + pytest -n 4 --durations=50 --cov-report xml:tests_trt_gpu.xml --verbose --capture=no tests/python/tensorrt/test_ops.py pytest -k 'not test_ops' --durations=50 --cov-report xml:tests_trt_gpu.xml --cov-append --verbose --capture=no tests/python/tensorrt/ } @@ -1070,8 +1100,8 @@ unittest_ubuntu_python3_quantization_gpu() { export CUDNN_VERSION=${CUDNN_VERSION:-7.0.3} export MXNET_ENABLE_CYTHON=0 export DMLC_LOG_STACK_TRACE_DEPTH=10 - 
export MXNET_GPU_MEM_POOL_TYPE=Unpooled - pytest -n 4 --durations=50 --cov-report xml:tests_quantization_gpu.xml --verbose tests/python/quantization_gpu + MXNET_GPU_MEM_POOL_TYPE=Unpooled \ + pytest -n 4 --durations=50 --cov-report xml:tests_quantization_gpu.xml --verbose tests/python/quantization_gpu } unittest_centos7_cpu_scala() { @@ -1213,7 +1243,9 @@ unittest_centos7_cpu() { set -ex source /opt/rh/rh-python36/enable cd /work/mxnet - python -m pytest -m 'not serial' -n 4 --durations=50 --cov-report xml:tests_unittest.xml --verbose tests/python/unittest + python -m pytest -m 'not serial' -k 'not test_operator' -n 4 --durations=50 --cov-report xml:tests_unittest.xml --verbose tests/python/unittest + MXNET_ENGINE_TYPE=NaiveEngine \ + python -m pytest -m 'not serial' -k 'test_operator' -n 4 --durations=50 --cov-report xml:tests_unittest.xml --cov-append --verbose tests/python/unittest python -m pytest -m 'serial' --durations=50 --cov-report xml:tests_unittest.xml --cov-append --verbose tests/python/unittest python -m pytest -n 4 --durations=50 --cov-report xml:tests_train.xml --verbose tests/python/train } @@ -1224,8 +1256,11 @@ unittest_centos7_gpu() { cd /work/mxnet export CUDNN_VERSION=${CUDNN_VERSION:-7.0.3} export DMLC_LOG_STACK_TRACE_DEPTH=10 - export MXNET_GPU_MEM_POOL_TYPE=Unpooled - pytest -m 'not serial' -n 4 --durations=50 --cov-report xml:tests_gpu.xml --verbose tests/python/gpu + MXNET_GPU_MEM_POOL_TYPE=Unpooled \ + pytest -m 'not serial' -k 'not test_operator' -n 4 --durations=50 --cov-report xml:tests_gpu.xml --cov-append --verbose tests/python/gpu + MXNET_GPU_MEM_POOL_TYPE=Unpooled \ + MXNET_ENGINE_TYPE=NaiveEngine \ + pytest -m 'not serial' -k 'test_operator' -n 4 --durations=50 --cov-report xml:tests_gpu.xml --cov-append --verbose tests/python/gpu pytest -m 'serial' --durations=50 --cov-report xml:tests_gpu.xml --cov-append --verbose tests/python/gpu } @@ -1342,7 +1377,9 @@ test_ubuntu_cpu_python3() { cd /work/mxnet/python pip3 install -e . 
cd /work/mxnet - python3 -m pytest -m 'not serial' -n 4 --durations=50 --verbose tests/python/unittest + python3 -m pytest -m 'not serial' -k 'not test_operator' -n 4 --durations=50 --verbose tests/python/unittest + MXNET_ENGINE_TYPE=NaiveEngine \ + python3 -m pytest -m 'not serial' -k 'test_operator' -n 4 --durations=50 --verbose tests/python/unittest python3 -m pytest -m 'serial' --durations=50 --verbose tests/python/unittest popd diff --git a/ci/jenkins/Jenkins_steps.groovy b/ci/jenkins/Jenkins_steps.groovy index 20024d2e47c7..8c8e0da979df 100644 --- a/ci/jenkins/Jenkins_steps.groovy +++ b/ci/jenkins/Jenkins_steps.groovy @@ -50,6 +50,12 @@ def python3_ut(docker_container_name) { } } +def python3_ut_serial(docker_container_name) { + timeout(time: max_time, unit: 'MINUTES') { + utils.docker_run(docker_container_name, 'unittest_ubuntu_python3_cpu_serial', false) + } +} + def python3_ut_mkldnn(docker_container_name) { timeout(time: max_time, unit: 'MINUTES') { utils.docker_run(docker_container_name, 'unittest_ubuntu_python3_cpu_mkldnn', false) @@ -803,7 +809,7 @@ def test_unix_python3_mkl_cpu(lib_name) { ws('workspace/ut-python3-cpu') { try { utils.unpack_and_init(lib_name, mx_lib, true) - python3_ut('ubuntu_cpu') + python3_ut_serial('ubuntu_cpu') utils.publish_test_coverage() } finally { utils.collect_test_results_unix('tests_unittest.xml', 'tests_python3_cpu_unittest.xml') diff --git a/src/engine/naive_engine.cc b/src/engine/naive_engine.cc index e76003a8dca9..5e51199e3304 100644 --- a/src/engine/naive_engine.cc +++ b/src/engine/naive_engine.cc @@ -183,6 +183,7 @@ class NaiveEngine final : public Engine { if (exec_ctx.dev_mask() == gpu::kDevMask) { #if MXNET_USE_CUDA size_t dev_id = static_cast(exec_ctx.dev_id); + cudaGetLastError(); // reset cuda error MSHADOW_CATCH_ERROR(mshadow::SetDevice(exec_ctx.dev_id)); if (streams_.size() <= dev_id) { streams_.resize(dev_id + 1, nullptr); diff --git a/src/operator/bilinear_sampler.cu b/src/operator/bilinear_sampler.cu index 42262e19afc6..e8b1ce68847f 100644 --- a/src/operator/bilinear_sampler.cu +++ b/src/operator/bilinear_sampler.cu @@ -178,7 +178,7 @@ inline void BilinearSamplerForward(const Tensor &output, cuda::BilinearSamplerForwardKernel << > >( i_c, i_h, i_w, data, grid, o_n, o_c, o_h, o_w, out); // post kernel check - cudaError err = cudaPeekAtLastError(); + cudaError err = cudaGetLastError(); CHECK_EQ(err, cudaSuccess) << cudaGetErrorString(err); } @@ -217,7 +217,7 @@ inline void BilinearSamplerBackward(const Tensor &input_grad, }); }); // post kernel check - cudaError err = cudaPeekAtLastError(); + cudaError err = cudaGetLastError(); CHECK_EQ(err, cudaSuccess) << cudaGetErrorString(err); } diff --git a/src/operator/contrib/deformable_psroi_pooling.cu b/src/operator/contrib/deformable_psroi_pooling.cu index 50ad178e6e68..62680d1fb8d1 100644 --- a/src/operator/contrib/deformable_psroi_pooling.cu +++ b/src/operator/contrib/deformable_psroi_pooling.cu @@ -183,7 +183,7 @@ namespace cuda { no_trans, trans_std, sample_per_part, output_dim, group_size, part_size, num_classes, channels_each_class, top_data, top_count_data); - DeformablePSROIPOOLING_CUDA_CHECK(cudaPeekAtLastError()); + DeformablePSROIPOOLING_CUDA_CHECK(cudaGetLastError()); } @@ -354,7 +354,7 @@ namespace cuda { bottom_data, bottom_rois, bottom_trans, no_trans, trans_std, sample_per_part, group_size, part_size, num_classes, channels_each_class); - DeformablePSROIPOOLING_CUDA_CHECK(cudaPeekAtLastError()); + DeformablePSROIPOOLING_CUDA_CHECK(cudaGetLastError()); } } // namespace cuda 
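The changes up to this point split the Python operator tests out of the regular CI invocations: -k 'test_operator' selects them, and they run under MXNET_ENGINE_TYPE=NaiveEngine while the remaining tests keep the default threaded engine. In the same spirit, the CUDA post-kernel checks move from cudaPeekAtLastError() to cudaGetLastError(), which returns the last error and also clears the sticky per-thread error state, matching the explicit cudaGetLastError(); // reset cuda error call added to src/engine/naive_engine.cc above, so that one failed kernel does not poison every subsequent check. Tests that depend on asynchronous execution are guarded with an environment check; the following is a minimal, self-contained sketch of that guard, mirroring the pytest.mark.skipif guards added to test_exc_handling.py and test_operator_gpu.py further below (the test name and body are illustrative, not part of this patch).

# Sketch only: skip a test that assumes asynchronous execution when the
# suite is run under the naive (synchronous) engine. The guard condition
# matches the one used in this patch; the test itself is hypothetical.
import os

import mxnet as mx
import pytest


@pytest.mark.skipif(os.environ.get('MXNET_ENGINE_TYPE') == 'NaiveEngine',
                    reason="This test assumes asynchronous execution.")
def test_assumes_async_engine():
    x = mx.nd.ones((2, 2))
    y = x + 1                        # queued on the engine; may execute asynchronously
    assert (y.asnumpy() == 2).all()  # asnumpy() is the synchronization point

The CI scripts then drive both configurations explicitly, for example MXNET_ENGINE_TYPE=NaiveEngine pytest -m 'not serial' -k 'test_operator' -n 4 ... for the operator subset and a plain pytest invocation with -k 'not test_operator' for everything else.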
diff --git a/src/operator/contrib/multi_proposal.cu b/src/operator/contrib/multi_proposal.cu index 6ae886160d5d..a3f7d405e452 100644 --- a/src/operator/contrib/multi_proposal.cu +++ b/src/operator/contrib/multi_proposal.cu @@ -348,7 +348,7 @@ void _nms(mshadow::Stream *s, nms_overlap_thresh, boxes_dev, mask_dev); - FRCNN_CUDA_CHECK(cudaPeekAtLastError()); + FRCNN_CUDA_CHECK(cudaGetLastError()); std::vector mask_host(boxes_num * col_blocks); cudaStream_t stream = mshadow::Stream::GetStream(s); @@ -494,7 +494,7 @@ class MultiProposalGPUOp : public Operator{ ProposalGridKernel<<>>( count, num_anchors, height, width, param_.feature_stride, scores.dptr_, workspace_proposals.dptr_); - FRCNN_CUDA_CHECK(cudaPeekAtLastError()); + FRCNN_CUDA_CHECK(cudaGetLastError()); // Transform anchors and bbox_deltas into bboxes CheckLaunchParam(dimGrid, dimBlock, "BBoxPred"); @@ -507,13 +507,13 @@ class MultiProposalGPUOp : public Operator{ count, num_anchors, height, width, param_.feature_stride, im_info.dptr_, workspace_proposals.dptr_, bbox_deltas.dptr_, workspace_proposals.dptr_); } - FRCNN_CUDA_CHECK(cudaPeekAtLastError()); + FRCNN_CUDA_CHECK(cudaGetLastError()); // filter boxes with less than rpn_min_size CheckLaunchParam(dimGrid, dimBlock, "FilterBox"); FilterBoxKernel<<>>( count, count_anchors, param_.rpn_min_size, im_info.dptr_, workspace_proposals.dptr_); - FRCNN_CUDA_CHECK(cudaPeekAtLastError()); + FRCNN_CUDA_CHECK(cudaGetLastError()); @@ -541,7 +541,7 @@ class MultiProposalGPUOp : public Operator{ CopyScoreKernel << > >( count_anchors, workspace_proposals.dptr_ + b * count_anchors * 5, score.dptr_, order.dptr_); - FRCNN_CUDA_CHECK(cudaPeekAtLastError()); + FRCNN_CUDA_CHECK(cudaGetLastError()); // argsort score, save order thrust::stable_sort_by_key(thrust::device, @@ -549,7 +549,7 @@ class MultiProposalGPUOp : public Operator{ score.dptr_ + score.size(0), order.dptr_, thrust::greater()); - FRCNN_CUDA_CHECK(cudaPeekAtLastError()); + FRCNN_CUDA_CHECK(cudaGetLastError()); // Reorder proposals according to order @@ -558,7 +558,7 @@ class MultiProposalGPUOp : public Operator{ ReorderProposalsKernel << > >( rpn_pre_nms_top_n, workspace_proposals.dptr_ + b * count_anchors * 5, order.dptr_, workspace_ordered_proposals.dptr_); - FRCNN_CUDA_CHECK(cudaPeekAtLastError()); + FRCNN_CUDA_CHECK(cudaGetLastError()); // perform nms std::vector _keep(workspace_ordered_proposals.size(0)); @@ -580,7 +580,7 @@ class MultiProposalGPUOp : public Operator{ param_.rpn_post_nms_top_n, workspace_ordered_proposals.dptr_, keep, out_size, b, out.dptr_ + b * param_.rpn_post_nms_top_n * 5, out_score.dptr_ + b * param_.rpn_post_nms_top_n); - FRCNN_CUDA_CHECK(cudaPeekAtLastError()); + FRCNN_CUDA_CHECK(cudaGetLastError()); } // free temporary memory FRCNN_CUDA_CHECK(cudaFree(keep)); diff --git a/src/operator/contrib/multibox_detection.cu b/src/operator/contrib/multibox_detection.cu index bc02834b4584..cd2afd243aec 100644 --- a/src/operator/contrib/multibox_detection.cu +++ b/src/operator/contrib/multibox_detection.cu @@ -230,7 +230,7 @@ inline void MultiBoxDetectionForward(const Tensor &out, num_classes, num_anchors, threshold, clip, variances[0], variances[1], variances[2], variances[3], nms_threshold, force_suppress, nms_topk); - MULTIBOX_DETECTION_CUDA_CHECK(cudaPeekAtLastError()); + MULTIBOX_DETECTION_CUDA_CHECK(cudaGetLastError()); } } // namespace mshadow diff --git a/src/operator/contrib/multibox_prior.cu b/src/operator/contrib/multibox_prior.cu index f2b60a3b6237..4c3ebb016219 100644 --- 
a/src/operator/contrib/multibox_prior.cu +++ b/src/operator/contrib/multibox_prior.cu @@ -90,7 +90,7 @@ inline void MultiBoxPriorForward(const Tensor &out, sizes[i], ratio, in_width, in_height, step_x, step_y, offset_y, offset_x, stride, offset); ++offset; } - MULTIBOXPRIOR_CUDA_CHECK(cudaPeekAtLastError()); + MULTIBOXPRIOR_CUDA_CHECK(cudaGetLastError()); // size = sizes[0], various ratios for (int j = 1; j < num_ratios; ++j) { @@ -99,7 +99,7 @@ inline void MultiBoxPriorForward(const Tensor &out, offset_y, offset_x, stride, offset); ++offset; } - MULTIBOXPRIOR_CUDA_CHECK(cudaPeekAtLastError()); + MULTIBOXPRIOR_CUDA_CHECK(cudaGetLastError()); } } // namespace mshadow diff --git a/src/operator/contrib/multibox_target.cu b/src/operator/contrib/multibox_target.cu index adec904e17f3..ffa1a132c419 100644 --- a/src/operator/contrib/multibox_target.cu +++ b/src/operator/contrib/multibox_target.cu @@ -368,7 +368,7 @@ inline void MultiBoxTargetForward(const Tensor &loc_target, cuda::CheckLaunchParam(init_block_dim, init_thread_dim, "MultiBoxTarget Init"); cuda::InitGroundTruthFlags<<>>( gt_flags, labels.dptr_, num_batches, num_labels, label_width); - MULTIBOX_TARGET_CUDA_CHECK(cudaPeekAtLastError()); + MULTIBOX_TARGET_CUDA_CHECK(cudaGetLastError()); // compute best matches temp_space[2] = -1.f; @@ -379,14 +379,14 @@ inline void MultiBoxTargetForward(const Tensor &loc_target, cuda::CheckLaunchParam(num_batches, num_threads, "MultiBoxTarget Matching"); cuda::FindBestMatches<<>>(best_matches, gt_flags, anchor_flags, overlaps, num_anchors, num_labels); - MULTIBOX_TARGET_CUDA_CHECK(cudaPeekAtLastError()); + MULTIBOX_TARGET_CUDA_CHECK(cudaGetLastError()); // find good matches with overlap > threshold if (overlap_threshold > 0) { cuda::FindGoodMatches<<>>(best_matches, anchor_flags, overlaps, num_anchors, num_labels, overlap_threshold); - MULTIBOX_TARGET_CUDA_CHECK(cudaPeekAtLastError()); + MULTIBOX_TARGET_CUDA_CHECK(cudaGetLastError()); } // do negative mining or not @@ -398,20 +398,20 @@ inline void MultiBoxTargetForward(const Tensor &loc_target, cls_preds.dptr_, anchor_flags, buffer, negative_mining_ratio, negative_mining_thresh, minimum_negative_samples, num_anchors, num_labels, num_classes); - MULTIBOX_TARGET_CUDA_CHECK(cudaPeekAtLastError()); + MULTIBOX_TARGET_CUDA_CHECK(cudaGetLastError()); } else { int num_blocks = (num_batches * num_anchors - 1) / num_threads + 1; cuda::CheckLaunchParam(num_blocks, num_threads, "MultiBoxTarget Negative"); cuda::UseAllNegatives<<>>(anchor_flags, num_batches * num_anchors); - MULTIBOX_TARGET_CUDA_CHECK(cudaPeekAtLastError()); + MULTIBOX_TARGET_CUDA_CHECK(cudaGetLastError()); } cuda::AssignTrainigTargets<<>>( loc_target.dptr_, loc_mask.dptr_, cls_target.dptr_, anchor_flags, best_matches, labels.dptr_, anchors.dptr_, num_anchors, num_labels, label_width, variances[0], variances[1], variances[2], variances[3]); - MULTIBOX_TARGET_CUDA_CHECK(cudaPeekAtLastError()); + MULTIBOX_TARGET_CUDA_CHECK(cudaGetLastError()); } } // namespace mshadow diff --git a/src/operator/contrib/proposal.cu b/src/operator/contrib/proposal.cu index 9d90398db364..dc45b3cdef7b 100644 --- a/src/operator/contrib/proposal.cu +++ b/src/operator/contrib/proposal.cu @@ -329,7 +329,7 @@ void _nms(mshadow::Stream *s, nms_overlap_thresh, boxes_dev, mask_dev); - FRCNN_CUDA_CHECK(cudaPeekAtLastError()); + FRCNN_CUDA_CHECK(cudaGetLastError()); std::vector mask_host(boxes_num * col_blocks); cudaStream_t stream = mshadow::Stream::GetStream(s); FRCNN_CUDA_CHECK(cudaMemcpyAsync(&mask_host[0], @@ -471,7 +471,7 @@ 
class ProposalGPUOp : public Operator{ ProposalGridKernel<<>>( count, num_anchors, height, width, param_.feature_stride, scores.dptr_, workspace_proposals.dptr_); - FRCNN_CUDA_CHECK(cudaPeekAtLastError()); + FRCNN_CUDA_CHECK(cudaGetLastError()); // im_info is small, we want to copy them to cpu std::vector cpu_im_info(3); @@ -499,13 +499,13 @@ class ProposalGPUOp : public Operator{ cpu_im_info[0], cpu_im_info[1], workspace_proposals.dptr_, bbox_deltas.dptr_, workspace_proposals.dptr_); } - FRCNN_CUDA_CHECK(cudaPeekAtLastError()); + FRCNN_CUDA_CHECK(cudaGetLastError()); // filter boxes with less than rpn_min_size CheckLaunchParam(dimGrid, dimBlock, "FilterBox"); FilterBoxKernel<<>>( count, param_.rpn_min_size * cpu_im_info[2], workspace_proposals.dptr_); - FRCNN_CUDA_CHECK(cudaPeekAtLastError()); + FRCNN_CUDA_CHECK(cudaGetLastError()); // Copy score to a continuous memory float* score_ptr = nullptr; @@ -518,7 +518,7 @@ class ProposalGPUOp : public Operator{ CheckLaunchParam(dimGrid, dimBlock, "CopyScore"); CopyScoreKernel<<>>( count, workspace_proposals.dptr_, score.dptr_, order.dptr_); - FRCNN_CUDA_CHECK(cudaPeekAtLastError()); + FRCNN_CUDA_CHECK(cudaGetLastError()); // argsort score, save order thrust::stable_sort_by_key(thrust::device, @@ -526,7 +526,7 @@ class ProposalGPUOp : public Operator{ score.dptr_ + score.size(0), order.dptr_, thrust::greater()); - FRCNN_CUDA_CHECK(cudaPeekAtLastError()); + FRCNN_CUDA_CHECK(cudaGetLastError()); // Reorder proposals according to order float* workspace_ordered_proposals_ptr = nullptr; @@ -539,7 +539,7 @@ class ProposalGPUOp : public Operator{ CheckLaunchParam(dimGrid, dimBlock, "ReorderProposals"); ReorderProposalsKernel<<>>( rpn_pre_nms_top_n, workspace_proposals.dptr_, order.dptr_, workspace_ordered_proposals.dptr_); - FRCNN_CUDA_CHECK(cudaPeekAtLastError()); + FRCNN_CUDA_CHECK(cudaGetLastError()); FRCNN_CUDA_CHECK(cudaFree(workspace_proposals_ptr)); FRCNN_CUDA_CHECK(cudaFree(score_ptr)); @@ -566,7 +566,7 @@ class ProposalGPUOp : public Operator{ PrepareOutput<<>>( param_.rpn_post_nms_top_n, workspace_ordered_proposals.dptr_, keep, out_size, out.dptr_, out_score.dptr_); - FRCNN_CUDA_CHECK(cudaPeekAtLastError()); + FRCNN_CUDA_CHECK(cudaGetLastError()); // free temporary memory FRCNN_CUDA_CHECK(cudaFree(keep)); diff --git a/src/operator/contrib/psroi_pooling.cu b/src/operator/contrib/psroi_pooling.cu index c5f229148aff..8765eb95b72e 100644 --- a/src/operator/contrib/psroi_pooling.cu +++ b/src/operator/contrib/psroi_pooling.cu @@ -134,7 +134,7 @@ inline void PSROIPoolForward(const Tensor &out, kBaseThreadNum, 0, stream >> >( count, bottom_data, spatial_scale, channels, height, width, pooled_height, pooled_width, bottom_rois, output_dim_, group_size_, top_data); - PSROIPOOLING_CUDA_CHECK(cudaPeekAtLastError()); + PSROIPOOLING_CUDA_CHECK(cudaGetLastError()); } @@ -231,7 +231,7 @@ inline void PSROIPoolBackwardAcc(const Tensor &in_grad, kBaseThreadNum, 0, stream >> >( count, top_diff, num_rois, spatial_scale, channels, height, width, pooled_height, pooled_width, group_size_, output_dim_, bottom_diff, bottom_rois); - PSROIPOOLING_CUDA_CHECK(cudaPeekAtLastError()); + PSROIPOOLING_CUDA_CHECK(cudaGetLastError()); } } // namespace cuda diff --git a/src/operator/correlation.cu b/src/operator/correlation.cu index 117dc61af6bf..e0b1bd8625ec 100644 --- a/src/operator/correlation.cu +++ b/src/operator/correlation.cu @@ -476,7 +476,7 @@ void Forward_gpu( stride1_, stride2_, width, height, channels, rbot1, rbot2, top); - 
CORRELATION_CUDA_CHECK(cudaPeekAtLastError()); + CORRELATION_CUDA_CHECK(cudaGetLastError()); } else { // CorrelationLayer for (int n = 0; n < num; n++) { @@ -489,7 +489,7 @@ void Forward_gpu( max_displacement_, neighborhood_grid_radius_, neighborhood_grid_width_, kernel_radius_, stride1_, stride2_, width, height, channels, rbot1, rbot2, top); - CORRELATION_CUDA_CHECK(cudaPeekAtLastError()); + CORRELATION_CUDA_CHECK(cudaGetLastError()); } } } @@ -534,7 +534,7 @@ void Backward_gpu( stride1_, stride2_, width, height, paddedwidth, paddedheight, channels, bottomcount, pad_size_, bottom0_diff, rbot2, top_diff); - CORRELATION_CUDA_CHECK(cudaPeekAtLastError()); + CORRELATION_CUDA_CHECK(cudaGetLastError()); } // == Run kernel Backward 1 for (int n = 0; n < num; n++) { @@ -545,7 +545,7 @@ void Backward_gpu( stride1_, stride2_, width, height, paddedwidth, paddedheight, channels, bottomcount, pad_size_, rbot1, bottom1_diff, top_diff); - CORRELATION_CUDA_CHECK(cudaPeekAtLastError()); + CORRELATION_CUDA_CHECK(cudaGetLastError()); } } else { for (int n = 0; n < num; n++) { @@ -557,7 +557,7 @@ void Backward_gpu( stride1_, stride2_, width, height, paddedwidth, paddedheight, channels, bottomcount, pad_size_, bottom0_diff, rbot1, rbot2, top_diff); - CORRELATION_CUDA_CHECK(cudaPeekAtLastError()); + CORRELATION_CUDA_CHECK(cudaGetLastError()); } for (int n = 0; n < num; n++) { // Bottom1: @@ -568,7 +568,7 @@ void Backward_gpu( stride1_, stride2_, width, height, paddedwidth, paddedheight, channels, bottomcount, pad_size_, rbot1, rbot2, bottom1_diff, top_diff); - CORRELATION_CUDA_CHECK(cudaPeekAtLastError()); + CORRELATION_CUDA_CHECK(cudaGetLastError()); } } } diff --git a/src/operator/tensor/pseudo2DTranspose_op-inl.cuh b/src/operator/tensor/pseudo2DTranspose_op-inl.cuh index b3ca9fbfa0c9..5898c0bcf07c 100644 --- a/src/operator/tensor/pseudo2DTranspose_op-inl.cuh +++ b/src/operator/tensor/pseudo2DTranspose_op-inl.cuh @@ -165,7 +165,7 @@ inline void call_transpose_pseudo2D(index_t cTypeSize, default: LOG(FATAL) << "Unsupported type combination. " << "Copy type size = " << cTypeSize; } - auto cuErr = cudaPeekAtLastError(); + auto cuErr = cudaGetLastError(); CHECK_EQ(cuErr, cudaSuccess) << "TransposePseudo2D kernel failure: " << cudaGetErrorString(cuErr) << ". 
" << "block: (" << block.x << "," << block.y << "," << block.z << ")" diff --git a/src/runtime/c_runtime_api.cc b/src/runtime/c_runtime_api.cc index a78a5cdf5971..d0754ea6293c 100644 --- a/src/runtime/c_runtime_api.cc +++ b/src/runtime/c_runtime_api.cc @@ -41,7 +41,6 @@ using namespace mxnet::runtime; struct MXNetRuntimeEntry { std::string ret_str; - std::string last_error; MXNetByteArray ret_bytes; }; diff --git a/tests/nightly/estimator/test_estimator_cnn.py b/tests/nightly/estimator/test_estimator_cnn.py index 466c01019575..b3b0d536af24 100644 --- a/tests/nightly/estimator/test_estimator_cnn.py +++ b/tests/nightly/estimator/test_estimator_cnn.py @@ -30,7 +30,7 @@ # use with_seed decorator in python/unittest/common.py sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..', 'python', 'unittest')) from common import with_seed -import unittest +import pytest def load_data_mnist(batch_size, resize=None, num_workers=4): @@ -127,7 +127,7 @@ def test_estimator_cpu(): # using fixed seed to reduce flakiness in accuracy assertion @with_seed(7) -@unittest.skipIf(mx.context.num_gpus() < 1, "skip if no GPU") +@pytest.mark.skipif(mx.context.num_gpus() < 1, reason="skip if no GPU") def test_estimator_gpu(): ''' Test estimator by training resnet18_v1 for 5 epochs on MNIST and verify accuracy diff --git a/tests/nightly/estimator/test_sentiment_rnn.py b/tests/nightly/estimator/test_sentiment_rnn.py index 7d3561db3789..69380389d48e 100644 --- a/tests/nightly/estimator/test_sentiment_rnn.py +++ b/tests/nightly/estimator/test_sentiment_rnn.py @@ -35,7 +35,7 @@ # use with_seed decorator in python/unittest/common.py sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..', 'python', 'unittest')) from common import with_seed -import unittest +import pytest class TextCNN(nn.Block): @@ -243,7 +243,7 @@ def test_estimator_cpu(): # using fixed seed to reduce flakiness in accuracy assertion @with_seed(7) -@unittest.skipIf(mx.context.num_gpus() < 1, "skip if no GPU") +@pytest.mark.skipif(mx.context.num_gpus() < 1, reason="skip if no GPU") def test_estimator_gpu(): ''' Test estimator by training Bidirectional RNN for 5 epochs on the IMDB dataset diff --git a/tests/nightly/test_large_array.py b/tests/nightly/test_large_array.py index 4082c5ebc691..0c9c6f905dd6 100644 --- a/tests/nightly/test_large_array.py +++ b/tests/nightly/test_large_array.py @@ -28,7 +28,6 @@ from mxnet.test_utils import rand_ndarray, assert_almost_equal, rand_coord_2d, default_context, check_symbolic_forward, create_2d_tensor from mxnet import gluon, nd from common import with_seed -import unittest import pytest @@ -148,8 +147,8 @@ def np_softmax(x, axis=-1, temperature=1.0): x /= np.sum(x, axis=axis, keepdims=True) return x - @unittest.skip("log_softmax flaky, tracked at " - "https://github.com/apache/incubator-mxnet/issues/17397") + @pytest.mark.skip(reason="log_softmax flaky, tracked at " + "https://github.com/apache/incubator-mxnet/issues/17397") def check_log_softmax(): ndim = 2 shape = (SMALL_Y, LARGE_X) @@ -612,8 +611,8 @@ def check_ndarray_random_uniform(): a = nd.random.uniform(shape=(LARGE_X, SMALL_Y)) assert a[-1][0] != 0 - @unittest.skip("Randint flaky, tracked at " - "https://github.com/apache/incubator-mxnet/issues/16172") + @pytest.mark.skip(reason="Randint flaky, tracked at " + "https://github.com/apache/incubator-mxnet/issues/16172") @with_seed() def check_ndarray_random_randint(): a = nd.random.randint(100, 10000, shape=(LARGE_X, SMALL_Y)) @@ -826,8 +825,8 @@ def check_pick(): res = mx.nd.pick(a, b) 
assert res.shape == b.shape - @unittest.skip("Memory doesn't free up after stacked execution with other ops, " - "tracked at https://github.com/apache/incubator-mxnet/issues/17411") + @pytest.mark.skip(reason="Memory doesn't free up after stacked execution with other ops, " + "tracked at https://github.com/apache/incubator-mxnet/issues/17411") def check_depthtospace(): def numpy_depth_to_space(x, blocksize): b, c, h, w = x.shape[0], x.shape[1], x.shape[2], x.shape[3] @@ -845,8 +844,8 @@ def numpy_depth_to_space(x, blocksize): output = mx.nd.depth_to_space(data, 2) assert_almost_equal(output.asnumpy(), expected, atol=1e-3, rtol=1e-3) - @unittest.skip("Memory doesn't free up after stacked execution with other ops, " - "tracked at https://github.com/apache/incubator-mxnet/issues/17411") + @pytest.mark.skip(reason="Memory doesn't free up after stacked execution with other ops, " + "tracked at https://github.com/apache/incubator-mxnet/issues/17411") def check_spacetodepth(): def numpy_space_to_depth(x, blocksize): b, c, h, w = x.shape[0], x.shape[1], x.shape[2], x.shape[3] @@ -910,8 +909,8 @@ def check_unravel_index(): shape=(LARGE_X, SMALL_Y)) assert (indices_2d.asnumpy() == np.array(original_2d_indices)).all() - @unittest.skip("Memory doesn't free up after stacked execution with other ops, " + - "tracked at https://github.com/apache/incubator-mxnet/issues/17411") + @pytest.mark.skip(reason="Memory doesn't free up after stacked execution with other ops, " + + "tracked at https://github.com/apache/incubator-mxnet/issues/17411") def check_transpose(): check_dtypes = [np.float32, np.int64] for dtype in check_dtypes: @@ -921,16 +920,16 @@ def check_transpose(): ref_out = np.transpose(b.asnumpy()) assert_almost_equal(t.asnumpy(), ref_out, rtol=1e-10) - @unittest.skip("Memory doesn't free up after stacked execution with other ops, " + - "tracked at https://github.com/apache/incubator-mxnet/issues/17411") + @pytest.mark.skip(reason="Memory doesn't free up after stacked execution with other ops, " + + "tracked at https://github.com/apache/incubator-mxnet/issues/17411") def check_swapaxes(): b = create_2d_tensor(rows=LARGE_X, columns=SMALL_Y) t = nd.swapaxes(b, dim1=0, dim2=1) assert np.sum(t[:, -1].asnumpy() == (LARGE_X - 1)) == b.shape[1] assert t.shape == (SMALL_Y, LARGE_X) - @unittest.skip("Memory doesn't free up after stacked execution with other ops, " + - "tracked at https://github.com/apache/incubator-mxnet/issues/17411") + @pytest.mark.skip(reason="Memory doesn't free up after stacked execution with other ops, " + + "tracked at https://github.com/apache/incubator-mxnet/issues/17411") def check_flip(): b = create_2d_tensor(rows=LARGE_X, columns=SMALL_Y) t = nd.flip(b, axis=0) @@ -1227,16 +1226,16 @@ def check_argmin(): idx = mx.nd.argmin(a, axis=0) assert idx.shape[0] == SMALL_Y - @unittest.skip("Memory doesn't free up after stacked execution with other ops, " + - "tracked at https://github.com/apache/incubator-mxnet/issues/17411") + @pytest.mark.skip(reason="Memory doesn't free up after stacked execution with other ops, " + + "tracked at https://github.com/apache/incubator-mxnet/issues/17411") def check_argsort(): b = create_2d_tensor(rows=LARGE_X, columns=SMALL_Y) s = nd.argsort(b, axis=0, is_ascend=False, dtype=np.int64) mx.nd.waitall() assert (s[0].asnumpy() == (LARGE_X - 1)).all() - @unittest.skip("Memory doesn't free up after stacked execution with other ops, " + - "tracked at https://github.com/apache/incubator-mxnet/issues/17411") + @pytest.mark.skip(reason="Memory doesn't free up after 
stacked execution with other ops, " + + "tracked at https://github.com/apache/incubator-mxnet/issues/17411") def check_sort(): b = create_2d_tensor(rows=LARGE_X, columns=SMALL_Y) s = nd.sort(b, axis=0, is_ascend=False) @@ -1244,8 +1243,8 @@ def check_sort(): s = nd.sort(b, is_ascend=False) assert np.sum(s[0].asnumpy() == 0).all() - @unittest.skip("Memory doesn't free up after stacked execution with other ops, " + - "tracked at https://github.com/apache/incubator-mxnet/issues/17411") + @pytest.mark.skip(reason="Memory doesn't free up after stacked execution with other ops, " + + "tracked at https://github.com/apache/incubator-mxnet/issues/17411") def check_topk(): b = create_2d_tensor(rows=LARGE_X, columns=SMALL_Y) k = nd.topk(b, k=10, axis=0, dtype=np.int64) diff --git a/tests/nightly/test_large_vector.py b/tests/nightly/test_large_vector.py index 2fcd3e944cf2..74f015cabf0f 100644 --- a/tests/nightly/test_large_vector.py +++ b/tests/nightly/test_large_vector.py @@ -28,7 +28,6 @@ from mxnet.test_utils import rand_ndarray, assert_almost_equal, rand_coord_2d, create_vector from mxnet import gluon, nd from common import with_seed -import unittest import pytest @@ -186,8 +185,7 @@ def check_ndarray_random_uniform(): a = nd.random.uniform(shape=LARGE_X) assert a[-1] != 0 - @unittest.skip("Randint flaky, tracked at " - "https://github.com/apache/incubator-mxnet/issues/16172") + @pytest.mark.skip(reason="Randint flaky, tracked at https://github.com/apache/incubator-mxnet/issues/16172") @with_seed() def check_ndarray_random_randint(): # check if randint can generate value greater than 2**32 (large) @@ -485,15 +483,15 @@ def check_argmin(): assert idx[0] == 0 assert idx.shape[0] == 1 - @unittest.skip("Memory doesn't free up after stacked execution with other ops, " + - "tracked at https://github.com/apache/incubator-mxnet/issues/17411") + @pytest.mark.skip(reason="Memory doesn't free up after stacked execution with other ops, " + + "tracked at https://github.com/apache/incubator-mxnet/issues/17411") def check_argsort(): a = create_vector(size=LARGE_X) s = nd.argsort(a, axis=0, is_ascend=False, dtype=np.int64) assert s[0] == (LARGE_X - 1) - @unittest.skip("Memory doesn't free up after stacked execution with other ops, " + - "tracked at https://github.com/apache/incubator-mxnet/issues/17411") + @pytest.mark.skip(reason="Memory doesn't free up after stacked execution with other ops, " + + "tracked at https://github.com/apache/incubator-mxnet/issues/17411") def check_sort(): a = create_vector(size=LARGE_X) @@ -508,8 +506,8 @@ def check_ascend(x): check_descend(a) check_ascend(a) - @unittest.skip("Memory doesn't free up after stacked execution with other ops, " + - "tracked at https://github.com/apache/incubator-mxnet/issues/17411") + @pytest.mark.skip(reason="Memory doesn't free up after stacked execution with other ops, " + + "tracked at https://github.com/apache/incubator-mxnet/issues/17411") def check_topk(): a = create_vector(size=LARGE_X) ind = nd.topk(a, k=10, axis=0, dtype=np.int64) diff --git a/tests/python/gpu/test_device.py b/tests/python/gpu/test_device.py index cd8145c3deac..52e09c029b49 100644 --- a/tests/python/gpu/test_device.py +++ b/tests/python/gpu/test_device.py @@ -17,7 +17,7 @@ import mxnet as mx import numpy as np -import unittest +import pytest import os import logging @@ -35,7 +35,7 @@ gpus = range(1, 1+num_gpus) -@unittest.skipIf(mx.context.num_gpus() < 1, "test_device_pushpull needs at least 1 GPU") +@pytest.mark.skipif(mx.context.num_gpus() < 1, reason="test_device_pushpull 
needs at least 1 GPU") def test_device_pushpull(): def check_dense_pushpull(kv_type): for shape, key in zip(shapes, keys): diff --git a/tests/python/gpu/test_extensions_gpu.py b/tests/python/gpu/test_extensions_gpu.py index 8315b49660f3..18368e755b6c 100644 --- a/tests/python/gpu/test_extensions_gpu.py +++ b/tests/python/gpu/test_extensions_gpu.py @@ -19,20 +19,20 @@ import os import platform -import unittest import mxnet as mx import numpy as np from mxnet import nd from mxnet.gluon import nn from mxnet.base import MXNetError from mxnet.test_utils import download, is_cd_run, assert_almost_equal, default_context +import pytest base_path = os.path.join(os.path.dirname(__file__), "../../..") def check_platform(): return platform.machine() not in ['x86_64', 'AMD64'] -@unittest.skipIf(check_platform(), "not all machine types supported") -@unittest.skipIf(is_cd_run(), "continuous delivery run - ignoring test") +@pytest.mark.skipif(check_platform(), reason="not all machine types supported") +@pytest.mark.skipif(is_cd_run(), reason="continuous delivery run - ignoring test") def test_custom_op_gpu(): # possible places to find library file if (os.name=='posix'): diff --git a/tests/python/gpu/test_gluon_gpu.py b/tests/python/gpu/test_gluon_gpu.py index 32b4f2e2de19..a777546327ae 100644 --- a/tests/python/gpu/test_gluon_gpu.py +++ b/tests/python/gpu/test_gluon_gpu.py @@ -582,7 +582,7 @@ def _test_bulking(test_bulking_func): .format(fully_bulked_time - fastest_half_bulked_time, times_str) @with_seed() -@unittest.skip('skippping temporarily, tracked by https://github.com/apache/incubator-mxnet/issues/14970') +@pytest.mark.skip(reason='skippping temporarily, tracked by https://github.com/apache/incubator-mxnet/issues/14970') def test_bulking_gluon_gpu(): _test_bulking(_test_bulking_in_process) diff --git a/tests/python/gpu/test_nccl.py b/tests/python/gpu/test_nccl.py index 275dae009a21..136ced2aac73 100644 --- a/tests/python/gpu/test_nccl.py +++ b/tests/python/gpu/test_nccl.py @@ -17,8 +17,8 @@ import mxnet as mx import numpy as np -import unittest import os +import pytest shapes = [(10), (100), (1000), (10000), (100000), (2,2), (2,3,4,5,6,7,8)] keys = [1,2,3,4,5,6,7] @@ -32,7 +32,7 @@ gpus = range(1, 1+num_gpus) -@unittest.skip("Test requires NCCL library installed and enabled during build") +@pytest.mark.skip(reason="Test requires NCCL library installed and enabled during build") def test_nccl_pushpull(): for shape, key in zip(shapes, keys): for n_gpus in gpus: diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py index fb019bd56fe5..e92c2960d42a 100644 --- a/tests/python/gpu/test_operator_gpu.py +++ b/tests/python/gpu/test_operator_gpu.py @@ -22,7 +22,6 @@ import multiprocessing as mp import mxnet as mx import numpy as np -import unittest import pytest from mxnet.test_utils import check_consistency, set_default_context, assert_almost_equal, assert_allclose from mxnet.base import MXNetError @@ -647,7 +646,7 @@ def check_consistency_NxM(sym_list, ctx_list): check_consistency(np.repeat(sym_list, len(ctx_list)), ctx_list * len(sym_list), scale=0.5) -@unittest.skip("test fails intermittently. temporarily disabled till it gets fixed. tracked at https://github.com/apache/incubator-mxnet/issues/10141") +@pytest.mark.skip(reason="test fails intermittently. temporarily disabled till it gets fixed. 
tracked at https://github.com/apache/incubator-mxnet/issues/10141") @with_seed() @pytest.mark.serial def test_convolution_options(): @@ -767,7 +766,7 @@ def _conv_with_num_streams(seed): raise -@unittest.skip("skipping for now due to severe flakiness") +@pytest.mark.skip(reason="skipping for now due to severe flakiness") @with_seed() def test_convolution_multiple_streams(): for num_streams in [1, 2]: @@ -1635,6 +1634,8 @@ def test_lrn(): @with_seed() +@pytest.mark.skipif(os.environ.get('MXNET_ENGINE_TYPE') == 'NaiveEngine', + reason="Testing with naive engine consistently triggers illegal memory access. Tracked in #17713") def test_embedding_with_type(): def test_embedding_helper(data_types, weight_types, low_pad, high_pad): NVD = [[20, 10, 20], [200, 10, 300]] @@ -2466,12 +2467,12 @@ def _test_bulking_in_process(seed, time_per_iteration): @with_seed() -@unittest.skip('skippping temporarily, tracked by https://github.com/apache/incubator-mxnet/issues/16517') +@pytest.mark.skip(reason='skippping temporarily, tracked by https://github.com/apache/incubator-mxnet/issues/16517') def test_bulking_operator_gpu(): _test_bulking(_test_bulking_in_process) -@unittest.skip('skippping temporarily, tracked by https://github.com/apache/incubator-mxnet/issues/14970') +@pytest.mark.skip(reason='skippping temporarily, tracked by https://github.com/apache/incubator-mxnet/issues/14970') def test_bulking(): # test case format: (max_fwd_segment_size, max_bwd_segment_size, enable_bulking_in_training) test_cases = [(0,0,True), (1,1,True), (15,15,False), (15,0,True), (0,15,True), (15,15,True)] diff --git a/tests/python/mkl/test_bf16_operator.py b/tests/python/mkl/test_bf16_operator.py index 888b5d20b908..4b1f75ec7dc5 100644 --- a/tests/python/mkl/test_bf16_operator.py +++ b/tests/python/mkl/test_bf16_operator.py @@ -32,7 +32,7 @@ curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) sys.path.insert(0, os.path.join(curr_path, '../unittest')) from common import with_seed -import unittest +import pytest bfloat16 = np.dtype([('bfloat16', np.uint16)]) @@ -202,7 +202,7 @@ def test_bf16_elemwiseadd(): check_operator_accuracy(sym_fp32, sym_bf16, dshape, num_input_data=2, bf16_use_fp32_params=True) -@unittest.skip("env dependent, need check further.") +@pytest.mark.skip(reason="env dependent, need check further.") @with_seed() def test_bf16_concat(): dshape = rand_shape_nd(4) diff --git a/tests/python/mkl/test_mkldnn.py b/tests/python/mkl/test_mkldnn.py index 82519a11d919..2ca788f2dce7 100644 --- a/tests/python/mkl/test_mkldnn.py +++ b/tests/python/mkl/test_mkldnn.py @@ -22,7 +22,7 @@ import os import numpy as np import mxnet as mx -import unittest +import pytest from mxnet.test_utils import rand_ndarray, assert_almost_equal from mxnet.module import Module from mxnet import gluon @@ -446,7 +446,7 @@ def check_convolution_training(stype): @with_seed() -@unittest.skip("Flaky test https://github.com/apache/incubator-mxnet/issues/12579") +@pytest.mark.skip(reason="Flaky test https://github.com/apache/incubator-mxnet/issues/12579") def test_Deconvolution(): def check_Deconvolution_training(stype): for shape in [(3, 3, 10), (3, 3, 10, 10)]: diff --git a/tests/python/profiling/test_nvtx.py b/tests/python/profiling/test_nvtx.py index a80e33ec03b0..5cf708b7a700 100644 --- a/tests/python/profiling/test_nvtx.py +++ b/tests/python/profiling/test_nvtx.py @@ -15,19 +15,16 @@ # specific language governing permissions and limitations # under the License. 
import os -import unittest +import pytest import mxnet as mx import sys from subprocess import Popen, PIPE - +@pytest.mark.skipif(not mx.context.num_gpus(), reason='Test only applicable to machines with GPUs') def test_nvtx_ranges_present_in_profile(): - if not mx.context.num_gpus(): - unittest.skip('Test only applicable to machines with GPUs') - # Build a system independent wrapper to execute simple_forward with nvprof # This requires nvprof to be on your path (which should be the case for most GPU workstations with cuda installed). simple_forward_path = os.path.realpath(__file__) diff --git a/tests/python/unittest/test_contrib_svrg_module.py b/tests/python/unittest/test_contrib_svrg_module.py index 8c25742bd74c..6e9f9b5ba22b 100644 --- a/tests/python/unittest/test_contrib_svrg_module.py +++ b/tests/python/unittest/test_contrib_svrg_module.py @@ -20,7 +20,7 @@ from common import with_seed, assertRaises from mxnet.contrib.svrg_optimization.svrg_module import SVRGModule from mxnet.test_utils import * -import unittest +import pytest def setup(): train_data = np.random.randint(1, 5, [1000, 2]) @@ -94,7 +94,7 @@ def test_module_bind(): assert mod._mod_aux.binded == True -@unittest.skip("Flaky test https://gitsvrhub.com/apache/incubator-mxnet/issues/12510") +@pytest.mark.skip(reason="Flaky test https://gitsvrhub.com/apache/incubator-mxnet/issues/12510") @with_seed() def test_module_save_load(tmpdir): import os @@ -133,7 +133,7 @@ def test_module_save_load(tmpdir): assert mod3._symbol.tojson() == mod4._symbol.tojson() -@unittest.skip("Flaky test https://github.com/apache/incubator-mxnet/issues/12510") +@pytest.mark.skip(reason="Flaky test https://github.com/apache/incubator-mxnet/issues/12510") @with_seed() def test_svrgmodule_reshape(): data = mx.sym.Variable("data") @@ -161,7 +161,7 @@ def test_svrgmodule_reshape(): assert mod.get_outputs()[0].shape == dshape -@unittest.skip("Flaky test https://github.com/apache/incubator-mxnet/issues/12510") +@pytest.mark.skip(reason="Flaky test https://github.com/apache/incubator-mxnet/issues/12510") @with_seed() def test_update_full_grad(): def create_network(): @@ -204,7 +204,7 @@ def create_network(): assert same(full_grads_weights, svrg_mod._param_dict[0]['fc1_weight']) -@unittest.skip("Flaky test https://github.com/apache/incubator-mxnet/issues/12510") +@pytest.mark.skip(reason="Flaky test https://github.com/apache/incubator-mxnet/issues/12510") @with_seed() def test_svrg_with_sgd(): def create_module_with_sgd(): @@ -268,7 +268,7 @@ def create_module_with_sgd(): assert svrg_mse < sgd_mse -@unittest.skip("Flaky test https://github.com/apache/incubator-mxnet/issues/12510") +@pytest.mark.skip(reason="Flaky test https://github.com/apache/incubator-mxnet/issues/12510") @with_seed() def test_accumulate_kvstore(): # Test KVStore behavior when push a list of values @@ -292,7 +292,7 @@ def test_accumulate_kvstore(): assert same(svrg_mod._param_dict[0]["fc1_weight"], b[0]) -@unittest.skip("Flaky test https://github.com/apache/incubator-mxnet/issues/12510") +@pytest.mark.skip(reason="Flaky test https://github.com/apache/incubator-mxnet/issues/12510") @with_seed() def test_fit(): di, mod = setup() diff --git a/tests/python/unittest/test_engine.py b/tests/python/unittest/test_engine.py index fafc6758d892..538e4b57ead8 100644 --- a/tests/python/unittest/test_engine.py +++ b/tests/python/unittest/test_engine.py @@ -17,8 +17,8 @@ import mxnet as mx import os -import unittest from mxnet.test_utils import EnvManager +import pytest def test_bulk(): with 
mx.engine.bulk(10): @@ -32,7 +32,7 @@ def test_bulk(): x += 1 assert (x.asnumpy() == 104).all() -@unittest.skip("OMP platform dependent") +@pytest.mark.skip(reason="OMP platform dependent") def test_engine_openmp_after_fork(): """ Test that the number of max threads in the child is 1. After forking we should not use a bigger diff --git a/tests/python/unittest/test_exc_handling.py b/tests/python/unittest/test_exc_handling.py index 8657bec1398c..be4e643d6890 100644 --- a/tests/python/unittest/test_exc_handling.py +++ b/tests/python/unittest/test_exc_handling.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. +import os import mxnet as mx import numpy as np from mxnet import gluon @@ -26,6 +27,8 @@ @with_seed() +@pytest.mark.skipif(os.environ.get('MXNET_ENGINE_TYPE') == 'NaiveEngine', + reason="This test assumes asynchronous execution.") def test_exc_imperative(): def imperative(exec_numpy=True): a = mx.nd.random.normal(0, 1, (2, 2)) @@ -76,6 +79,8 @@ def symbolic(exec_backward=True, waitall=True): pytest.raises(MXNetError, symbolic, exec_backward=True, waitall=True) @with_seed() +@pytest.mark.skipif(os.environ.get('MXNET_ENGINE_TYPE') == 'NaiveEngine', + reason="This test assumes asynchronous execution.") def test_exc_gluon(): def gluon(exec_wait=True, waitall=False): model = nn.Sequential() @@ -127,6 +132,8 @@ def multiple_waits(waitall=False): multiple_waits(waitall=True) @with_seed() +@pytest.mark.skipif(os.environ.get('MXNET_ENGINE_TYPE') == 'NaiveEngine', + reason="This test assumes asynchronous execution.") def test_exc_post_fail(): def post_fail(waitall=False): caught = False diff --git a/tests/python/unittest/test_extensions.py b/tests/python/unittest/test_extensions.py index d00f1494e4d5..57dcaa0001bf 100644 --- a/tests/python/unittest/test_extensions.py +++ b/tests/python/unittest/test_extensions.py @@ -19,20 +19,20 @@ import os import platform -import unittest import mxnet as mx import numpy as np from mxnet import nd from mxnet.gluon import nn from mxnet.base import MXNetError from mxnet.test_utils import download, is_cd_run, assert_almost_equal, default_context +import pytest base_path = os.path.join(os.path.dirname(__file__), "../../..") def check_platform(): return platform.machine() not in ['x86_64', 'AMD64'] -@unittest.skipIf(check_platform(), "not all machine types supported") -@unittest.skipIf(is_cd_run(), "continuous delivery run - ignoring test") +@pytest.mark.skipif(check_platform(), reason="not all machine types supported") +@pytest.mark.skipif(is_cd_run(), reason="continuous delivery run - ignoring test") def test_custom_op(): # possible places to find library file if (os.name=='posix'): @@ -96,8 +96,8 @@ def test_custom_op(): assert_almost_equal(in_grad_base[0].asnumpy(), in_grad1[0].asnumpy(), rtol=1e-3, atol=1e-3) assert_almost_equal(in_grad_base[0].asnumpy(), in_grad2[0].asnumpy(), rtol=1e-3, atol=1e-3) -@unittest.skipIf(check_platform(), "not all machine types supported") -@unittest.skipIf(is_cd_run(), "continuous delivery run - ignoring test") +@pytest.mark.skipif(check_platform(), reason="not all machine types supported") +@pytest.mark.skipif(is_cd_run(), reason="continuous delivery run - ignoring test") def test_subgraph(): # possible places to find library file if (os.name=='posix'): diff --git a/tests/python/unittest/test_gluon.py b/tests/python/unittest/test_gluon.py index 587be268deff..98773b238348 100644 --- a/tests/python/unittest/test_gluon.py +++ b/tests/python/unittest/test_gluon.py @@ -35,7 +35,6 @@ 
from copy import deepcopy import warnings import json -import unittest import random import tempfile @@ -2050,10 +2049,9 @@ def check_layer_forward_withinput(net, x): mx.test_utils.assert_almost_equal(out1.asnumpy(), out2.asnumpy(), rtol=1e-5, atol=1e-6) @with_seed() -def test_conv2d_16c(): - chn_list = [16, 256] - kernel_list = [1, 3] - kernel_list.append(224) +@pytest.mark.parametrize('chn_num', [16, 256]) +@pytest.mark.parametrize('kernel', [1, 3, 224]) +def test_conv2d_16c(chn_num, kernel): batch_size = 4 class Net(gluon.HybridBlock): def __init__(self, @@ -2069,10 +2067,8 @@ def hybrid_forward(self, F, x): return out x = mx.nd.random.uniform(-1.0, 1.0, shape=(batch_size, 3, 224, 224)) - for i in range(len(chn_list)): - for j in range(len(kernel_list)): - net = Net(chn_list[i], kernel_list[j]) - check_layer_forward_withinput(net, x) + net = Net(chn_num, kernel) + check_layer_forward_withinput(net, x) @with_seed() def test_group_conv2d_16c(): @@ -2103,7 +2099,7 @@ def hybrid_forward(self, F, x): check_layer_forward_withinput(net, x) @with_seed() -@unittest.skip('skippping temporarily, tracked by https://github.com/apache/incubator-mxnet/issues/11164') +@pytest.mark.skip(reason='skippping temporarily, tracked by https://github.com/apache/incubator-mxnet/issues/11164') def test_deconv2d_16c(): in_chn_list = [1024, 512, 256, 128, 64, 32, 16] out_chn_list = [512, 256, 128, 64, 32, 16, 3] @@ -2127,7 +2123,7 @@ def hybrid_forward(self, F, x): @with_seed() -@unittest.skip('skippping temporarily, tracked by https://github.com/apache/incubator-mxnet/issues/11164') +@pytest.mark.skip(reason='skippping temporarily, tracked by https://github.com/apache/incubator-mxnet/issues/11164') def test_batchnorm_16c(): chn_list = [16, 1024] shape = np.random.randint(low=1, high=300, size=10) @@ -2211,7 +2207,7 @@ def hybrid_forward(self, F, x): @with_seed() -@unittest.skip('skippping temporarily, tracked by https://github.com/apache/incubator-mxnet/issues/11164') +@pytest.mark.skip(reason='skippping temporarily, tracked by https://github.com/apache/incubator-mxnet/issues/11164') def test_reshape_conv_reshape_conv(): class Net(gluon.HybridBlock): def __init__(self, **kwargs): @@ -2270,7 +2266,7 @@ def hybrid_forward(self, F, x): @with_seed() -@unittest.skip('skippping temporarily, tracked by https://github.com/apache/incubator-mxnet/issues/11164') +@pytest.mark.skip(reason='skippping temporarily, tracked by https://github.com/apache/incubator-mxnet/issues/11164') def test_slice_conv_reshape_conv(): class Net(gluon.HybridBlock): def __init__(self, **kwargs): @@ -2450,7 +2446,7 @@ def hybrid_forward(self, F, x): @with_seed() -@unittest.skip('skippping temporarily, tracked by https://github.com/apache/incubator-mxnet/issues/11164') +@pytest.mark.skip(reason='skippping temporarily, tracked by https://github.com/apache/incubator-mxnet/issues/11164') def test_reshape_batchnorm(): class Net(gluon.HybridBlock): def __init__(self, shape, **kwargs): @@ -2497,7 +2493,7 @@ def hybrid_forward(self, F, x): @with_seed() -@unittest.skip('skippping temporarily, tracked by https://github.com/apache/incubator-mxnet/issues/11164') +@pytest.mark.skip(reason='skippping temporarily, tracked by https://github.com/apache/incubator-mxnet/issues/11164') @pytest.mark.serial def test_slice_batchnorm_slice_batchnorm(): class Net(gluon.HybridBlock): @@ -2524,7 +2520,7 @@ def hybrid_forward(self, F, x): @with_seed() -@unittest.skip('skippping temporarily, tracked by https://github.com/apache/incubator-mxnet/issues/11164') 
+@pytest.mark.skip(reason='skippping temporarily, tracked by https://github.com/apache/incubator-mxnet/issues/11164') def test_reshape_batchnorm_reshape_batchnorm(): class Net(gluon.HybridBlock): def __init__(self, shape, **kwargs): @@ -2578,7 +2574,7 @@ def hybrid_forward(self, F, x): @with_seed() -@unittest.skip('skippping temporarily, tracked by https://github.com/apache/incubator-mxnet/issues/11164') +@pytest.mark.skip(reason='skippping temporarily, tracked by https://github.com/apache/incubator-mxnet/issues/11164') def test_reshape_batchnorm_slice_batchnorm(): class Net(gluon.HybridBlock): def __init__(self, shape, slice, **kwargs): @@ -2605,7 +2601,7 @@ def hybrid_forward(self, F, x): check_layer_forward_withinput(net, x) @with_seed() -@unittest.skip('skippping temporarily, tracked by https://github.com/apache/incubator-mxnet/issues/11164') +@pytest.mark.skip(reason='skippping temporarily, tracked by https://github.com/apache/incubator-mxnet/issues/11164') def test_reshape_pooling2d(): max_pooling = nn.MaxPool2D(strides=(2, 3), padding=(1, 1)) avg_pooling = nn.AvgPool2D(strides=(2, 2), padding=(1, 1)) @@ -2673,7 +2669,7 @@ def hybrid_forward(self, F, x): check_layer_forward_withinput(net, x) @with_seed() -@unittest.skip('skippping temporarily, tracked by https://github.com/apache/incubator-mxnet/issues/11164') +@pytest.mark.skip(reason='skippping temporarily, tracked by https://github.com/apache/incubator-mxnet/issues/11164') def test_reshape_pooling2d_reshape_pooling2d(): max_pooling = nn.MaxPool2D(strides=(2, 2), padding=(1, 1)) avg_pooling = nn.AvgPool2D(strides=(2, 2), padding=(1, 1)) @@ -2745,7 +2741,7 @@ def hybrid_forward(self, F, x): check_layer_forward_withinput(net, x) @with_seed() -@unittest.skip('skippping temporarily, tracked by https://github.com/apache/incubator-mxnet/issues/11164') +@pytest.mark.skip(reason='skippping temporarily, tracked by https://github.com/apache/incubator-mxnet/issues/11164') def test_slice_pooling2d_reshape_pooling2d(): max_pooling = nn.MaxPool2D(strides=(2, 3), padding=(1, 1)) avg_pooling = nn.AvgPool2D(strides=(2, 2), padding=(1, 1)) @@ -2782,7 +2778,7 @@ def hybrid_forward(self, F, x): check_layer_forward_withinput(net, x) @with_seed() -@unittest.skip('skippping temporarily, tracked by https://github.com/apache/incubator-mxnet/issues/11164') +@pytest.mark.skip(reason='skippping temporarily, tracked by https://github.com/apache/incubator-mxnet/issues/11164') @pytest.mark.serial def test_reshape_pooling2d_slice_pooling2d(): max_pooling = nn.MaxPool2D(strides=(2, 3), padding=(1, 1)) @@ -2822,7 +2818,7 @@ def hybrid_forward(self, F, x): check_layer_forward_withinput(net, x) @with_seed() -@unittest.skip('skippping temporarily, tracked by https://github.com/apache/incubator-mxnet/issues/11164') +@pytest.mark.skip(reason='skippping temporarily, tracked by https://github.com/apache/incubator-mxnet/issues/11164') @pytest.mark.serial def test_reshape_deconv(): class Net(gluon.HybridBlock): @@ -2842,7 +2838,7 @@ def hybrid_forward(self, F, x): check_layer_forward_withinput(net, x) @with_seed() -@unittest.skip('skippping temporarily, tracked by https://github.com/apache/incubator-mxnet/issues/11164') +@pytest.mark.skip(reason='skippping temporarily, tracked by https://github.com/apache/incubator-mxnet/issues/11164') @pytest.mark.serial def test_slice_deconv(): class Net(gluon.HybridBlock): @@ -2862,7 +2858,7 @@ def hybrid_forward(self, F, x): check_layer_forward_withinput(net, x) @with_seed() -@unittest.skip('skippping temporarily, tracked by 
https://github.com/apache/incubator-mxnet/issues/11164') +@pytest.mark.skip(reason='skippping temporarily, tracked by https://github.com/apache/incubator-mxnet/issues/11164') @pytest.mark.serial def test_reshape_deconv_reshape_deconv(): class Net(gluon.HybridBlock): @@ -2886,7 +2882,7 @@ def hybrid_forward(self, F, x): check_layer_forward_withinput(net, x) @with_seed() -@unittest.skip('skippping temporarily, tracked by https://github.com/apache/incubator-mxnet/issues/11164') +@pytest.mark.skip(reason='skippping temporarily, tracked by https://github.com/apache/incubator-mxnet/issues/11164') @pytest.mark.serial def test_slice_deconv_slice_deconv(): class Net(gluon.HybridBlock): @@ -2910,7 +2906,7 @@ def hybrid_forward(self, F, x): check_layer_forward_withinput(net, x) @with_seed() -@unittest.skip('skippping temporarily, tracked by https://github.com/apache/incubator-mxnet/issues/11164') +@pytest.mark.skip(reason='skippping temporarily, tracked by https://github.com/apache/incubator-mxnet/issues/11164') @pytest.mark.serial def test_reshape_deconv_slice_deconv(): class Net(gluon.HybridBlock): @@ -2936,7 +2932,7 @@ def hybrid_forward(self, F, x): check_layer_forward_withinput(net, x) @with_seed() -@unittest.skip('skippping temporarily, tracked by https://github.com/apache/incubator-mxnet/issues/11164') +@pytest.mark.skip(reason='skippping temporarily, tracked by https://github.com/apache/incubator-mxnet/issues/11164') @pytest.mark.serial def test_slice_deconv_reshape_deconv(): class Net(gluon.HybridBlock): diff --git a/tests/python/unittest/test_init.py b/tests/python/unittest/test_init.py index 290f84b18781..1a86c828e9d2 100644 --- a/tests/python/unittest/test_init.py +++ b/tests/python/unittest/test_init.py @@ -16,7 +16,7 @@ # under the License. import json -import unittest +import pytest import mxnet as mx import numpy as np @@ -48,7 +48,7 @@ def test_aux_init(): assert (mod.get_params()[1]['bn_moving_var'].asnumpy() == 1).all() assert (mod.get_params()[1]['bn_moving_mean'].asnumpy() == 0).all() -@unittest.skip("rsp const init is broken: https://github.com/apache/incubator-mxnet/issues/17988") +@pytest.mark.skip(reason="rsp const init is broken: https://github.com/apache/incubator-mxnet/issues/17988") def test_rsp_const_init(): def check_rsp_const_init(init, val): shape = (10, 10) diff --git a/tests/python/unittest/test_module.py b/tests/python/unittest/test_module.py index 7941eec5004d..d8fb02e9ad66 100644 --- a/tests/python/unittest/test_module.py +++ b/tests/python/unittest/test_module.py @@ -27,6 +27,7 @@ curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) sys.path.insert(0, os.path.join(curr_path, "../train")) from test_bucketing import train_model, prepare_bucketing_data +import pytest @with_seed() @@ -177,7 +178,11 @@ def test_module_layout(): @with_seed() -def test_save_load(): +@pytest.mark.parametrize('ctx,get_updater', [ + (mx.cpu(), lambda m: m._updater), + ([mx.cpu(0), mx.cpu(1)], lambda m: m._kvstore._updater) +]) +def test_save_load(ctx, get_updater, tmpdir): previous_update_on_kvstore = os.getenv('MXNET_UPDATE_ON_KVSTORE', "1") os.putenv('MXNET_UPDATE_ON_KVSTORE', '1') def dict_equ(a, b): @@ -188,40 +193,26 @@ def dict_equ(a, b): sym = mx.sym.Variable('data') sym = mx.sym.FullyConnected(sym, num_hidden=100) - # single device - mod = mx.mod.Module(sym, ('data',)) + path = str(tmpdir.join('test')) + mod = mx.mod.Module(sym, ('data',), context=ctx) mod.bind(data_shapes=[('data', (10, 10))]) mod.init_params() 
mod.init_optimizer(optimizer_params={'learning_rate':0.1, 'momentum':0.9}) mod.update() - mod.save_checkpoint('test', 0, save_optimizer_states=True) + mod.save_checkpoint(path, 0, save_optimizer_states=True) - mod2 = mx.mod.Module.load('test', 0, load_optimizer_states=True, data_names=('data',)) + mod2 = mx.mod.Module.load(path, 0, load_optimizer_states=True, data_names=('data',)) mod2.bind(data_shapes=[('data', (10, 10))]) mod2.init_optimizer(optimizer_params={'learning_rate':0.1, 'momentum':0.9}) assert mod._symbol.tojson() == mod2._symbol.tojson() dict_equ(mod.get_params()[0], mod2.get_params()[0]) - dict_equ(mod._updater.states, mod2._updater.states) + dict_equ(get_updater(mod).states, mod2._updater.states) - # multi device - mod = mx.mod.Module(sym, ('data',), context=[mx.cpu(0), mx.cpu(1)]) - mod.bind(data_shapes=[('data', (10, 10))]) - mod.init_params() - mod.init_optimizer(optimizer_params={'learning_rate':0.1, 'momentum':0.9}) - mod.update() - mod.save_checkpoint('test', 0, save_optimizer_states=True) - - mod2 = mx.mod.Module.load('test', 0, load_optimizer_states=True, data_names=('data',)) - mod2.bind(data_shapes=[('data', (10, 10))]) - mod2.init_optimizer(optimizer_params={'learning_rate':0.1, 'momentum':0.9}) - assert mod._symbol.tojson() == mod2._symbol.tojson() - dict_equ(mod.get_params()[0], mod2.get_params()[0]) - dict_equ(mod._kvstore._updater.states, mod2._updater.states) os.putenv('MXNET_UPDATE_ON_KVSTORE', previous_update_on_kvstore) @with_seed() -def test_bucketing_save_load(): +def test_bucketing_save_load(tmpdir): previous_update_on_kvstore = os.getenv('MXNET_UPDATE_ON_KVSTORE', "1") os.putenv('MXNET_UPDATE_ON_KVSTORE', '1') def dict_equ(a, b): @@ -260,10 +251,12 @@ def sym_gen(seq_len): return loss, ('data',), ('softmax_label',) + path = str(tmpdir.join('test')) + model = train_model(context=mx.current_context()) - model.save_checkpoint("test", 0) + model.save_checkpoint(path, 0) data_train, data_val = prepare_bucketing_data(buckets, len_vocab, batch_size, invalid_label, num_sentence) - mod2 = mx.mod.BucketingModule.load('test', 0, sym_gen=sym_gen, + mod2 = mx.mod.BucketingModule.load(path, 0, sym_gen=sym_gen, default_bucket_key=data_train.default_bucket_key) mod2.bind(data_shapes=data_train.provide_data, diff --git a/tests/python/unittest/test_numpy_ndarray.py b/tests/python/unittest/test_numpy_ndarray.py index b43d0e1433fc..e6db131ac382 100644 --- a/tests/python/unittest/test_numpy_ndarray.py +++ b/tests/python/unittest/test_numpy_ndarray.py @@ -20,7 +20,6 @@ from __future__ import division import itertools import os -import unittest import pytest import numpy as _np import mxnet as mx @@ -1132,162 +1131,179 @@ def test_np_multinomial(): @with_seed() -@unittest.skipUnless(is_op_runnable(), "Comparison ops can only run on either CPU instances, or GPU instances with" - " compute capability >= 53 if MXNet is built with USE_TVM_OP=ON") +@pytest.mark.skipif(not is_op_runnable(), reason="Comparison ops can only run on either CPU instances, or GPU instances with" + " compute capability >= 53 if MXNet is built with USE_TVM_OP=ON") @use_np -@unittest.skip("NumpyBooleanAssignForwardCPU broken: https://github.com/apache/incubator-mxnet/issues/17990") -@pytest.mark.serial -def test_np_ndarray_boolean_indexing(): - def test_single_bool_index(): - # adapted from numpy's test_indexing.py - # Single boolean index - a = np.array([[1, 2, 3], - [4, 5, 6], - [7, 8, 9]], dtype=np.int32) - assert same(a[np.array(True, dtype=np.bool_)].asnumpy(), a[None].asnumpy()) - assert 
same(a[np.array(False, dtype=np.bool_)].asnumpy(), a[None][0:0].asnumpy()) - - def test_boolean_catch_exception(): - # adapted from numpy's test_indexing.py - arr = np.ones((5, 4, 3)) - - index = np.array([True], dtype=np.bool_) - assert_exception(arr.__getitem__, IndexError, index) - - index = np.array([False] * 6, dtype=np.bool_) - assert_exception(arr.__getitem__, IndexError, index) - - index = np.zeros((4, 4), dtype=bool) - assert_exception(arr.__getitem__, IndexError, index) - - def test_boolean_indexing_onedim(): - # adapted from numpy's test_indexing.py - # Indexing a 2-dimensional array with - # boolean array of length one - a = np.array([[0., 0., 0.]]) - b = np.array([True], dtype=bool) - assert same(a[b].asnumpy(), a.asnumpy()) - - def test_boolean_indexing_twodim(): - # adapted from numpy's test_indexing.py - # Indexing a 2-dimensional array with - # 2-dimensional boolean array - a = np.array([[1, 2, 3], - [4, 5, 6], - [7, 8, 9]], dtype=np.int32) - b = np.array([[ True, False, True], - [False, True, False], - [ True, False, True]], dtype=np.bool_) - assert same(a[b].asnumpy(), _np.array([1, 3, 5, 7, 9], dtype=a.dtype)) - assert same(a[b[1]].asnumpy(), _np.array([[4, 5, 6]], dtype=a.dtype)) - assert same(a[b[0]].asnumpy(), a[b[2]].asnumpy()) - - def test_boolean_indexing_list(): - # adapted from numpy's test_indexing.py - a = np.array([1, 2, 3], dtype=np.int32) - b = [True, False, True] - # Two variants of the test because the first takes a fast path - assert same(a[b].asnumpy(), _np.array([1, 3], dtype=a.dtype)) - (a[None, b], [[1, 3]]) - - def test_boolean_indexing_tuple(): - # case arr[:, mask, :] and arr[1, mask, 0] - # when a boolean array is in a tuple - a = np.array([[[0, 1], - [2, 3]], - [[4, 5], - [6, 7]]], dtype=np.int32) - b = np.array([[False,True], - [True,False]],dtype=np.bool) - _np_a = a.asnumpy() - _np_b = b.asnumpy() - assert same(a[:, b].asnumpy(), _np_a[:, _np_b]) - assert same(a[b, :].asnumpy(), _np_a[_np_b, :]) - assert same(a[0, b].asnumpy(), _np_a[0, _np_b]) - assert same(a[b, 1].asnumpy(), _np_a[_np_b, 1]) - - a = np.arange(12).reshape(4,3) - b = np.array([1.,2.,3.]) - _np_a = a.asnumpy() - _np_b = b.asnumpy() - assert same(a[:, b > 2].shape, _np_a[:, _np_b > 2].shape) - assert same(a[:, b > 2].asnumpy(), _np_a[:, _np_b > 2]) - - a = np.array([[1,2,3],[3,4,5]]) - _np_a = a.asnumpy() - assert same(a[:,a[1,:] > 0].shape, _np_a[:,_np_a[1,: ] > 0].shape) - assert same(a[:,a[1,:] > 0].asnumpy(), _np_a[:,_np_a[1,: ] > 0]) - - a = np.ones((3,2), dtype='bool') - b = np.array([1,2,3]) - _np_a = a.asnumpy() - _np_b = b.asnumpy() - assert same(a[b > 1].asnumpy(), _np_a[_np_b > 1]) - - def test_boolean_indexing_assign(): - # test boolean indexing assign - shape = (3, 2, 3) - mx_data = np.random.uniform(size=shape) - mx_mask = np.array([[False,True], [True,False], [True,False]],dtype=np.bool) - np_data = mx_data.asnumpy() - np_mask = mx_mask.asnumpy() - - np_data[np_data>0.5] = 0 - mx_data[mx_data>0.5] = 0 - assert_almost_equal(mx_data.asnumpy(), np_data, rtol=1e-3, atol=1e-5, use_broadcast=False) - np_data[np_mask] = 1 - mx_data[mx_mask] = 1 - assert_almost_equal(mx_data.asnumpy(), np_data, rtol=1e-3, atol=1e-5, use_broadcast=False) - - np_data[np_mask, 1] = 2 - mx_data[mx_mask, 1] = 2 - assert_almost_equal(mx_data.asnumpy(), np_data, rtol=1e-3, atol=1e-5, use_broadcast=False) - - np_data[np_mask, :] = 3 - mx_data[mx_mask, :] = 3 - assert_almost_equal(mx_data.asnumpy(), np_data, rtol=1e-3, atol=1e-5, use_broadcast=False) - - mx_mask = np.array([[False,True, 
True],[False, True,False]],dtype=np.bool) - np_mask = mx_mask.asnumpy() - - np_data[0, np_mask] = 5 - mx_data[0, mx_mask] = 5 - assert_almost_equal(mx_data.asnumpy(), np_data, rtol=1e-3, atol=1e-5, use_broadcast=False) - np_data[:, np_mask] = 6 - mx_data[:, mx_mask] = 6 - assert_almost_equal(mx_data.asnumpy(), np_data, rtol=1e-3, atol=1e-5, use_broadcast=False) - - np_data[0, True, True, np_mask] = 7 - mx_data[0, True, True, mx_mask] = 7 - assert_almost_equal(mx_data.asnumpy(), np_data, rtol=1e-3, atol=1e-5, use_broadcast=False) - - np_data[False, 1] = 8 - mx_data[False, 1] = 8 - assert_almost_equal(mx_data.asnumpy(), np_data, rtol=1e-3, atol=1e-5, use_broadcast=False) - - def test_boolean_indexing_autograd(): - a = np.random.uniform(size=(3, 4, 5)) - a.attach_grad() - with mx.autograd.record(): - out_mx = a[a < 0.5] - out_mx.backward() - - a_np = a.asnumpy() - out_np = a_np[a_np < 0.5] - assert_almost_equal(out_mx.asnumpy(), out_np, rtol=1e-4, atol=1e-5, use_broadcast=False) - - a_grad_np = _np.zeros(a.shape, dtype=a.dtype) - a_grad_np[a_np < 0.5] = 1 - assert_almost_equal(a.grad.asnumpy(), a_grad_np, rtol=1e-4, atol=1e-5, use_broadcast=False) - - test_single_bool_index() - test_boolean_catch_exception() - test_boolean_indexing_onedim() - test_boolean_indexing_twodim() - test_boolean_indexing_list() - test_boolean_indexing_tuple() - test_boolean_indexing_assign() - test_boolean_indexing_autograd() +def test_boolean_index_single(): + # adapted from numpy's test_indexing.py + # Single boolean index + a = np.array([[1, 2, 3], + [4, 5, 6], + [7, 8, 9]], dtype=np.int32) + assert same(a[np.array(True, dtype=np.bool_)].asnumpy(), a[None].asnumpy()) + assert same(a[np.array(False, dtype=np.bool_)].asnumpy(), a[None][0:0].asnumpy()) + +@with_seed() +@pytest.mark.skipif(not is_op_runnable(), reason="Comparison ops can only run on either CPU instances, or GPU instances with" + " compute capability >= 53 if MXNet is built with USE_TVM_OP=ON") +@use_np +def test_boolean_index_catch_exception(): + # adapted from numpy's test_indexing.py + arr = np.ones((5, 4, 3)) + + index = np.array([True], dtype=np.bool_) + assert_exception(arr.__getitem__, IndexError, index) + + index = np.array([False] * 6, dtype=np.bool_) + assert_exception(arr.__getitem__, IndexError, index) + + index = np.zeros((4, 4), dtype=bool) + assert_exception(arr.__getitem__, IndexError, index) + +@with_seed() +@pytest.mark.skipif(not is_op_runnable(), reason="Comparison ops can only run on either CPU instances, or GPU instances with" + " compute capability >= 53 if MXNet is built with USE_TVM_OP=ON") +@use_np +def test_boolean_index_onedim(): + # adapted from numpy's test_indexing.py + # Indexing a 2-dimensional array with + # boolean array of length one + a = np.array([[0., 0., 0.]]) + b = np.array([True], dtype=bool) + assert same(a[b].asnumpy(), a.asnumpy()) + +@with_seed() +@pytest.mark.skipif(not is_op_runnable(), reason="Comparison ops can only run on either CPU instances, or GPU instances with" + " compute capability >= 53 if MXNet is built with USE_TVM_OP=ON") +@use_np +def test_boolean_index_twodim(): + # adapted from numpy's test_indexing.py + # Indexing a 2-dimensional array with + # 2-dimensional boolean array + a = np.array([[1, 2, 3], + [4, 5, 6], + [7, 8, 9]], dtype=np.int32) + b = np.array([[ True, False, True], + [False, True, False], + [ True, False, True]], dtype=np.bool_) + assert same(a[b].asnumpy(), _np.array([1, 3, 5, 7, 9], dtype=a.dtype)) + assert same(a[b[1]].asnumpy(), _np.array([[4, 5, 6]], dtype=a.dtype)) + 
assert same(a[b[0]].asnumpy(), a[b[2]].asnumpy()) + +@with_seed() +@pytest.mark.skipif(not is_op_runnable(), reason="Comparison ops can only run on either CPU instances, or GPU instances with" + " compute capability >= 53 if MXNet is built with USE_TVM_OP=ON") +@use_np +def test_boolean_index_list(): + # adapted from numpy's test_indexing.py + a = np.array([1, 2, 3], dtype=np.int32) + b = [True, False, True] + # Two variants of the test because the first takes a fast path + assert same(a[b].asnumpy(), _np.array([1, 3], dtype=a.dtype)) + (a[None, b], [[1, 3]]) + +@with_seed() +@pytest.mark.skipif(not is_op_runnable(), reason="Comparison ops can only run on either CPU instances, or GPU instances with" + " compute capability >= 53 if MXNet is built with USE_TVM_OP=ON") +@use_np +def test_boolean_index_tuple(): + # case arr[:, mask, :] and arr[1, mask, 0] + # when a boolean array is in a tuple + a = np.array([[[0, 1], + [2, 3]], + [[4, 5], + [6, 7]]], dtype=np.int32) + b = np.array([[False,True], + [True,False]],dtype=np.bool) + _np_a = a.asnumpy() + _np_b = b.asnumpy() + assert same(a[:, b].asnumpy(), _np_a[:, _np_b]) + assert same(a[b, :].asnumpy(), _np_a[_np_b, :]) + assert same(a[0, b].asnumpy(), _np_a[0, _np_b]) + assert same(a[b, 1].asnumpy(), _np_a[_np_b, 1]) + + a = np.arange(12).reshape(4,3) + b = np.array([1.,2.,3.]) + _np_a = a.asnumpy() + _np_b = b.asnumpy() + assert same(a[:, b > 2].shape, _np_a[:, _np_b > 2].shape) + assert same(a[:, b > 2].asnumpy(), _np_a[:, _np_b > 2]) + + a = np.array([[1,2,3],[3,4,5]]) + _np_a = a.asnumpy() + assert same(a[:,a[1,:] > 0].shape, _np_a[:,_np_a[1,: ] > 0].shape) + assert same(a[:,a[1,:] > 0].asnumpy(), _np_a[:,_np_a[1,: ] > 0]) + + a = np.ones((3,2), dtype='bool') + b = np.array([1,2,3]) + _np_a = a.asnumpy() + _np_b = b.asnumpy() + assert same(a[b > 1].asnumpy(), _np_a[_np_b > 1]) + +@with_seed() +@pytest.mark.skipif(not is_op_runnable(), reason="Comparison ops can only run on either CPU instances, or GPU instances with" + " compute capability >= 53 if MXNet is built with USE_TVM_OP=ON") +@use_np +@pytest.mark.xfail(reason='Flaky boolean index assign. 
See #18334') +def test_boolean_index_assign(): + # test boolean indexing assign + shape = (3, 2, 3) + mx_data = np.random.uniform(size=shape) + mx_mask = np.array([[False,True], [True,False], [True,False]],dtype=np.bool) + np_data = mx_data.asnumpy() + np_mask = mx_mask.asnumpy() + + np_data[np_data>0.5] = 0 + mx_data[mx_data>0.5] = 0 + assert_almost_equal(mx_data.asnumpy(), np_data, rtol=1e-3, atol=1e-5, use_broadcast=False) + np_data[np_mask] = 1 + mx_data[mx_mask] = 1 + assert_almost_equal(mx_data.asnumpy(), np_data, rtol=1e-3, atol=1e-5, use_broadcast=False) + + np_data[np_mask, 1] = 2 + mx_data[mx_mask, 1] = 2 + assert_almost_equal(mx_data.asnumpy(), np_data, rtol=1e-3, atol=1e-5, use_broadcast=False) + + np_data[np_mask, :] = 3 + mx_data[mx_mask, :] = 3 + assert_almost_equal(mx_data.asnumpy(), np_data, rtol=1e-3, atol=1e-5, use_broadcast=False) + + mx_mask = np.array([[False,True, True],[False, True,False]],dtype=np.bool) + np_mask = mx_mask.asnumpy() + + np_data[0, np_mask] = 5 + mx_data[0, mx_mask] = 5 + assert_almost_equal(mx_data.asnumpy(), np_data, rtol=1e-3, atol=1e-5, use_broadcast=False) + np_data[:, np_mask] = 6 + mx_data[:, mx_mask] = 6 + assert_almost_equal(mx_data.asnumpy(), np_data, rtol=1e-3, atol=1e-5, use_broadcast=False) + + np_data[0, True, True, np_mask] = 7 + mx_data[0, True, True, mx_mask] = 7 + assert_almost_equal(mx_data.asnumpy(), np_data, rtol=1e-3, atol=1e-5, use_broadcast=False) + + np_data[False, 1] = 8 + mx_data[False, 1] = 8 + assert_almost_equal(mx_data.asnumpy(), np_data, rtol=1e-3, atol=1e-5, use_broadcast=False) + +@with_seed() +@pytest.mark.skipif(not is_op_runnable(), reason="Comparison ops can only run on either CPU instances, or GPU instances with" + " compute capability >= 53 if MXNet is built with USE_TVM_OP=ON") +@use_np +def test_boolean_index_autograd(): + a = np.random.uniform(size=(3, 4, 5)) + a.attach_grad() + with mx.autograd.record(): + out_mx = a[a < 0.5] + out_mx.backward() + + a_np = a.asnumpy() + out_np = a_np[a_np < 0.5] + assert_almost_equal(out_mx.asnumpy(), out_np, rtol=1e-4, atol=1e-5, use_broadcast=False) + + a_grad_np = _np.zeros(a.shape, dtype=a.dtype) + a_grad_np[a_np < 0.5] = 1 + assert_almost_equal(a.grad.asnumpy(), a_grad_np, rtol=1e-4, atol=1e-5, use_broadcast=False) @with_seed() diff --git a/tests/python/unittest/test_numpy_op.py b/tests/python/unittest/test_numpy_op.py index 165b0f23ed2e..624f2d5f8e24 100644 --- a/tests/python/unittest/test_numpy_op.py +++ b/tests/python/unittest/test_numpy_op.py @@ -19,7 +19,6 @@ from __future__ import absolute_import from distutils.version import StrictVersion import sys -import unittest import itertools import numpy as _np import platform @@ -1326,7 +1325,6 @@ def hybrid_forward(self, F, a): @with_seed() @use_np -@unittest.skip("NumpyBooleanAssignForwardCPU broken: https://github.com/apache/incubator-mxnet/issues/17990") def test_npx_batch_dot(): ctx = mx.context.current_context() dtypes = ['float32', 'float64'] @@ -2560,7 +2558,7 @@ def hybrid_forward(self, F, a, b, *args, **kwargs): @with_seed() @use_np -@pytest.mark.flaky() +@pytest.mark.flaky(max_runs=3) def test_np_mixed_precision_binary_funcs(): itypes = [np.bool, np.int8, np.int32, np.int64] ftypes = [np.float16, np.float32, np.float64] @@ -5831,7 +5829,7 @@ class TestLstsq(HybridBlock): def __init__(self, rcond): super(TestLstsq, self).__init__() self._rcond = rcond - + def hybrid_forward(self, F, a, b, rcond='warn'): return F.np.linalg.lstsq(a, b, rcond=self._rcond) @@ -7228,10 +7226,10 @@ def __init__(self, n, k=0, 
m=None): if m is None: m = n self._m = m - + def hybrid_forward(self, F, x, *args, **kwargs): return x, F.np.tril_indices(n=self._n, k=self._k, m=self._m) - + for n in _np.random.random_integers(-10, 50, 2): for k in _np.random.random_integers(-50, 50, 2): for m in _np.random.random_integers(-10, 50, 2): @@ -7252,7 +7250,7 @@ def hybrid_forward(self, F, x, *args, **kwargs): np_data[np_out] = -10 mx_data[mx_out] = -10 assert same(np_data, mx_data.asnumpy()) - + @with_seed() @use_np @@ -8012,7 +8010,7 @@ def hybrid_forward(self, F, a): a = np.random.uniform(-1.0, 1.0, size=a_shape) np_out = _np.median(a.asnumpy(), axis=axis, keepdims=keepdims) mx_out = test_median(a) - + assert mx_out.shape == np_out.shape assert_almost_equal(mx_out.asnumpy(), np_out, atol=atol, rtol=rtol) @@ -9029,10 +9027,10 @@ def __init__(self, left=None, right=None, period=None): self._left = left self._right = right self._period = period - + def hybrid_forward(self, F, x, xp, fp): return F.np.interp(x, xp, fp, left=self._left, right=self._right, period=self._period) - + class TestInterpScalar(HybridBlock): def __init__(self, x=None, left=None, right=None, period=None): super(TestInterpScalar, self).__init__() @@ -9040,7 +9038,7 @@ def __init__(self, x=None, left=None, right=None, period=None): self._left = left self._right = right self._period = period - + def hybrid_forward(self, F, xp, fp): return F.np.interp(self._x, xp, fp, left=self._left, right=self._right, period=self._period) @@ -9067,13 +9065,13 @@ def hybrid_forward(self, F, xp, fp): else: x = np.random.uniform(0, 100, size=xshape).astype(xtype) xp = np.sort(np.random.choice(100, dsize, replace=False).astype(dtype)) - fp = np.random.uniform(-50, 50, size=dsize).astype(dtype) + fp = np.random.uniform(-50, 50, size=dsize).astype(dtype) np_x = x.asnumpy() if x_scalar and xshape == (): x = x.item() np_x = x test_interp = TestInterpScalar(x=x, left=left, right=right, period=period) - else: + else: test_interp = TestInterp(left=left, right=right, period=period) if hybridize: test_interp.hybridize() @@ -9461,7 +9459,7 @@ def __init__(self, axis=0, start=0): super(TestRollaxis, self).__init__() self._axis = axis self._start = start - + def hybrid_forward(self, F, a, *args, **kwargs): return F.np.rollaxis(a, axis=self._axis, start=self._start) diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index 4ef3ff8a213a..c8c958165e0e 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -32,7 +32,6 @@ from common import setup_module, with_seed, teardown_module, assert_raises_cudnn_not_satisfied, assert_raises_cuda_not_satisfied, assertRaises from common import run_in_spawned_process, xfail_when_nonstandard_decimal_separator import pytest -import unittest import os def check_rnn_consistency(cell1, cell2, T, N, I, H, grad_req, rtol=1e-2, atol=1e-4): @@ -2084,7 +2083,7 @@ def test_convolution_grouping(): np.testing.assert_allclose(arr1.asnumpy(), arr2.asnumpy(), rtol=1e-3, atol=1e-3) -@unittest.skip("Flaky test https://github.com/apache/incubator-mxnet/issues/14052") +@pytest.mark.skip(reason="Flaky test https://github.com/apache/incubator-mxnet/issues/14052") @with_seed() def test_depthwise_convolution(): for dim in [1,2]: @@ -4058,7 +4057,7 @@ def check_sequence_func(ftype, mask_value=0, axis=0): @with_seed() -@unittest.skip("Flaky test: https://github.com/apache/incubator-mxnet/issues/11395") +@pytest.mark.skip(reason="Flaky test: https://github.com/apache/incubator-mxnet/issues/11395") 
def test_sequence_last(): check_sequence_func("last", axis=0) check_sequence_func("last", axis=1) @@ -4581,7 +4580,40 @@ def test_blockgrad(): @with_seed() -def test_take(): +def test_take_autograd_req(): + row_len = 2 + col_len = 8 + shape = (row_len, col_len) + sc = mx.nd.random.uniform(-1.0, 1.0, shape=shape, dtype="float32") + sc.attach_grad() + i = mx.nd.array([0], dtype="int64") + j = mx.nd.array([0], dtype="int64") + with mx.autograd.record(train_mode=True): + xs = [] + for _ in range(row_len): + x_i = [] + for _ in range(col_len): + x_ij = sc.take(i).squeeze(axis=0).take(j).squeeze(axis=0) + x_i.append(x_ij) + j = j + 1 + i = i + 1 + j = j - col_len # reset j + xs.append(mx.nd.stack(*x_i)) + x = mx.nd.stack(*xs) + x = x.sum() + + x.backward() + assert_almost_equal(np.ones(sc.grad.shape), sc.grad) + +@with_seed() +@pytest.mark.parametrize('mode,out_of_range', [ + ('clip', True), + ('wrap', True), + ('raise', False) +]) +@pytest.mark.parametrize('data_ndim', range(1, 5)) +@pytest.mark.parametrize('idx_ndim', range(1, 4)) +def test_take(mode, out_of_range, data_ndim, idx_ndim): def grad_helper(grad_in, axis, idx): if axis == 0: if axis == len(grad_in.shape) - 1: @@ -4608,89 +4640,55 @@ def grad_helper(grad_in, axis, idx): else: raise ValueError("axis %d is not supported..." % axis) - def check_output_n_grad(data_shape, idx_shape, axis, mode, out_of_range=True): - data = mx.sym.Variable('a') - idx = mx.sym.Variable('indices') - idx = mx.sym.BlockGrad(idx) - result = mx.sym.take(a=data, indices=idx, axis=axis, mode=mode) - exe = result.simple_bind(default_context(), a=data_shape, - indices=idx_shape, axis=axis, mode=mode) - data_real = np.random.normal(size=data_shape).astype('float32') - if out_of_range: - idx_real = np.random.randint(low=-data_shape[axis], high=data_shape[axis], size=idx_shape) - if mode == 'raise': - idx_real[idx_real == 0] = 1 - idx_real *= data_shape[axis] + for axis in range(-data_ndim, data_ndim): + data_shape = () + for _ in range(data_ndim): + data_shape += (np.random.randint(low=1, high=5), ) + idx_shape = () + for _ in range(idx_ndim): + idx_shape += (np.random.randint(low=1, high=5), ) + + data = mx.sym.Variable('a') + idx = mx.sym.Variable('indices') + idx = mx.sym.BlockGrad(idx) + result = mx.sym.take(a=data, indices=idx, axis=axis, mode=mode) + exe = result.simple_bind(default_context(), a=data_shape, + indices=idx_shape, axis=axis, mode=mode) + data_real = np.random.normal(size=data_shape).astype('float32') + if out_of_range: + idx_real = np.random.randint(low=-data_shape[axis], high=data_shape[axis], size=idx_shape) + if mode == 'raise': + idx_real[idx_real == 0] = 1 + idx_real *= data_shape[axis] + else: + idx_real = np.random.randint(low=0, high=data_shape[axis], size=idx_shape) + if axis < 0: + axis += len(data_shape) + + grad_out = np.ones((data_shape[0:axis] if axis > 0 else ()) + idx_shape + (data_shape[axis+1:] if axis < len(data_shape) - 1 else ()), dtype='float32') + grad_in = np.zeros(data_shape, dtype='float32') + + exe.arg_dict['a'][:] = mx.nd.array(data_real) + exe.arg_dict['indices'][:] = mx.nd.array(idx_real) + exe.forward(is_train=True) + if out_of_range and mode == 'raise': + try: + mx_out = exe.outputs[0].asnumpy() + except MXNetError as e: + return else: - idx_real = np.random.randint(low=0, high=data_shape[axis], size=idx_shape) - if axis < 0: - axis += len(data_shape) + # Did not raise exception + assert False, "did not raise %s" % MXNetError.__name__ - grad_out = np.ones((data_shape[0:axis] if axis > 0 else ()) + idx_shape + 
(data_shape[axis+1:] if axis < len(data_shape) - 1 else ()), dtype='float32') - grad_in = np.zeros(data_shape, dtype='float32') + assert_almost_equal(exe.outputs[0], np.take(data_real, idx_real, axis=axis, mode=mode)) - exe.arg_dict['a'][:] = mx.nd.array(data_real) - exe.arg_dict['indices'][:] = mx.nd.array(idx_real) - exe.forward(is_train=True) - if out_of_range and mode == 'raise': - try: - mx_out = exe.outputs[0].asnumpy() - except MXNetError as e: - return - else: - # Did not raise exception - assert False, "did not raise %s" % MXNetError.__name__ - - assert_almost_equal(exe.outputs[0], np.take(data_real, idx_real, axis=axis, mode=mode)) - - for i in np.nditer(idx_real): - if mode == 'clip': - i = np.clip(i, 0, data_shape[axis]) - grad_helper(grad_in, axis, i) - - exe.backward([mx.nd.array(grad_out)]) - assert_almost_equal(exe.grad_dict['a'], grad_in) - - def check_autograd_req(): - row_len = 2 - col_len = 8 - shape = (row_len, col_len) - sc = mx.nd.random.uniform(-1.0, 1.0, shape=shape, dtype="float32") - sc.attach_grad() - i = mx.nd.array([0], dtype="int64") - j = mx.nd.array([0], dtype="int64") - with mx.autograd.record(train_mode=True): - xs = [] - for _ in range(row_len): - x_i = [] - for _ in range(col_len): - x_ij = sc.take(i).squeeze(axis=0).take(j).squeeze(axis=0) - x_i.append(x_ij) - j = j + 1 - i = i + 1 - j = j - col_len # reset j - xs.append(mx.nd.stack(*x_i)) - x = mx.nd.stack(*xs) - x = x.sum() - - x.backward() - assert_almost_equal(np.ones(sc.grad.shape), sc.grad) - - for mode in ['clip', 'wrap', 'raise']: - for data_ndim in range(1, 5): - for idx_ndim in range(1, 4): - for axis in range(-data_ndim, data_ndim): - data_shape = () - for _ in range(data_ndim): - data_shape += (np.random.randint(low=1, high=5), ) - idx_shape = () - for _ in range(idx_ndim): - idx_shape += (np.random.randint(low=1, high=5), ) - if mode == 'raise': - check_output_n_grad(data_shape, idx_shape, axis, 'raise', False) - check_output_n_grad(data_shape, idx_shape, axis, mode) - - check_autograd_req() + for i in np.nditer(idx_real): + if mode == 'clip': + i = np.clip(i, 0, data_shape[axis]) + grad_helper(grad_in, axis, i) + + exe.backward([mx.nd.array(grad_out)]) + assert_almost_equal(exe.grad_dict['a'], grad_in) @with_seed() @@ -6017,7 +6015,7 @@ def create_operator(self, ctx, shapes, dtypes): x = mx.nd.Custom(length=10, depth=10, op_type="no_input_op") assert_almost_equal(x, np.ones(shape=(10, 10), dtype=np.float32)) -@unittest.skip("Flaky test, tracked at https://github.com/apache/incubator-mxnet/issues/17467") +@pytest.mark.skip(reason="Flaky test, tracked at https://github.com/apache/incubator-mxnet/issues/17467") @with_seed() def test_custom_op_fork(): # test custom operator fork @@ -6282,7 +6280,7 @@ def _validate_sample_location(input_rois, input_offset, spatial_scale, pooled_w, return output_offset -@unittest.skip("Flaky test, tracked at https://github.com/apache/incubator-mxnet/issues/11713") +@pytest.mark.skip(reason="Flaky test, tracked at https://github.com/apache/incubator-mxnet/issues/11713") @with_seed() def test_deformable_psroipooling(): sample_per_part = 4 @@ -6925,7 +6923,7 @@ def test_laop_5(): # Tests for linalg.inverse @with_seed() -@unittest.skip("Test crashes https://github.com/apache/incubator-mxnet/issues/15975") +@pytest.mark.skip(reason="Test crashes https://github.com/apache/incubator-mxnet/issues/15975") def test_laop_6(): dtype = np.float64 rtol_fw = 1e-7 @@ -7162,7 +7160,7 @@ def check_passthrough(ratio, shape, cudnn_off=True): # check_dropout_axes(0.25, nshape, axes 
= (1, 2, 3), cudnn_off=False) -@unittest.skip("test fails intermittently. temporarily disabled till it gets fixed. tracked at https://github.com/apache/incubator-mxnet/issues/11290") +@pytest.mark.skip(reason="test fails intermittently. temporarily disabled till it gets fixed. tracked at https://github.com/apache/incubator-mxnet/issues/11290") @with_seed() def test_scatter_gather_nd(): def check(data, idx): @@ -8359,7 +8357,7 @@ def get_output_names_callback(name, arr): del os.environ['MXNET_SUBGRAPH_BACKEND'] @with_seed() -@unittest.skip("test fails intermittently. temporarily disabled till it gets fixed. tracked at https://github.com/apache/incubator-mxnet/issues/13915") +@pytest.mark.skip(reason="test fails intermittently. temporarily disabled till it gets fixed. tracked at https://github.com/apache/incubator-mxnet/issues/13915") def test_activation(): shapes = [(9,), (9, 10), (9, 10, 10), (1, 9, 10, 10)] dtype_l = [np.float64, np.float32, np.float16] diff --git a/tests/python/unittest/test_viz.py b/tests/python/unittest/test_viz.py index 5c9b78a017d2..74f810ec6798 100644 --- a/tests/python/unittest/test_viz.py +++ b/tests/python/unittest/test_viz.py @@ -15,11 +15,11 @@ # specific language governing permissions and limitations # under the License. -import unittest import warnings import mxnet as mx import numpy as np +import pytest def test_print_summary(): @@ -46,7 +46,7 @@ def graphviz_exists(): else: return True -@unittest.skipIf(not graphviz_exists(), "Skip test_plot_network as Graphviz could not be imported") +@pytest.mark.skipif(not graphviz_exists(), reason="Skip test_plot_network as Graphviz could not be imported") def test_plot_network(): # Test warnings for cyclic graph net = mx.sym.Variable('data')
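
The hunks above repeatedly apply the same pytest idioms: pytest.mark.skip/skipif with an explicit reason, skipif keyed on the MXNET_ENGINE_TYPE environment variable, pytest.mark.parametrize in place of hand-written loops inside a test body, and the tmpdir fixture in place of checkpoint files written to the working directory. The minimal sketch below (not part of the patch; the test names and toy bodies are placeholders, not MXNet code) shows those idioms in isolation and runs with pytest alone.

import os
import json
import pytest


@pytest.mark.skipif(os.environ.get('MXNET_ENGINE_TYPE') == 'NaiveEngine',
                    reason="This test assumes asynchronous execution.")
@pytest.mark.parametrize('chn_num', [16, 256])
@pytest.mark.parametrize('kernel', [1, 3, 224])
def test_parametrized_config(chn_num, kernel):
    # Each (chn_num, kernel) combination is collected as its own test case,
    # so a failure reports the exact configuration instead of aborting a loop.
    assert chn_num > 0 and kernel > 0


@pytest.mark.skip(reason="placeholder showing the skip-with-reason style")
def test_known_flaky():
    assert False  # never runs; the reason string appears in the test report


def test_checkpoint_roundtrip(tmpdir):
    # tmpdir gives each test its own temporary directory, so parallel workers
    # (pytest -n 4) no longer race on a shared 'test-0000.params'-style file.
    path = str(tmpdir.join('checkpoint.json'))
    with open(path, 'w') as f:
        json.dump({'learning_rate': 0.1, 'momentum': 0.9}, f)
    with open(path) as f:
        state = json.load(f)
    assert state == {'learning_rate': 0.1, 'momentum': 0.9}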