Factorize CUDA_KERNEL_LOOP used in CUDA kernels #16197

Merged: 2 commits merged on Sep 27, 2019
Changes from 1 commit
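
Each file touched below previously carried its own copy of the grid-stride loop macro; this change removes the per-file copies so the kernels rely on the single definition shared through mxnet_op.h (the roi_align.cu and correlation.cu diffs add that include where it was missing). For reference, every removed copy expands to the same grid-stride loop, and the shared definition is presumably identical, though mxnet_op.h itself is not part of this diff:

// Grid-stride loop: each thread starts at its global index and advances by the
// total number of launched threads, so a fixed-size grid covers all n elements.
#define CUDA_KERNEL_LOOP(i, n) \
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
       i < (n); \
       i += blockDim.x * gridDim.x)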
4 changes: 0 additions & 4 deletions src/operator/contrib/count_sketch.cu
@@ -33,10 +33,6 @@
#define WARPS_PER_BLOCK 1
#define THREADS_PER_BLOCK 512

#define CUDA_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
i < (n); \
i += blockDim.x * gridDim.x)
namespace mshadow {
namespace cuda {
// wrappers to deal with atomic add
4 changes: 0 additions & 4 deletions src/operator/contrib/deformable_psroi_pooling.cu
@@ -38,10 +38,6 @@
cudaError_t error = condition; \
CHECK_EQ(error, cudaSuccess) << " " << cudaGetErrorString(error); \
} while (0)
#define CUDA_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
i < (n); \
i += blockDim.x * gridDim.x)

namespace mshadow {
namespace cuda {
4 changes: 0 additions & 4 deletions src/operator/contrib/psroi_pooling.cu
@@ -39,10 +39,6 @@
cudaError_t error = condition; \
CHECK_EQ(error, cudaSuccess) << " " << cudaGetErrorString(error); \
} while (0)
#define CUDA_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
i < (n); \
i += blockDim.x * gridDim.x)

namespace mshadow {
namespace cuda {
11 changes: 4 additions & 7 deletions src/operator/contrib/roi_align.cu
@@ -24,15 +24,12 @@
* Adapted from Caffe2
*/
#include "./roi_align-inl.h"
#include "../mxnet_op.h"


namespace mxnet {
namespace op {

#define CUDA_1D_KERNEL_LOOP(i, n) \
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)

using namespace mshadow::cuda;

// The maximum number of blocks to use in the default kernel call.
@@ -120,7 +117,7 @@ __global__ void RoIAlignForwardKernel(
const int sampling_ratio,
const T* bottom_rois,
T* top_data) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
CUDA_KERNEL_LOOP(index, nthreads) {
// (n, c, ph, pw) is an element in the pooled output
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
@@ -259,7 +256,7 @@ __global__ void RoIAlignBackwardKernel(
const int sampling_ratio,
T* bottom_diff,
const T* bottom_rois) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
CUDA_KERNEL_LOOP(index, nthreads) {
// (n, c, ph, pw) is an element in the pooled output
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
@@ -353,7 +350,7 @@ __global__ void RoIAlignBackwardKernel(
} // if
} // ix
} // iy
} // CUDA_1D_KERNEL_LOOP
} // CUDA_KERNEL_LOOP
} // RoIAlignBackward

template<typename xpu>
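
For context on how kernels consume the macro after this change, here is a minimal, hypothetical kernel written in the same style as the RoIAlign kernels above. ScaleKernel is purely illustrative and not part of this PR; the include path assumes a file under src/operator/contrib/, mirroring the roi_align.cu diff:

#include "../mxnet_op.h"  // provides CUDA_KERNEL_LOOP after this PR (path assumed)

// Hypothetical kernel: scales n floats in place. Because the loop strides by
// blockDim.x * gridDim.x, the launch may cap the number of blocks and the
// kernel still visits every one of the n elements.
__global__ void ScaleKernel(const int n, const float alpha, float* data) {
  CUDA_KERNEL_LOOP(index, n) {
    data[index] *= alpha;
  }
}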
6 changes: 2 additions & 4 deletions src/operator/correlation.cu
@@ -28,6 +28,7 @@
#include <mshadow/cuda/reduce.cuh>
#include <algorithm>
#include <vector>
#include "./mxnet_op.h"

#define ROUND_OFF 50000
#define WARPS_PER_BLOCK 1
@@ -38,10 +39,7 @@
cudaError_t error = condition; \
CHECK_EQ(error, cudaSuccess) << " " << cudaGetErrorString(error); \
} while (0)
#define CUDA_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
i < (n); \
i += blockDim.x * gridDim.x)

namespace mshadow {
namespace cuda {
// == Correlation Kernel
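
The launch side is where the grid-stride loop pays off: the block count can be capped to a fixed budget and the loop inside the kernel makes up the difference. Below is a minimal, hypothetical host-side launch using only the plain CUDA runtime API (MXNet's own launch helpers and block-count cap live elsewhere and may differ); it assumes a kernel like the ScaleKernel sketch above is in scope:

#include <algorithm>
#include <cuda_runtime.h>

// Hypothetical launch wrapper for a CUDA_KERNEL_LOOP-style kernel.
void LaunchScale(const int n, const float alpha, float* d_data, cudaStream_t stream) {
  const int threads = 512;       // same value as THREADS_PER_BLOCK in count_sketch.cu above
  const int max_blocks = 4096;   // illustrative cap, not MXNet's actual limit
  const int blocks = std::min((n + threads - 1) / threads, max_blocks);
  ScaleKernel<<<blocks, threads, 0, stream>>>(n, alpha, d_data);
}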