Skip to content

Commit

Permalink
Removing the restriction of MAX_BINS for CUDA and OpenCL
Browse files Browse the repository at this point in the history
  • Loading branch information
pavanky authored and umar456 committed Aug 2, 2017
1 parent ee9613d commit a6cbbb8
Show file tree
Hide file tree
Showing 6 changed files with 73 additions and 21 deletions.
2 changes: 0 additions & 2 deletions src/backend/cuda/histogram.cu
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,6 @@ template<typename inType, typename outType, bool isLinear>
Array<outType> histogram(const Array<inType> &in, const unsigned &nbins, const double &minval, const double &maxval)
{

ARG_ASSERT(1, (nbins<=kernel::MAX_BINS));

const dim4 dims = in.dims();
dim4 outDims = dim4(nbins, 1, dims[2], dims[3]);
Array<outType> out = createValueArray<outType>(outDims, outType(0));
Expand Down
31 changes: 23 additions & 8 deletions src/backend/cuda/kernel/histogram.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,21 +44,35 @@ void histogramKernel(Param<outType> out, CParam<inType> in,
int end = minimum((start + THRD_LOAD * blockDim.x), len);
float step = (maxval-minval) / (float)nbins;

for (int i = threadIdx.x; i < nbins; i += blockDim.x)
shrdMem[i] = 0;
__syncthreads();
// If nbins > max shared memory allocated, then just use atomicAdd on global memory
bool use_global = nbins > MAX_BINS;

// Skip initializing shared memory
if (!use_global) {
for (int i = threadIdx.x; i < nbins; i += blockDim.x)
shrdMem[i] = 0;
__syncthreads();
}

for (int row = start; row < end; row += blockDim.x) {
int idx = isLinear ? row : ((row % in.dims[0]) + (row / in.dims[0])*in.strides[1]);
int bin = (int)((iptr[idx] - minval) / step);
bin = (bin < 0) ? 0 : bin;
bin = (bin >= nbins) ? (nbins-1) : bin;
atomicAdd((shrdMem + bin), 1);

if (use_global) {
atomicAdd((optr + bin), 1);
} else {
atomicAdd((shrdMem + bin), 1);
}
}
__syncthreads();

for (int i = threadIdx.x; i < nbins; i += blockDim.x) {
atomicAdd((optr + i), shrdMem[i]);
// No need to write to global if use_global is true
if (!use_global) {
__syncthreads();
for (int i = threadIdx.x; i < nbins; i += blockDim.x) {
atomicAdd((optr + i), shrdMem[i]);
}
}
}

Expand All @@ -72,7 +86,8 @@ void histogram(Param<outType> out, CParam<inType> in, int nbins, float minval, f

dim3 blocks(blk_x * in.dims[2], in.dims[3]);

int smem_size = nbins * sizeof(outType);
// If nbins > MAX_BINS, we are using global memory so smem_size can be 0;
int smem_size = nbins <= MAX_BINS ? (nbins * sizeof(outType)) : 0;

CUDA_LAUNCH_SMEM((histogramKernel<inType, outType, isLinear>), blocks, threads, smem_size,
out, in, nElems, nbins, minval, maxval, blk_x);
Expand Down
2 changes: 0 additions & 2 deletions src/backend/opencl/histogram.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,6 @@ namespace opencl
template<typename inType, typename outType, bool isLinear>
Array<outType> histogram(const Array<inType> &in, const unsigned &nbins, const double &minval, const double &maxval)
{
ARG_ASSERT(1, (nbins<=kernel::MAX_BINS));

const dim4 dims = in.dims();
dim4 outDims = dim4(nbins, 1, dims[2], dims[3]);
Array<outType> out = createValueArray<outType>(outDims, outType(0));
Expand Down
26 changes: 19 additions & 7 deletions src/backend/opencl/kernel/histogram.cl
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,13 @@ void histogram(__global outType * d_dst,

float dx = (maxval-minval)/(float)nbins;

for (int i = get_local_id(0); i < nbins; i += get_local_size(0))
localMem[i] = 0;
barrier(CLK_LOCAL_MEM_FENCE);
bool use_global = nbins > MAX_BINS;

if (!use_global) {
for (int i = get_local_id(0); i < nbins; i += get_local_size(0))
localMem[i] = 0;
barrier(CLK_LOCAL_MEM_FENCE);
}

for (int row = start; row < end; row += get_local_size(0)) {
#if defined(IS_LINEAR)
Expand All @@ -40,11 +44,19 @@ void histogram(__global outType * d_dst,
int bin = (int)(((float)in[idx] - minval) / dx);
bin = max(bin, 0);
bin = min(bin, (int)nbins-1);
atomic_inc((localMem + bin));

if (use_global) {
atomic_inc((out + bin));
} else {
atomic_inc((localMem + bin));
}

}
barrier(CLK_LOCAL_MEM_FENCE);

for (int i = get_local_id(0); i < nbins; i += get_local_size(0)) {
atomic_add((out + i), localMem[i]);
if (!use_global) {
barrier(CLK_LOCAL_MEM_FENCE);
for (int i = get_local_id(0); i < nbins; i += get_local_size(0)) {
atomic_add((out + i), localMem[i]);
}
}
}
5 changes: 3 additions & 2 deletions src/backend/opencl/kernel/histogram.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,8 @@ void histogram(Param out, const Param in, int nbins, float minval, float maxval)
std::ostringstream options;
options << " -D inType=" << dtype_traits<inType>::getName()
<< " -D outType=" << dtype_traits<outType>::getName()
<< " -D THRD_LOAD=" << THRD_LOAD;
<< " -D THRD_LOAD=" << THRD_LOAD
<< " -D MAX_BINS=" << MAX_BINS;
if (isLinear)
options << " -D IS_LINEAR";
if (std::is_same<inType, double>::value ||
Expand All @@ -66,7 +67,7 @@ void histogram(Param out, const Param in, int nbins, float minval, float maxval)

int nElems = in.info.dims[0]*in.info.dims[1];
int blk_x = divup(nElems, THRD_LOAD*THREADS_X);
int locSize = nbins * sizeof(outType);
int locSize = nbins <= MAX_BINS ? (nbins * sizeof(outType)) : 1;

NDRange local(THREADS_X, 1);
NDRange global(blk_x*in.info.dims[2]*THREADS_X, in.info.dims[3]);
Expand Down
28 changes: 28 additions & 0 deletions test/histogram.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -271,3 +271,31 @@ TEST(histogram, IndexedArray)
ASSERT_EQ(true, out[2] == 8);
ASSERT_EQ(true, out[3] == 8);
}

TEST(histogram, LargeBins)
{
const int max_val = 20000;
const int min_val = 0;
const int nbins = max_val / 2;
const int num = 1 << 20;
af::array A = af::round(max_val * af::randu(num) + min_val).as(u32);
af::eval(A);
af::array H = histogram(A, nbins, min_val, max_val);

std::vector<unsigned> hA(num);
A.host(hA.data());

std::vector<unsigned> hH(nbins);
H.host(hH.data());

int dx = (max_val - min_val) / nbins;
for (int i = 0; i < num; i++) {
int bin = (hA[i] - min_val) / dx;
bin = std::min(bin, nbins - 1);
hH[bin] -= 1;
}

for (int i = 0; i < nbins; i++) {
ASSERT_EQ(hH[i], 0u);
}
}

0 comments on commit a6cbbb8

Please sign in to comment.