Removing the restriction of MAX_BINS for CUDA and OpenCL

arrayfire · Aug 2, 2017 · a6cbbb8 · a6cbbb8
1 parent ee9613d
commit a6cbbb8
Show file tree

Hide file tree

Showing 6 changed files with 73 additions and 21 deletions.
diff --git a/src/backend/cuda/histogram.cu b/src/backend/cuda/histogram.cu
@@ -24,8 +24,6 @@ template<typename inType, typename outType, bool isLinear>
 Array<outType> histogram(const Array<inType> &in, const unsigned &nbins, const double &minval, const double &maxval)
 {
 
-    ARG_ASSERT(1, (nbins<=kernel::MAX_BINS));
-
     const dim4 dims     = in.dims();
     dim4 outDims        = dim4(nbins, 1, dims[2], dims[3]);
     Array<outType> out  = createValueArray<outType>(outDims, outType(0));

diff --git a/src/backend/cuda/kernel/histogram.hpp b/src/backend/cuda/kernel/histogram.hpp
@@ -44,21 +44,35 @@ void histogramKernel(Param<outType> out, CParam<inType> in,
     int end    = minimum((start + THRD_LOAD * blockDim.x), len);
     float step = (maxval-minval) / (float)nbins;
 
-    for (int i = threadIdx.x; i < nbins; i += blockDim.x)
-        shrdMem[i] = 0;
-    __syncthreads();
+    // If nbins > max shared memory allocated, then just use atomicAdd on global memory
+    bool use_global = nbins > MAX_BINS;
+
+    // Skip initializing shared memory
+    if (!use_global) {
+        for (int i = threadIdx.x; i < nbins; i += blockDim.x)
+            shrdMem[i] = 0;
+        __syncthreads();
+    }
 
     for (int row = start; row < end; row += blockDim.x) {
         int idx = isLinear ? row : ((row % in.dims[0]) + (row / in.dims[0])*in.strides[1]);
         int bin = (int)((iptr[idx] - minval) / step);
         bin     = (bin < 0)      ? 0         : bin;
         bin     = (bin >= nbins) ? (nbins-1) : bin;
-        atomicAdd((shrdMem + bin), 1);
+
+        if (use_global) {
+            atomicAdd((optr + bin), 1);
+        } else {
+            atomicAdd((shrdMem + bin), 1);
+        }
     }
-    __syncthreads();
 
-    for (int i = threadIdx.x; i < nbins; i += blockDim.x) {
-        atomicAdd((optr + i), shrdMem[i]);
+    // No need to write to global if use_global is true
+    if (!use_global) {
+        __syncthreads();
+        for (int i = threadIdx.x; i < nbins; i += blockDim.x) {
+            atomicAdd((optr + i), shrdMem[i]);
+        }
     }
 }
 
@@ -72,7 +86,8 @@ void histogram(Param<outType> out, CParam<inType> in, int nbins, float minval, f
 
     dim3 blocks(blk_x * in.dims[2], in.dims[3]);
 
-    int smem_size = nbins * sizeof(outType);
+    // If nbins > MAX_BINS, we are using global memory so smem_size can be 0;
+    int smem_size = nbins <= MAX_BINS ? (nbins * sizeof(outType)) : 0;
 
     CUDA_LAUNCH_SMEM((histogramKernel<inType, outType, isLinear>), blocks, threads, smem_size,
             out, in, nElems, nbins, minval, maxval, blk_x);

diff --git a/src/backend/opencl/histogram.cpp b/src/backend/opencl/histogram.cpp
@@ -23,8 +23,6 @@ namespace opencl
 template<typename inType, typename outType, bool isLinear>
 Array<outType> histogram(const Array<inType> &in, const unsigned &nbins, const double &minval, const double &maxval)
 {
-    ARG_ASSERT(1, (nbins<=kernel::MAX_BINS));
-
     const dim4 dims     = in.dims();
     dim4 outDims        = dim4(nbins, 1, dims[2], dims[3]);
     Array<outType> out = createValueArray<outType>(outDims, outType(0));

diff --git a/src/backend/opencl/kernel/histogram.cl b/src/backend/opencl/kernel/histogram.cl
@@ -25,9 +25,13 @@ void histogram(__global outType *         d_dst,
 
     float dx = (maxval-minval)/(float)nbins;
 
-    for (int i = get_local_id(0); i < nbins; i += get_local_size(0))
-        localMem[i] = 0;
-    barrier(CLK_LOCAL_MEM_FENCE);
+    bool use_global = nbins > MAX_BINS;
+
+    if (!use_global) {
+        for (int i = get_local_id(0); i < nbins; i += get_local_size(0))
+            localMem[i] = 0;
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
 
     for (int row = start; row < end; row += get_local_size(0)) {
 #if defined(IS_LINEAR)
@@ -40,11 +44,19 @@ void histogram(__global outType *         d_dst,
         int bin = (int)(((float)in[idx] - minval) / dx);
         bin     = max(bin, 0);
         bin     = min(bin, (int)nbins-1);
-        atomic_inc((localMem + bin));
+
+        if (use_global) {
+            atomic_inc((out + bin));
+        } else {
+            atomic_inc((localMem + bin));
+        }
+
     }
-    barrier(CLK_LOCAL_MEM_FENCE);
 
-    for (int i = get_local_id(0); i < nbins; i += get_local_size(0)) {
-        atomic_add((out + i), localMem[i]);
+    if (!use_global) {
+        barrier(CLK_LOCAL_MEM_FENCE);
+        for (int i = get_local_id(0); i < nbins; i += get_local_size(0)) {
+            atomic_add((out + i), localMem[i]);
+        }
     }
 }
diff --git a/src/backend/opencl/kernel/histogram.hpp b/src/backend/opencl/kernel/histogram.hpp
@@ -43,7 +43,8 @@ void histogram(Param out, const Param in, int nbins, float minval, float maxval)
         std::ostringstream options;
         options << " -D inType=" << dtype_traits<inType>::getName()
             << " -D outType=" << dtype_traits<outType>::getName()
-            << " -D THRD_LOAD=" << THRD_LOAD;
+            << " -D THRD_LOAD=" << THRD_LOAD
+            << " -D MAX_BINS=" << MAX_BINS;
         if (isLinear)
             options << " -D IS_LINEAR";
         if (std::is_same<inType, double>::value ||
@@ -66,7 +67,7 @@ void histogram(Param out, const Param in, int nbins, float minval, float maxval)
 
     int nElems = in.info.dims[0]*in.info.dims[1];
     int blk_x  = divup(nElems, THRD_LOAD*THREADS_X);
-    int locSize = nbins * sizeof(outType);
+    int locSize = nbins <= MAX_BINS ? (nbins * sizeof(outType)) : 1;
 
     NDRange local(THREADS_X, 1);
     NDRange global(blk_x*in.info.dims[2]*THREADS_X, in.info.dims[3]);

diff --git a/test/histogram.cpp b/test/histogram.cpp
@@ -271,3 +271,31 @@ TEST(histogram, IndexedArray)
     ASSERT_EQ(true, out[2] ==  8);
     ASSERT_EQ(true, out[3] ==  8);
 }
+
+TEST(histogram, LargeBins)
+{
+    const int max_val = 20000;
+    const int min_val = 0;
+    const int nbins = max_val / 2;
+    const int num = 1 << 20;
+    af::array A = af::round(max_val * af::randu(num) + min_val).as(u32);
+    af::eval(A);
+    af::array H = histogram(A, nbins, min_val, max_val);
+
+    std::vector<unsigned> hA(num);
+    A.host(hA.data());
+
+    std::vector<unsigned> hH(nbins);
+    H.host(hH.data());
+
+    int dx = (max_val - min_val) / nbins;
+    for (int i = 0; i < num; i++) {
+        int bin = (hA[i] - min_val) / dx;
+        bin = std::min(bin, nbins - 1);
+        hH[bin] -= 1;
+    }
+
+    for (int i = 0; i < nbins; i++) {
+        ASSERT_EQ(hH[i], 0u);
+    }
+}