Re-enable texture object access in regions cuda kernel

Remove pre-3.0-compute checks as we don't support 2.0 compute capability anymore
arrayfire · Aug 26, 2017 · fcdedb0 · fcdedb0
1 parent cf521ad
commit fcdedb0
Show file tree

Hide file tree

Showing 2 changed files with 37 additions and 50 deletions.
diff --git a/src/backend/cuda/kernel/regions.hpp b/src/backend/cuda/kernel/regions.hpp
@@ -23,8 +23,6 @@
 #include <thrust/sort.h>
 #include <thrust/transform_scan.h>
 
-#if __CUDACC__
-
 static const int THREADS_X = 16;
 static const int THREADS_Y = 16;
 
@@ -35,19 +33,18 @@ __device__ static int continue_flag = 1;
 
 // Wrapper function for texture fetch
 template<typename T>
-__device__ __inline__
-static T fetch(const int n,
-               cuda::Param<T> equiv_map,
-               cudaTextureObject_t tex)
+static inline __device__
+T fetch(const int n, cuda::Param<T> equiv_map, cudaTextureObject_t tex)
 {
-// FIXME: Enable capability >= 3.0
-//#if (__CUDA_ARCH__ >= 300)
-#if 0
-    // Kepler bindless texture objects
     return tex1Dfetch<T>(tex, n);
-#else
+}
+
+template<> __device__
+STATIC_ double fetch<double>(const int n,
+                             cuda::Param<double> equiv_map,
+                             cudaTextureObject_t tex)
+{
     return equiv_map.ptr[n];
-#endif
 }
 
 // The initial label kernel distinguishes between valid (nonzero)
@@ -128,7 +125,7 @@ static void update_equiv(cuda::Param<T> equiv_map, const cudaTextureObject_t tex
 {
 
     typedef warp_count<block_dim*block_dim> num_warps;
-#if (__CUDA_ARCH__ >= 120) // This function uses warp ballot instructions
+
     // Basic coordinates
     const int base_x = (blockIdx.x * blockDim.x * n_per_thread) + threadIdx.x;
     const int base_y = (blockIdx.y * blockDim.y * n_per_thread) + threadIdx.y;
@@ -160,13 +157,9 @@ static void update_equiv(cuda::Param<T> equiv_map, const cudaTextureObject_t tex
     s_changed[warpIdx] = (T)0;
     __syncthreads();
 
-#if (__CUDA_ARCH__ >= 130)
     #pragma unroll
-#endif
     for (int xb = 0; xb < n_per_thread; ++xb) {
-#if (__CUDA_ARCH__ >= 130)
         #pragma unroll
-#endif
         for (int yb = 0; yb < n_per_thread; ++yb) {
 
             // Indexing variables
@@ -251,9 +244,7 @@ static void update_equiv(cuda::Param<T> equiv_map, const cudaTextureObject_t tex
     s_changed[warpIdx] = __any((int)tid_changed);
     __syncthreads();
 
-#if (__CUDA_ARCH__ >= 130)
     #pragma unroll
-#endif
     for (int i = 0; i < num_warps::value; i++)
         continue_iter = continue_iter || (s_changed[i] != 0);
 
@@ -263,13 +254,9 @@ static void update_equiv(cuda::Param<T> equiv_map, const cudaTextureObject_t tex
         // Reset whether or not this thread's pixels have changed.
         tid_changed = false;
 
-#if (__CUDA_ARCH__ >= 130)
         #pragma unroll
-#endif
         for (int xb = 0; xb < n_per_thread; ++xb) {
-#if (__CUDA_ARCH__ >= 130)
             #pragma unroll
-#endif
             for (int yb = 0; yb < n_per_thread; ++yb) {
 
                 // Indexing
@@ -335,22 +322,16 @@ static void update_equiv(cuda::Param<T> equiv_map, const cudaTextureObject_t tex
         s_changed[warpIdx] = __any((int)tid_changed);
         __syncthreads();
         continue_iter = false;
-#if (__CUDA_ARCH__ >= 130)
         #pragma unroll
-#endif
         for (int i = 0; i < num_warps::value; i++)
             continue_iter = continue_iter | (s_changed[i] != 0);
 
         // If we have to continue iterating, update the tile of the
         // equiv map in shared memory
         if (continue_iter) {
-#if (__CUDA_ARCH__ >= 130)
             #pragma unroll
-#endif
             for (int xb = 0; xb < n_per_thread; ++xb) {
-#if (__CUDA_ARCH__ >= 130)
                 #pragma unroll
-#endif
                 for (int yb = 0; yb < n_per_thread; ++yb) {
                     const int tx = threadIdx.x + (xb * blockDim.x);
                     const int ty = threadIdx.y + (yb * blockDim.y);
@@ -364,13 +345,9 @@ static void update_equiv(cuda::Param<T> equiv_map, const cudaTextureObject_t tex
     } // while (continue_iter)
 
     // Write out equiv_map
-#if (__CUDA_ARCH__ >= 130)
     #pragma unroll
-#endif
     for (int xb = 0; xb < n_per_thread; ++xb) {
-#if (__CUDA_ARCH__ >= 130)
         #pragma unroll
-#endif
         for (int yb = 0; yb < n_per_thread; ++yb) {
             const int x = base_x + (xb * blockDim.x);
             const int y = base_y + (yb * blockDim.y);
@@ -382,7 +359,6 @@ static void update_equiv(cuda::Param<T> equiv_map, const cudaTextureObject_t tex
             }
         }
     }
-#endif // __CUDA_ARCH__ >= 120
 }
 
 template<typename T>
@@ -477,5 +453,3 @@ void regions(cuda::Param<T> out, cuda::CParam<char> in, cudaTextureObject_t tex)
 
     cuda::memFree(tmp);
 }
-
-#endif // __CUDACC__
diff --git a/src/backend/cuda/regions.cu b/src/backend/cuda/regions.cu
@@ -29,20 +29,28 @@ Array<T>  regions(const Array<char> &in, af_connectivity connectivity)
 
     // Create bindless texture object for the equiv map.
     cudaTextureObject_t tex = 0;
-    // FIXME: Currently disabled, only supported on capaibility >= 3.0
-    //if (compute >= 3.0) {
-    //    cudaResourceDesc resDesc;
-    //    memset(&resDesc, 0, sizeof(resDesc));
-    //    resDesc.resType = cudaResourceTypeLinear;
-    //    resDesc.res.linear.devPtr = out->get();
-    //    resDesc.res.linear.desc.f = cudaChannelFormatKindFloat;
-    //    resDesc.res.linear.desc.x = 32; // bits per channel
-    //    resDesc.res.linear.sizeInBytes = dims[0] * dims[1] * sizeof(float);
-    //    cudaTextureDesc texDesc;
-    //    memset(&texDesc, 0, sizeof(texDesc));
-    //    texDesc.readMode = cudaReadModeElementType;
-    //    CUDA_CHECK(cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL));
-    //}
+
+    //Use texture objects with compute 3.0 or higher
+    if (!std::is_same<T,double>::value) {
+        cudaResourceDesc resDesc;
+        memset(&resDesc, 0, sizeof(resDesc));
+        resDesc.resType = cudaResourceTypeLinear;
+        resDesc.res.linear.devPtr = out.get();
+
+        if (std::is_signed<T>::value)
+            resDesc.res.linear.desc.f = cudaChannelFormatKindSigned;
+        else if (std::is_unsigned<T>::value)
+            resDesc.res.linear.desc.f = cudaChannelFormatKindUnsigned;
+        else
+            resDesc.res.linear.desc.f = cudaChannelFormatKindFloat;
+
+        resDesc.res.linear.desc.x = sizeof(T)*8; // bits per channel
+        resDesc.res.linear.sizeInBytes = dims[0] * dims[1] * sizeof(T);
+        cudaTextureDesc texDesc;
+        memset(&texDesc, 0, sizeof(texDesc));
+        texDesc.readMode = cudaReadModeElementType;
+        CUDA_CHECK(cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL));
+    }
 
     switch(connectivity) {
         case AF_CONNECTIVITY_4:
@@ -53,6 +61,11 @@ Array<T>  regions(const Array<char> &in, af_connectivity connectivity)
             break;
     }
 
+    //Iterative procedure(while loop) in kernel::regions
+    //does stream synchronization towards loop end. So, it is
+    //safe to destroy the texture object
+    CUDA_CHECK(cudaDestroyTextureObject(tex));
+
     return out;
 }