Re-enable texture object access in regions cuda kernel

Remove pre-3.0-compute checks as we don't support 2.0 compute capability anymore
arrayfire · Aug 11, 2017 · 0db6081 · pavanky · Aug 11, 2017 · 0db6081
1 parent cf521ad
commit 0db6081
Show file tree

Hide file tree

Showing 2 changed files with 36 additions and 28 deletions.
diff --git a/src/backend/cuda/kernel/regions.hpp b/src/backend/cuda/kernel/regions.hpp
@@ -23,8 +23,6 @@
 #include <thrust/sort.h>
 #include <thrust/transform_scan.h>
 
-#if __CUDACC__
-
 static const int THREADS_X = 16;
 static const int THREADS_Y = 16;
 
@@ -35,19 +33,18 @@ __device__ static int continue_flag = 1;
 
 // Wrapper function for texture fetch
 template<typename T>
-__device__ __inline__
-static T fetch(const int n,
-               cuda::Param<T> equiv_map,
-               cudaTextureObject_t tex)
+static inline __device__
+T fetch(const int n, cuda::Param<T> equiv_map, cudaTextureObject_t tex)
 {
-// FIXME: Enable capability >= 3.0
-//#if (__CUDA_ARCH__ >= 300)
-#if 0
-    // Kepler bindless texture objects
     return tex1Dfetch<T>(tex, n);
-#else
+}
+
+template<> __device__
+STATIC_ double fetch<double>(const int n,
+                             cuda::Param<double> equiv_map,
+                             cudaTextureObject_t tex)
+{
     return equiv_map.ptr[n];
-#endif
 }
 
 // The initial label kernel distinguishes between valid (nonzero)
@@ -477,5 +474,3 @@ void regions(cuda::Param<T> out, cuda::CParam<char> in, cudaTextureObject_t tex)
 
     cuda::memFree(tmp);
 }
-
-#endif // __CUDACC__
diff --git a/src/backend/cuda/regions.cu b/src/backend/cuda/regions.cu
@@ -29,20 +29,28 @@ Array<T>  regions(const Array<char> &in, af_connectivity connectivity)
 
     // Create bindless texture object for the equiv map.
     cudaTextureObject_t tex = 0;
-    // FIXME: Currently disabled, only supported on capaibility >= 3.0
-    //if (compute >= 3.0) {
-    //    cudaResourceDesc resDesc;
-    //    memset(&resDesc, 0, sizeof(resDesc));
-    //    resDesc.resType = cudaResourceTypeLinear;
-    //    resDesc.res.linear.devPtr = out->get();
-    //    resDesc.res.linear.desc.f = cudaChannelFormatKindFloat;
-    //    resDesc.res.linear.desc.x = 32; // bits per channel
-    //    resDesc.res.linear.sizeInBytes = dims[0] * dims[1] * sizeof(float);
-    //    cudaTextureDesc texDesc;
-    //    memset(&texDesc, 0, sizeof(texDesc));
-    //    texDesc.readMode = cudaReadModeElementType;
-    //    CUDA_CHECK(cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL));
-    //}
+
+    //Use texture objects with compute 3.0 or higher
+    if (!std::is_same<T,double>::value) {
+        cudaResourceDesc resDesc;
+        memset(&resDesc, 0, sizeof(resDesc));
+        resDesc.resType = cudaResourceTypeLinear;
+        resDesc.res.linear.devPtr = out.get();
+
+        if (std::is_signed<T>::value)
+            resDesc.res.linear.desc.f = cudaChannelFormatKindSigned;
+        else if (std::is_unsigned<T>::value)
+            resDesc.res.linear.desc.f = cudaChannelFormatKindUnsigned;
+        else
+            resDesc.res.linear.desc.f = cudaChannelFormatKindFloat;
+
+        resDesc.res.linear.desc.x = sizeof(T)*8; // bits per channel
+        resDesc.res.linear.sizeInBytes = dims[0] * dims[1] * sizeof(T);
+        cudaTextureDesc texDesc;
+        memset(&texDesc, 0, sizeof(texDesc));
+        texDesc.readMode = cudaReadModeElementType;
+        CUDA_CHECK(cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL));
+    }
 
     switch(connectivity) {
         case AF_CONNECTIVITY_4:
@@ -53,6 +61,11 @@ Array<T>  regions(const Array<char> &in, af_connectivity connectivity)
             break;
     }
 
+    //Iterative procedure(while loop) in kernel::regions
+    //does stream synchronization towards loop end. So, it is
+    //safe to destroy the texture object
+    CUDA_CHECK(cudaDestroyTextureObject(tex));
+
     return out;
 }