Skip to content

Commit

Permalink
Re-enable texture object access in regions cuda kernel
Browse files Browse the repository at this point in the history
Remove pre-3.0-compute checks as we don't support 2.0 compute
capability anymore
  • Loading branch information
9prady9 committed Aug 26, 2017
1 parent cf521ad commit fcdedb0
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 50 deletions.
46 changes: 10 additions & 36 deletions src/backend/cuda/kernel/regions.hpp
Expand Up @@ -23,8 +23,6 @@
#include <thrust/sort.h>
#include <thrust/transform_scan.h>

#if __CUDACC__

static const int THREADS_X = 16;
static const int THREADS_Y = 16;

Expand All @@ -35,19 +33,18 @@ __device__ static int continue_flag = 1;

// Wrapper function for texture fetch
template<typename T>
__device__ __inline__
static T fetch(const int n,
cuda::Param<T> equiv_map,
cudaTextureObject_t tex)
static inline __device__
T fetch(const int n, cuda::Param<T> equiv_map, cudaTextureObject_t tex)
{
// FIXME: Enable capability >= 3.0
//#if (__CUDA_ARCH__ >= 300)
#if 0
// Kepler bindless texture objects
return tex1Dfetch<T>(tex, n);
#else
}

template<> __device__
STATIC_ double fetch<double>(const int n,
cuda::Param<double> equiv_map,
cudaTextureObject_t tex)
{
return equiv_map.ptr[n];
#endif
}

// The initial label kernel distinguishes between valid (nonzero)
Expand Down Expand Up @@ -128,7 +125,7 @@ static void update_equiv(cuda::Param<T> equiv_map, const cudaTextureObject_t tex
{

typedef warp_count<block_dim*block_dim> num_warps;
#if (__CUDA_ARCH__ >= 120) // This function uses warp ballot instructions

// Basic coordinates
const int base_x = (blockIdx.x * blockDim.x * n_per_thread) + threadIdx.x;
const int base_y = (blockIdx.y * blockDim.y * n_per_thread) + threadIdx.y;
Expand Down Expand Up @@ -160,13 +157,9 @@ static void update_equiv(cuda::Param<T> equiv_map, const cudaTextureObject_t tex
s_changed[warpIdx] = (T)0;
__syncthreads();

#if (__CUDA_ARCH__ >= 130)
#pragma unroll
#endif
for (int xb = 0; xb < n_per_thread; ++xb) {
#if (__CUDA_ARCH__ >= 130)
#pragma unroll
#endif
for (int yb = 0; yb < n_per_thread; ++yb) {

// Indexing variables
Expand Down Expand Up @@ -251,9 +244,7 @@ static void update_equiv(cuda::Param<T> equiv_map, const cudaTextureObject_t tex
s_changed[warpIdx] = __any((int)tid_changed);
__syncthreads();

#if (__CUDA_ARCH__ >= 130)
#pragma unroll
#endif
for (int i = 0; i < num_warps::value; i++)
continue_iter = continue_iter || (s_changed[i] != 0);

Expand All @@ -263,13 +254,9 @@ static void update_equiv(cuda::Param<T> equiv_map, const cudaTextureObject_t tex
// Reset whether or not this thread's pixels have changed.
tid_changed = false;

#if (__CUDA_ARCH__ >= 130)
#pragma unroll
#endif
for (int xb = 0; xb < n_per_thread; ++xb) {
#if (__CUDA_ARCH__ >= 130)
#pragma unroll
#endif
for (int yb = 0; yb < n_per_thread; ++yb) {

// Indexing
Expand Down Expand Up @@ -335,22 +322,16 @@ static void update_equiv(cuda::Param<T> equiv_map, const cudaTextureObject_t tex
s_changed[warpIdx] = __any((int)tid_changed);
__syncthreads();
continue_iter = false;
#if (__CUDA_ARCH__ >= 130)
#pragma unroll
#endif
for (int i = 0; i < num_warps::value; i++)
continue_iter = continue_iter | (s_changed[i] != 0);

// If we have to continue iterating, update the tile of the
// equiv map in shared memory
if (continue_iter) {
#if (__CUDA_ARCH__ >= 130)
#pragma unroll
#endif
for (int xb = 0; xb < n_per_thread; ++xb) {
#if (__CUDA_ARCH__ >= 130)
#pragma unroll
#endif
for (int yb = 0; yb < n_per_thread; ++yb) {
const int tx = threadIdx.x + (xb * blockDim.x);
const int ty = threadIdx.y + (yb * blockDim.y);
Expand All @@ -364,13 +345,9 @@ static void update_equiv(cuda::Param<T> equiv_map, const cudaTextureObject_t tex
} // while (continue_iter)

// Write out equiv_map
#if (__CUDA_ARCH__ >= 130)
#pragma unroll
#endif
for (int xb = 0; xb < n_per_thread; ++xb) {
#if (__CUDA_ARCH__ >= 130)
#pragma unroll
#endif
for (int yb = 0; yb < n_per_thread; ++yb) {
const int x = base_x + (xb * blockDim.x);
const int y = base_y + (yb * blockDim.y);
Expand All @@ -382,7 +359,6 @@ static void update_equiv(cuda::Param<T> equiv_map, const cudaTextureObject_t tex
}
}
}
#endif // __CUDA_ARCH__ >= 120
}

template<typename T>
Expand Down Expand Up @@ -477,5 +453,3 @@ void regions(cuda::Param<T> out, cuda::CParam<char> in, cudaTextureObject_t tex)

cuda::memFree(tmp);
}

#endif // __CUDACC__
41 changes: 27 additions & 14 deletions src/backend/cuda/regions.cu
Expand Up @@ -29,20 +29,28 @@ Array<T> regions(const Array<char> &in, af_connectivity connectivity)

// Create bindless texture object for the equiv map.
cudaTextureObject_t tex = 0;
// FIXME: Currently disabled, only supported on capaibility >= 3.0
//if (compute >= 3.0) {
// cudaResourceDesc resDesc;
// memset(&resDesc, 0, sizeof(resDesc));
// resDesc.resType = cudaResourceTypeLinear;
// resDesc.res.linear.devPtr = out->get();
// resDesc.res.linear.desc.f = cudaChannelFormatKindFloat;
// resDesc.res.linear.desc.x = 32; // bits per channel
// resDesc.res.linear.sizeInBytes = dims[0] * dims[1] * sizeof(float);
// cudaTextureDesc texDesc;
// memset(&texDesc, 0, sizeof(texDesc));
// texDesc.readMode = cudaReadModeElementType;
// CUDA_CHECK(cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL));
//}

//Use texture objects with compute 3.0 or higher
if (!std::is_same<T,double>::value) {
cudaResourceDesc resDesc;
memset(&resDesc, 0, sizeof(resDesc));
resDesc.resType = cudaResourceTypeLinear;
resDesc.res.linear.devPtr = out.get();

if (std::is_signed<T>::value)
resDesc.res.linear.desc.f = cudaChannelFormatKindSigned;
else if (std::is_unsigned<T>::value)
resDesc.res.linear.desc.f = cudaChannelFormatKindUnsigned;
else
resDesc.res.linear.desc.f = cudaChannelFormatKindFloat;

resDesc.res.linear.desc.x = sizeof(T)*8; // bits per channel
resDesc.res.linear.sizeInBytes = dims[0] * dims[1] * sizeof(T);
cudaTextureDesc texDesc;
memset(&texDesc, 0, sizeof(texDesc));
texDesc.readMode = cudaReadModeElementType;
CUDA_CHECK(cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL));
}

switch(connectivity) {
case AF_CONNECTIVITY_4:
Expand All @@ -53,6 +61,11 @@ Array<T> regions(const Array<char> &in, af_connectivity connectivity)
break;
}

//Iterative procedure(while loop) in kernel::regions
//does stream synchronization towards loop end. So, it is
//safe to destroy the texture object
CUDA_CHECK(cudaDestroyTextureObject(tex));

return out;
}

Expand Down

0 comments on commit fcdedb0

Please sign in to comment.