diff --git a/src/backend/cuda/kernel/sift_nonfree.hpp b/src/backend/cuda/kernel/sift_nonfree.hpp index 56dc6817b0..e94aeb1377 100644 --- a/src/backend/cuda/kernel/sift_nonfree.hpp +++ b/src/backend/cuda/kernel/sift_nonfree.hpp @@ -333,8 +333,10 @@ __global__ void sub( { unsigned i = blockIdx.x * blockDim.x + threadIdx.x; - for (unsigned l = 0; l < n_layers; l++) - out.ptr[l*nel + i] = in.ptr[(l+1)*nel + i] - in.ptr[l*nel + i]; + if (i < nel) { + for (unsigned l = 0; l < n_layers; l++) + out.ptr[l*nel + i] = in.ptr[(l+1)*nel + i] - in.ptr[l*nel + i]; + } } #define SCPTR(Y, X) (s_center[(Y) * s_i + (X)]) diff --git a/src/backend/opencl/kernel/sift_nonfree.cl b/src/backend/opencl/kernel/sift_nonfree.cl index f62ff37612..dc968d4f4d 100644 --- a/src/backend/opencl/kernel/sift_nonfree.cl +++ b/src/backend/opencl/kernel/sift_nonfree.cl @@ -255,8 +255,10 @@ __kernel void sub( { unsigned i = get_global_id(0); - for (unsigned l = 0; l < n_layers; l++) - out[l*nel + i] = in[l*nel + i] - in[(l+1)*nel + i]; + if (i < nel) { + for (unsigned l = 0; l < n_layers; l++) + out[l*nel + i] = in[l*nel + i] - in[(l+1)*nel + i]; + } } #define LCPTR(Y, X) (l_center[(Y) * l_i + (X)])