diff --git a/CMakeLists.txt b/CMakeLists.txt index 72514f25..d592ee66 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,5 @@ # CMake below 3.4 does not work with CUDA separable compilation at all -cmake_minimum_required(VERSION 3.4) +cmake_minimum_required(VERSION 3.7) project(PopSift VERSION 1.0.0) @@ -72,7 +72,11 @@ endif() # Default setting of the CUDA CC versions to compile. # Shortening the lists saves a lot of compile time. # -if(CUDA_VERSION_MAJOR GREATER 7) +if(CUDA_VERSION_MAJOR GREATER_EQUAL 10) + set(PopSift_CUDA_CC_LIST_BASIC 30 35 50 52 60 61 62 70 72 75) +elseif(CUDA_VERSION_MAJOR GREATER_EQUAL 9) + set(PopSift_CUDA_CC_LIST_BASIC 30 35 50 52 60 61 62 70 72) +elseif(CUDA_VERSION_MAJOR GREATER_EQUAL 8) set(PopSift_CUDA_CC_LIST_BASIC 30 35 50 52 60 61 62) else() set(PopSift_CUDA_CC_LIST_BASIC 30 35 50 52 ) @@ -91,7 +95,7 @@ endif() set(CUDA_SEPARABLE_COMPILATION ON) if(UNIX AND NOT APPLE) - set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};-Xcompiler;-rdynamic;-lineinfo") + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};-Xcompiler;-rdynamic") # set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};-Xptxas;-v") # set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};-Xptxas;-warn-double-usage") set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};--keep") diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index d636cbb4..1b32b252 100755 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -2,6 +2,11 @@ set(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}) CUDA_INCLUDE_DIRECTORIES(${Boost_INCLUDE_DIRS} ${CMAKE_CURRENT_BINARY_DIR}/popsift) +if(CUDA_VERSION_MAJOR GREATER_EQUAL 9) +# regression test for radix sort +set(REGRESSION_CODE popsift/regression/test_radix_sort.cu popsift/regression/test_radix_sort.h) +endif() + CUDA_ADD_LIBRARY(popsift popsift/popsift.cpp popsift/popsift.h popsift/features.cu popsift/features.h @@ -31,6 +36,7 @@ CUDA_ADD_LIBRARY(popsift popsift/s_desc_normalize.h popsift/s_gradiant.h popsift/s_solve.h + ${REGRESSION_CODE} popsift/common/assist.cu popsift/common/assist.h popsift/common/clamp.h popsift/common/plane_2d.cu popsift/common/plane_2d.h diff --git a/src/application/CMakeLists.txt b/src/application/CMakeLists.txt index bc4b9fdc..d2908d77 100755 --- a/src/application/CMakeLists.txt +++ b/src/application/CMakeLists.txt @@ -1,5 +1,4 @@ -cmake_minimum_required(VERSION 3.0) -project(PopsiftDemo) +cmake_minimum_required(VERSION 3.7) if(TARGET popsift) # when compiled in the repository the target is already defined @@ -62,6 +61,23 @@ target_link_libraries(popsift-match PUBLIC PopSift::popsift ${PD_LINK_LIBS}) set_target_properties(popsift-match PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}" ) +if(CUDA_VERSION_MAJOR GREATER_EQUAL 9) +############################################################# +# test_radix_sort +############################################################# + +add_executable(test_radix_sort test_radix_sort.cpp) + +set_property(TARGET test_radix_sort PROPERTY CXX_STANDARD 11) + +target_compile_options(test_radix_sort PRIVATE ${PD_COMPILE_OPTIONS} ) +target_include_directories(test_radix_sort PUBLIC ${PD_INCLUDE_DIRS}) +target_compile_definitions(test_radix_sort PRIVATE ${Boost_DEFINITIONS} BOOST_ALL_DYN_LINK BOOST_ALL_NO_LIB) +target_link_libraries(test_radix_sort PUBLIC PopSift::popsift ${PD_LINK_LIBS}) + +set_target_properties(test_radix_sort PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}" ) +endif(CUDA_VERSION_MAJOR GREATER_EQUAL 9) + ############################################################# # installation ############################################################# diff --git a/src/application/test_radix_sort.cpp b/src/application/test_radix_sort.cpp new file mode 100644 index 00000000..42327e13 --- /dev/null +++ b/src/application/test_radix_sort.cpp @@ -0,0 +1,46 @@ +#include +#include +#include +#include "popsift/regression/test_radix_sort.h" +#include "popsift/common/device_prop.h" + +std::vector the_list(64); +int buffer[64]; + +int main() +{ + std::cout << "To test a specific NVIDIA card in your system:" << std::endl + << "export NVIDIA_VISIBLE_DEVICES=1" << std::endl + << "export CUDA_VISIBLE_DEVICES=" << std::endl + << std::endl; + + popsift::cuda::device_prop_t deviceInfo; + deviceInfo.set( 0, true ); + deviceInfo.print( ); + + for( int i=0; i<64; i++ ) the_list[i] = 100-i; + + for( int i=0; i<500; i++ ) + std::next_permutation( the_list.begin(), the_list.end() ); + std::reverse( the_list.begin(), the_list.end() ); + + for( int i=0; i<64; i++ ) + { + buffer[i] = the_list[i]; + std::cout << buffer[i] << " "; + } + std::cout << std::endl; + + TestRadix::push( buffer ); + + TestRadix::callSort(); + + TestRadix::pull( buffer ); + + for( int i=0; i<64; i++ ) + { + std::cout << buffer[i] << " "; + } + std::cout << std::endl; +} + diff --git a/src/popsift/common/assist.h b/src/popsift/common/assist.h index 5106fee2..338bc67d 100644 --- a/src/popsift/common/assist.h +++ b/src/popsift/common/assist.h @@ -34,6 +34,9 @@ template __device__ inline T shuffle_xor ( T variable, int delta ) { __device__ inline unsigned int ballot( unsigned int pred ) { return __ballot_sync ( 0xffffffff, pred ); } __device__ inline int any ( unsigned int pred ) { return __any_sync ( 0xffffffff, pred ); } __device__ inline int all ( unsigned int pred ) { return __all_sync ( 0xffffffff, pred ); } +__device__ inline void syncthreads() { __syncthreads(); } +__device__ inline void syncwarp() { __syncwarp( 0xffffffff ); } +__device__ inline unsigned int activemask() { return __activemask(); } template __device__ inline T shuffle ( T variable, int src , int ws ) { return __shfl_sync ( 0xffffffff, variable, src , ws ); } template __device__ inline T shuffle_up ( T variable, int delta, int ws ) { return __shfl_up_sync ( 0xffffffff, variable, delta, ws ); } @@ -47,6 +50,9 @@ template __device__ inline T shuffle_xor ( T variable, int delta ) { __device__ inline unsigned int ballot( unsigned int pred ) { return __ballot ( pred ); } __device__ inline int any ( unsigned int pred ) { return __any ( pred ); } __device__ inline int all ( unsigned int pred ) { return __all ( pred ); } +__device__ inline void syncthreads() { __syncthreads(); } +__device__ inline void syncwarp() { } +__device__ inline unsigned int activemask() { return 0xffffffff; } template __device__ inline T shuffle ( T variable, int src , int ws ) { return __shfl ( variable, src , ws ); } template __device__ inline T shuffle_up ( T variable, int delta, int ws ) { return __shfl_up ( variable, delta, ws ); } diff --git a/src/popsift/common/warp_bitonic_sort.h b/src/popsift/common/warp_bitonic_sort.h index 39b693a3..b1df04d8 100644 --- a/src/popsift/common/warp_bitonic_sort.h +++ b/src/popsift/common/warp_bitonic_sort.h @@ -66,8 +66,8 @@ class Warp32 : ( my_val < other_val ); const bool must_swap = not ( my_more ^ reverse ^ increasing ); - return ( must_swap ? popsift::shuffle_xor( my_index, 1 << shift ) - : my_index ); + int lane = must_swap ? ( 1 << shift ) : 0; + return popsift::shuffle_xor( my_index, lane ); } __device__ inline diff --git a/src/popsift/features.cu b/src/popsift/features.cu old mode 100755 new mode 100644 diff --git a/src/popsift/features.h b/src/popsift/features.h old mode 100755 new mode 100644 diff --git a/src/popsift/gauss_filter.cu b/src/popsift/gauss_filter.cu old mode 100755 new mode 100644 diff --git a/src/popsift/gauss_filter.h b/src/popsift/gauss_filter.h old mode 100755 new mode 100644 diff --git a/src/popsift/popsift.cpp b/src/popsift/popsift.cpp old mode 100755 new mode 100644 diff --git a/src/popsift/popsift.h b/src/popsift/popsift.h old mode 100755 new mode 100644 diff --git a/src/popsift/regression/bitosort.h b/src/popsift/regression/bitosort.h new file mode 100644 index 00000000..a6b97fbc --- /dev/null +++ b/src/popsift/regression/bitosort.h @@ -0,0 +1,95 @@ +/* + * Copyright 2016, Simula Research Laboratory + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ +#pragma once + +#include +#include +#include +#include + +#include "../common/assist.h" + +using namespace cooperative_groups; + +namespace popsift { +namespace BitonicSort { + +template +class Warp32 +{ + T* _array; +public: + __device__ inline + Warp32( T* array ) : _array( array ) { } + + __device__ inline + int sort32( int my_index ) + { + thread_block_tile<32> tile32 = tiled_partition<32>( this_thread_block() ); + + for( int outer=0; outer<5; outer++ ) { + for( int inner=outer; inner>=0; inner-- ) { + my_index = shiftit( tile32, my_index, inner, outer+1, false ); + } + } + return my_index; + } + + __device__ inline + void sort64( int2& my_indeces ) + { + thread_block_tile<32> tile32 = tiled_partition<32>( this_thread_block() ); + + for( int outer=0; outer<5; outer++ ) { + for( int inner=outer; inner>=0; inner-- ) { + my_indeces.x = shiftit( tile32, my_indeces.x, inner, outer+1, false ); + my_indeces.y = shiftit( tile32, my_indeces.y, inner, outer+1, true ); + } + } + + if( _array[my_indeces.x] < _array[my_indeces.y] ) swap( my_indeces.x, my_indeces.y ); + + for( int outer=0; outer<5; outer++ ) { + for( int inner=outer; inner>=0; inner-- ) { + my_indeces.x = shiftit( tile32, my_indeces.x, inner, outer+1, false ); + my_indeces.y = shiftit( tile32, my_indeces.y, inner, outer+1, false ); + } + } + } + +private: + __device__ inline + int shiftit( thread_block_tile<32>& tile32, + const int my_index, + const int shift, const int direction, const bool increasing ) + { + const T my_val = _array[my_index]; + const T other_val = tile32.shfl_xor( my_val, 1 << shift ); // popsift::shuffle_xor( my_val, 1 << shift ); + const bool reverse = ( threadIdx.x & ( 1 << direction ) ); + const bool id_less = ( ( threadIdx.x & ( 1 << shift ) ) == 0 ); + const bool my_more = id_less ? ( my_val > other_val ) + : ( my_val < other_val ); + const bool must_swap = not ( my_more ^ reverse ^ increasing ); + + // return ( must_swap ? popsift::shuffle_xor( my_index, 1 << shift ) : my_index ); + int lane = must_swap ? ( 1 << shift ) : 0; + int retval = tile32.shfl_xor( my_index, lane ); + return retval; + } + + __device__ inline + void swap( int& l, int& r ) + { + int m = r; + r = l; + l = m; + } +}; +} // namespace popsift +} // namespace BitonicSort + diff --git a/src/popsift/regression/test_radix_sort.cu b/src/popsift/regression/test_radix_sort.cu new file mode 100644 index 00000000..b6252f9e --- /dev/null +++ b/src/popsift/regression/test_radix_sort.cu @@ -0,0 +1,52 @@ +#include +#include "../common/assist.h" +#include "bitosort.h" +#include "test_radix_sort.h" + +namespace TestRadix +{ +__device__ __managed__ int buffer[64]; + +__shared__ int sh_val[64]; + +__host__ void push( int* b ) +{ + for( int i=0; i<64; i++ ) + buffer[i] = b[i]; +} + +__host__ void pull( int* b ) +{ + for( int i=0; i<64; i++ ) + b[i] = buffer[i]; +} + +__global__ void gpuCallSort( ) +{ + int x = threadIdx.x; + + sh_val[x] = buffer[x]; + sh_val[x+32] = buffer[x+32]; + __syncthreads(); + + int2 best_index = make_int2( threadIdx.x, threadIdx.x + 32 ); + + popsift::BitonicSort::Warp32 sorter( sh_val ); + sorter.sort64( best_index ); + // sorter.sort32( threadIdx.x ); + __syncthreads(); + + buffer[x] = sh_val[best_index.x]; + buffer[x+32] = sh_val[best_index.y]; +} + +__host__ void callSort( ) +{ + dim3 block( 32, 1, 1 ); + + gpuCallSort<<<1,block>>>( ); + cudaDeviceSynchronize(); +} + +}; + diff --git a/src/popsift/regression/test_radix_sort.h b/src/popsift/regression/test_radix_sort.h new file mode 100644 index 00000000..29f1c6e3 --- /dev/null +++ b/src/popsift/regression/test_radix_sort.h @@ -0,0 +1,9 @@ +#include + +namespace TestRadix +{ +__host__ void push( int* b ); +__host__ void pull( int* b ); +__host__ void callSort( ); +}; + diff --git a/src/popsift/s_desc_loop.cu b/src/popsift/s_desc_loop.cu index e6491a95..8581bc1d 100644 --- a/src/popsift/s_desc_loop.cu +++ b/src/popsift/s_desc_loop.cu @@ -76,46 +76,49 @@ void ext_desc_loop_sub( const float ang, float dpt[9] = { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f }; - for( int i = threadIdx.x; i < loops; i+=blockDim.x ) + for( int i = threadIdx.x; popsift::any(i < loops); i+=blockDim.x ) { - const int ii = i / wx + ymin; - const int jj = i % wx + xmin; - - const float2 d = make_float2( jj - ptx, ii - pty ); - - // const float nx = crsbp * dx + srsbp * dy; - // const float ny = crsbp * dy - srsbp * dx; - const float2 n = make_float2( ::fmaf( crsbp, d.x, srsbp * d.y ), - ::fmaf( crsbp, d.y, -srsbp * d.x ) ); - const float2 nn = abs(n); - if (nn.x < 1.0f && nn.y < 1.0f) { - float mod; - float th; - get_gradiant( mod, th, jj, ii, layer_tex, level ); - - const float2 dn = n + offsetpt; - const float ww = __expf( -scalbnf(dn.x*dn.x + dn.y*dn.y, -3)); - // const float ww = __expf(-0.125f * (dnx*dnx + dny*dny)); // speedup ! - const float2 w = make_float2( 1.0f - nn.x, + if( i < loops ) + { + const int ii = i / wx + ymin; + const int jj = i % wx + xmin; + + const float2 d = make_float2( jj - ptx, ii - pty ); + + // const float nx = crsbp * dx + srsbp * dy; + // const float ny = crsbp * dy - srsbp * dx; + const float2 n = make_float2( ::fmaf( crsbp, d.x, srsbp * d.y ), + ::fmaf( crsbp, d.y, -srsbp * d.x ) ); + const float2 nn = abs(n); + if (nn.x < 1.0f && nn.y < 1.0f) { + float mod; + float th; + get_gradiant( mod, th, jj, ii, layer_tex, level ); + + const float2 dn = n + offsetpt; + const float ww = __expf( -scalbnf(dn.x*dn.x + dn.y*dn.y, -3)); + // const float ww = __expf(-0.125f * (dnx*dnx + dny*dny)); // speedup ! + const float2 w = make_float2( 1.0f - nn.x, 1.0f - nn.y ); - const float wgt = ww * w.x * w.y * mod; - - th -= ang; - th += ( th < 0.0f ? M_PI2 : 0.0f ); // if (th < 0.0f ) th += M_PI2; - th -= ( th >= M_PI2 ? M_PI2 : 0.0f ); // if (th >= M_PI2) th -= M_PI2; - - const float tth = __fmul_ru( th, M_4RPI ); // th * M_4RPI; - const int fo0 = (int)floorf(tth); - const float do0 = tth - fo0; - const float wgt1 = 1.0f - do0; - const float wgt2 = do0; - - int fo = fo0 % DESC_BINS; - - // maf: multiply-add - // _ru - round to positive infinity equiv to froundf since always >=0 - dpt[fo] = __fmaf_ru( wgt1, wgt, dpt[fo] ); // dpt[fo] += (wgt1*wgt); - dpt[fo+1] = __fmaf_ru( wgt2, wgt, dpt[fo+1] ); // dpt[fo+1] += (wgt2*wgt); + const float wgt = ww * w.x * w.y * mod; + + th -= ang; + th += ( th < 0.0f ? M_PI2 : 0.0f ); // if (th < 0.0f ) th += M_PI2; + th -= ( th >= M_PI2 ? M_PI2 : 0.0f ); // if (th >= M_PI2) th -= M_PI2; + + const float tth = __fmul_ru( th, M_4RPI ); // th * M_4RPI; + const int fo0 = (int)floorf(tth); + const float do0 = tth - fo0; + const float wgt1 = 1.0f - do0; + const float wgt2 = do0; + + int fo = fo0 % DESC_BINS; + + // maf: multiply-add + // _ru - round to positive infinity equiv to froundf since always >=0 + dpt[fo] = __fmaf_ru( wgt1, wgt, dpt[fo] ); // dpt[fo] += (wgt1*wgt); + dpt[fo+1] = __fmaf_ru( wgt2, wgt, dpt[fo+1] ); // dpt[fo+1] += (wgt2*wgt); + } } __syncthreads(); } diff --git a/src/popsift/s_image.cu b/src/popsift/s_image.cu old mode 100755 new mode 100644 diff --git a/src/popsift/s_image.h b/src/popsift/s_image.h old mode 100755 new mode 100644 diff --git a/src/popsift/s_orientation.cu b/src/popsift/s_orientation.cu index 64ac5863..ab11afd4 100644 --- a/src/popsift/s_orientation.cu +++ b/src/popsift/s_orientation.cu @@ -71,10 +71,10 @@ void ori_par( const int octave, const int iext_off = dobuf.i_ext_off[octave][extremum_index]; const InitialExtremum* iext = &dobuf.i_ext_dat[octave][iext_off]; - __shared__ float hist [ORI_NBINS]; - __shared__ float sm_hist[ORI_NBINS]; + __shared__ float _sh_hist [ORI_NBINS]; + __shared__ float _sh_sm_hist[ORI_NBINS]; - for( int i = threadIdx.x; i < ORI_NBINS; i += blockDim.x ) hist[i] = 0.0f; + for( int i = threadIdx.x; i < ORI_NBINS; i += blockDim.x ) _sh_hist[i] = 0.0f; /* keypoint fractional geometry */ const float x = iext->xpos; @@ -133,7 +133,7 @@ void ori_par( const int octave, bidx = (bidx == ORI_NBINS) ? 0 : bidx; - atomicAdd( &hist[bidx], weight ); + atomicAdd( &_sh_hist[bidx], weight ); } } __syncthreads(); @@ -144,18 +144,18 @@ void ori_par( const int octave, for( int bin = threadIdx.x; bin < ORI_NBINS; bin += blockDim.x ) { int prev = bin == 0 ? ORI_NBINS-1 : bin-1; int next = bin == ORI_NBINS-1 ? 0 : bin+1; - sm_hist[bin] = ( hist[prev] + hist[bin] + hist[next] ) / 3.0f; + _sh_sm_hist[bin] = ( _sh_hist[prev] + _sh_hist[bin] + _sh_hist[next] ) / 3.0f; } __syncthreads(); for( int bin = threadIdx.x; bin < ORI_NBINS; bin += blockDim.x ) { int prev = bin == 0 ? ORI_NBINS-1 : bin-1; int next = bin == ORI_NBINS-1 ? 0 : bin+1; - hist[bin] = ( sm_hist[prev] + sm_hist[bin] + sm_hist[next] ) / 3.0f; + _sh_hist[bin] = ( _sh_sm_hist[prev] + _sh_sm_hist[bin] + _sh_sm_hist[next] ) / 3.0f; } __syncthreads(); } for( int bin = threadIdx.x; bin < ORI_NBINS; bin += blockDim.x ) { - sm_hist[bin] = hist[bin]; + _sh_sm_hist[bin] = _sh_hist[bin]; } __syncthreads(); #else // not WITH_VLFEAT_SMOOTHING @@ -168,51 +168,51 @@ void ori_par( const int octave, if( prev1 < 0 ) prev1 += ORI_NBINS; if( next1 >= ORI_NBINS ) next1 -= ORI_NBINS; if( next2 >= ORI_NBINS ) next2 -= ORI_NBINS; - sm_hist[bin] = ( hist[prev2] + hist[next2] - + ( hist[prev1] + hist[next1] ) * 4.0f - + hist[bin] * 6.0f ) / 16.0f; + _sh_sm_hist[bin] = ( _sh_hist[prev2] + _sh_hist[next2] + + ( _sh_hist[prev1] + _sh_hist[next1] ) * 4.0f + + _sh_hist[bin] * 6.0f ) / 16.0f; } __syncthreads(); #endif // not WITH_VLFEAT_SMOOTHING // sub-cell refinement of the histogram cell index, yielding the angle // not necessary to initialize, every cell is computed - __shared__ float refined_angle[64]; - __shared__ float yval [64]; + __shared__ float _sh_refined_angle[64]; + __shared__ float _sh_yval [64]; for( int bin = threadIdx.x; popsift::any( bin < ORI_NBINS ); bin += blockDim.x ) { const int prev = bin == 0 ? ORI_NBINS-1 : bin-1; const int next = bin == ORI_NBINS-1 ? 0 : bin+1; - bool predicate = ( bin < ORI_NBINS ) && ( sm_hist[bin] > max( sm_hist[prev], sm_hist[next] ) ); + bool predicate = ( bin < ORI_NBINS ) && ( _sh_sm_hist[bin] > max( _sh_sm_hist[prev], _sh_sm_hist[next] ) ); - const float num = predicate ? 3.0f * sm_hist[prev] - - 4.0f * sm_hist[bin] - + 1.0f * sm_hist[next] + const float num = predicate ? 3.0f * _sh_sm_hist[prev] + - 4.0f * _sh_sm_hist[bin] + + 1.0f * _sh_sm_hist[next] : 0.0f; - // const float num = predicate ? 2.0f * sm_hist[prev] - // - 4.0f * sm_hist[bin] - // + 2.0f * sm_hist[next] + // const float num = predicate ? 2.0f * _sh_sm_hist[prev] + // - 4.0f * _sh_sm_hist[bin] + // + 2.0f * _sh_sm_hist[next] // : 0.0f; - const float denB = predicate ? 2.0f * ( sm_hist[prev] - 2.0f * sm_hist[bin] + sm_hist[next] ) : 1.0f; + const float denB = predicate ? 2.0f * ( _sh_sm_hist[prev] - 2.0f * _sh_sm_hist[bin] + _sh_sm_hist[next] ) : 1.0f; const float newbin = __fdividef( num, denB ); // verified: accuracy OK predicate = ( predicate && newbin >= 0.0f && newbin <= 2.0f ); - refined_angle[bin] = predicate ? prev + newbin : -1; - yval[bin] = predicate ? -(num*num) / (4.0f * denB) + sm_hist[prev] : -INFINITY; + _sh_refined_angle[bin] = predicate ? prev + newbin : -1; + _sh_yval[bin] = predicate ? -(num*num) / (4.0f * denB) + _sh_sm_hist[prev] : -INFINITY; } int2 best_index = make_int2( threadIdx.x, threadIdx.x + 32 ); - BitonicSort::Warp32 sorter( yval ); + BitonicSort::Warp32 sorter( _sh_yval ); sorter.sort64( best_index ); __syncthreads(); // All threads retrieve the yval of thread 0, the largest // of all yvals. - const float best_val = yval[best_index.x]; + const float best_val = _sh_yval[best_index.x]; const float yval_ref = 0.8f * popsift::shuffle( best_val, 0 ); const bool valid = ( best_val >= yval_ref ); bool written = false; @@ -221,7 +221,7 @@ void ori_par( const int octave, if( threadIdx.x < ORIENTATION_MAX_COUNT ) { if( valid ) { - float chosen_bin = refined_angle[best_index.x]; + float chosen_bin = _sh_refined_angle[best_index.x]; if( chosen_bin >= ORI_NBINS ) chosen_bin -= ORI_NBINS; // float th = __fdividef(M_PI2 * chosen_bin , ORI_NBINS) - M_PI; float th = ::fmaf( M_PI2 * chosen_bin, 1.0f/ORI_NBINS, - M_PI ); diff --git a/src/popsift/s_pyramid_build.cu b/src/popsift/s_pyramid_build.cu old mode 100755 new mode 100644 diff --git a/src/popsift/s_pyramid_build_aa.cu b/src/popsift/s_pyramid_build_aa.cu old mode 100755 new mode 100644 diff --git a/src/popsift/s_pyramid_build_aa.h b/src/popsift/s_pyramid_build_aa.h old mode 100755 new mode 100644 diff --git a/src/popsift/s_pyramid_build_ai.cu b/src/popsift/s_pyramid_build_ai.cu old mode 100755 new mode 100644 diff --git a/src/popsift/s_pyramid_build_ai.h b/src/popsift/s_pyramid_build_ai.h old mode 100755 new mode 100644 diff --git a/src/popsift/s_pyramid_build_ra.cu b/src/popsift/s_pyramid_build_ra.cu old mode 100755 new mode 100644 diff --git a/src/popsift/s_pyramid_build_ra.h b/src/popsift/s_pyramid_build_ra.h old mode 100755 new mode 100644 diff --git a/src/popsift/s_pyramid_fixed.cu b/src/popsift/s_pyramid_fixed.cu old mode 100755 new mode 100644 diff --git a/src/popsift/s_solve.h b/src/popsift/s_solve.h old mode 100755 new mode 100644 diff --git a/src/popsift/sift_constants.cu b/src/popsift/sift_constants.cu old mode 100755 new mode 100644 diff --git a/src/popsift/sift_constants.h b/src/popsift/sift_constants.h old mode 100755 new mode 100644 diff --git a/src/popsift/sift_extremum.cu b/src/popsift/sift_extremum.cu old mode 100755 new mode 100644 diff --git a/src/popsift/sift_extremum.h b/src/popsift/sift_extremum.h old mode 100755 new mode 100644 diff --git a/src/popsift/sift_octave.cu b/src/popsift/sift_octave.cu old mode 100755 new mode 100644 diff --git a/src/popsift/sift_octave.h b/src/popsift/sift_octave.h old mode 100755 new mode 100644 diff --git a/src/popsift/sift_pyramid.cu b/src/popsift/sift_pyramid.cu old mode 100755 new mode 100644 diff --git a/src/popsift/sift_pyramid.h b/src/popsift/sift_pyramid.h old mode 100755 new mode 100644