diff --git a/CMakeLists.txt b/CMakeLists.txt
index 72514f25..d592ee66 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,5 +1,5 @@
 # CMake below 3.4 does not work with CUDA separable compilation at all
-cmake_minimum_required(VERSION 3.4)
+cmake_minimum_required(VERSION 3.7)
 
 project(PopSift VERSION 1.0.0)
 
@@ -72,7 +72,11 @@ endif()
 # Default setting of the CUDA CC versions to compile.
 # Shortening the lists saves a lot of compile time.
 #
-if(CUDA_VERSION_MAJOR GREATER 7)
+if(CUDA_VERSION_MAJOR GREATER_EQUAL 10)
+  set(PopSift_CUDA_CC_LIST_BASIC 30 35 50 52 60 61 62 70 72 75)
+elseif(CUDA_VERSION_MAJOR GREATER_EQUAL 9)
+  set(PopSift_CUDA_CC_LIST_BASIC 30 35 50 52 60 61 62 70 72)
+elseif(CUDA_VERSION_MAJOR GREATER_EQUAL 8)
   set(PopSift_CUDA_CC_LIST_BASIC 30 35 50 52 60 61 62)
 else()
   set(PopSift_CUDA_CC_LIST_BASIC 30 35 50 52 )
@@ -91,7 +95,7 @@ endif()
 set(CUDA_SEPARABLE_COMPILATION ON)
 
 if(UNIX AND NOT APPLE)
-  set(CUDA_NVCC_FLAGS         "${CUDA_NVCC_FLAGS};-Xcompiler;-rdynamic;-lineinfo")
+  set(CUDA_NVCC_FLAGS         "${CUDA_NVCC_FLAGS};-Xcompiler;-rdynamic")
   # set(CUDA_NVCC_FLAGS         "${CUDA_NVCC_FLAGS};-Xptxas;-v")
   # set(CUDA_NVCC_FLAGS         "${CUDA_NVCC_FLAGS};-Xptxas;-warn-double-usage")
   set(CUDA_NVCC_FLAGS         "${CUDA_NVCC_FLAGS};--keep")
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index d636cbb4..1b32b252 100755
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -2,6 +2,11 @@ set(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR})
 
 CUDA_INCLUDE_DIRECTORIES(${Boost_INCLUDE_DIRS} ${CMAKE_CURRENT_BINARY_DIR}/popsift)
 
+if(CUDA_VERSION_MAJOR GREATER_EQUAL 9)
+# regression test for radix sort
+set(REGRESSION_CODE popsift/regression/test_radix_sort.cu popsift/regression/test_radix_sort.h)
+endif()
+
 CUDA_ADD_LIBRARY(popsift
 	popsift/popsift.cpp popsift/popsift.h
 	popsift/features.cu popsift/features.h
@@ -31,6 +36,7 @@ CUDA_ADD_LIBRARY(popsift
 	popsift/s_desc_normalize.h
 	popsift/s_gradiant.h
 	popsift/s_solve.h
+	${REGRESSION_CODE}
 	popsift/common/assist.cu popsift/common/assist.h
 	popsift/common/clamp.h
 	popsift/common/plane_2d.cu popsift/common/plane_2d.h
diff --git a/src/application/CMakeLists.txt b/src/application/CMakeLists.txt
index bc4b9fdc..d2908d77 100755
--- a/src/application/CMakeLists.txt
+++ b/src/application/CMakeLists.txt
@@ -1,5 +1,4 @@
-cmake_minimum_required(VERSION 3.0)
-project(PopsiftDemo)
+cmake_minimum_required(VERSION 3.7)
 
 if(TARGET popsift)
   # when compiled in the repository the target is already defined
@@ -62,6 +61,23 @@ target_link_libraries(popsift-match PUBLIC PopSift::popsift ${PD_LINK_LIBS})
 
 set_target_properties(popsift-match  PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}" )
 
+if(CUDA_VERSION_MAJOR GREATER_EQUAL 9)
+#############################################################
+# test_radix_sort
+#############################################################
+
+add_executable(test_radix_sort test_radix_sort.cpp)
+
+set_property(TARGET test_radix_sort PROPERTY CXX_STANDARD 11)
+
+target_compile_options(test_radix_sort PRIVATE ${PD_COMPILE_OPTIONS} )
+target_include_directories(test_radix_sort PUBLIC ${PD_INCLUDE_DIRS})
+target_compile_definitions(test_radix_sort PRIVATE ${Boost_DEFINITIONS} BOOST_ALL_DYN_LINK BOOST_ALL_NO_LIB)
+target_link_libraries(test_radix_sort PUBLIC PopSift::popsift ${PD_LINK_LIBS})
+
+set_target_properties(test_radix_sort  PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}" )
+endif(CUDA_VERSION_MAJOR GREATER_EQUAL 9)
+
 #############################################################
 # installation
 #############################################################
diff --git a/src/application/test_radix_sort.cpp b/src/application/test_radix_sort.cpp
new file mode 100644
index 00000000..42327e13
--- /dev/null
+++ b/src/application/test_radix_sort.cpp
@@ -0,0 +1,46 @@
+#include <iostream>
+#include <vector>
+#include <algorithm>
+#include "popsift/regression/test_radix_sort.h"
+#include "popsift/common/device_prop.h"
+
+std::vector<int> the_list(64);
+int buffer[64];
+
+int main()
+{
+    std::cout << "To test a specific NVIDIA card in your system:" << std::endl
+              << "export NVIDIA_VISIBLE_DEVICES=1" << std::endl
+              << "export CUDA_VISIBLE_DEVICES=<int>" << std::endl
+              << std::endl;
+
+    popsift::cuda::device_prop_t deviceInfo;
+    deviceInfo.set( 0, true );
+    deviceInfo.print( );
+
+    for( int i=0; i<64; i++ ) the_list[i] = 100-i;
+
+    for( int i=0; i<500; i++ )
+        std::next_permutation( the_list.begin(), the_list.end() );
+    std::reverse( the_list.begin(), the_list.end() );
+
+    for( int i=0; i<64; i++ )
+    {
+        buffer[i] = the_list[i];
+        std::cout << buffer[i] << " ";
+    }
+    std::cout << std::endl;
+
+    TestRadix::push( buffer );
+
+    TestRadix::callSort();
+
+    TestRadix::pull( buffer );
+
+    for( int i=0; i<64; i++ )
+    {
+        std::cout << buffer[i] << " ";
+    }
+    std::cout << std::endl;
+}
+
diff --git a/src/popsift/common/assist.h b/src/popsift/common/assist.h
index 5106fee2..338bc67d 100644
--- a/src/popsift/common/assist.h
+++ b/src/popsift/common/assist.h
@@ -34,6 +34,9 @@ template<typename T> __device__ inline T shuffle_xor ( T variable, int delta ) {
 __device__ inline unsigned int ballot( unsigned int pred ) { return __ballot_sync   ( 0xffffffff, pred ); }
 __device__ inline int any            ( unsigned int pred ) { return __any_sync      ( 0xffffffff, pred ); }
 __device__ inline int all            ( unsigned int pred ) { return __all_sync      ( 0xffffffff, pred ); }
+__device__ inline void syncthreads() { __syncthreads(); }
+__device__ inline void syncwarp()    { __syncwarp( 0xffffffff ); }
+__device__ inline unsigned int activemask() { return __activemask(); }
 
 template<typename T> __device__ inline T shuffle     ( T variable, int src  , int ws ) { return __shfl_sync     ( 0xffffffff, variable, src  , ws ); }
 template<typename T> __device__ inline T shuffle_up  ( T variable, int delta, int ws ) { return __shfl_up_sync  ( 0xffffffff, variable, delta, ws ); }
@@ -47,6 +50,9 @@ template<typename T> __device__ inline T shuffle_xor ( T variable, int delta ) {
 __device__ inline unsigned int ballot( unsigned int pred ) { return __ballot   ( pred ); }
 __device__ inline int any            ( unsigned int pred ) { return __any      ( pred ); }
 __device__ inline int all            ( unsigned int pred ) { return __all      ( pred ); }
+__device__ inline void syncthreads() { __syncthreads(); }
+__device__ inline void syncwarp()    { }
+__device__ inline unsigned int activemask() { return 0xffffffff; }
 
 template<typename T> __device__ inline T shuffle     ( T variable, int src  , int ws ) { return __shfl     ( variable, src  , ws ); }
 template<typename T> __device__ inline T shuffle_up  ( T variable, int delta, int ws ) { return __shfl_up  ( variable, delta, ws ); }
diff --git a/src/popsift/common/warp_bitonic_sort.h b/src/popsift/common/warp_bitonic_sort.h
index 39b693a3..b1df04d8 100644
--- a/src/popsift/common/warp_bitonic_sort.h
+++ b/src/popsift/common/warp_bitonic_sort.h
@@ -66,8 +66,8 @@ class Warp32
                                          : ( my_val < other_val );
         const bool must_swap   = not ( my_more ^ reverse ^ increasing );
 
-        return ( must_swap ? popsift::shuffle_xor( my_index, 1 << shift )
-                           : my_index );
+        int lane = must_swap ? ( 1 << shift ) : 0;
+        return popsift::shuffle_xor( my_index, lane );
     }
 
     __device__ inline
diff --git a/src/popsift/features.cu b/src/popsift/features.cu
old mode 100755
new mode 100644
diff --git a/src/popsift/features.h b/src/popsift/features.h
old mode 100755
new mode 100644
diff --git a/src/popsift/gauss_filter.cu b/src/popsift/gauss_filter.cu
old mode 100755
new mode 100644
diff --git a/src/popsift/gauss_filter.h b/src/popsift/gauss_filter.h
old mode 100755
new mode 100644
diff --git a/src/popsift/popsift.cpp b/src/popsift/popsift.cpp
old mode 100755
new mode 100644
diff --git a/src/popsift/popsift.h b/src/popsift/popsift.h
old mode 100755
new mode 100644
diff --git a/src/popsift/regression/bitosort.h b/src/popsift/regression/bitosort.h
new file mode 100644
index 00000000..a6b97fbc
--- /dev/null
+++ b/src/popsift/regression/bitosort.h
@@ -0,0 +1,95 @@
+/*
+ * Copyright 2016, Simula Research Laboratory
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+#pragma once
+
+#include <cuda_runtime.h>
+#include <cooperative_groups.h>
+#include <iso646.h>
+#include <stdio.h>
+
+#include "../common/assist.h"
+
+using namespace cooperative_groups;
+
+namespace popsift {
+namespace BitonicSort {
+
+template<class T>
+class Warp32
+{
+    T*  _array;
+public:
+    __device__ inline
+    Warp32( T* array ) : _array( array ) { }
+
+    __device__ inline
+    int sort32( int my_index )
+    {
+        thread_block_tile<32> tile32 = tiled_partition<32>( this_thread_block() );
+
+        for( int outer=0; outer<5; outer++ ) {
+            for( int inner=outer; inner>=0; inner-- ) {
+                my_index = shiftit( tile32, my_index, inner, outer+1, false );
+            }
+        }
+        return my_index;
+    }
+
+    __device__ inline
+    void sort64( int2& my_indeces )
+    {
+        thread_block_tile<32> tile32 = tiled_partition<32>( this_thread_block() );
+
+        for( int outer=0; outer<5; outer++ ) {
+            for( int inner=outer; inner>=0; inner-- ) {
+                my_indeces.x = shiftit( tile32, my_indeces.x, inner, outer+1, false );
+                my_indeces.y = shiftit( tile32, my_indeces.y, inner, outer+1, true );
+            }
+        }
+
+        if( _array[my_indeces.x] < _array[my_indeces.y] ) swap( my_indeces.x, my_indeces.y );
+
+        for( int outer=0; outer<5; outer++ ) {
+            for( int inner=outer; inner>=0; inner-- ) {
+                my_indeces.x = shiftit( tile32, my_indeces.x, inner, outer+1, false );
+                my_indeces.y = shiftit( tile32, my_indeces.y, inner, outer+1, false );
+            }
+        }
+    }
+
+private:
+    __device__ inline
+    int shiftit( thread_block_tile<32>& tile32,
+                 const int my_index,
+                 const int shift, const int direction, const bool increasing )
+    {
+        const T    my_val      = _array[my_index];
+        const T    other_val   = tile32.shfl_xor( my_val, 1 << shift ); // popsift::shuffle_xor( my_val, 1 << shift );
+        const bool reverse     = ( threadIdx.x & ( 1 << direction ) );
+        const bool id_less     = ( ( threadIdx.x & ( 1 << shift ) ) == 0 );
+        const bool my_more     = id_less ? ( my_val > other_val )
+                                         : ( my_val < other_val );
+        const bool must_swap   = not ( my_more ^ reverse ^ increasing );
+
+        // return ( must_swap ? popsift::shuffle_xor( my_index, 1 << shift ) : my_index );
+        int lane = must_swap ? ( 1 << shift ) : 0;
+        int retval = tile32.shfl_xor( my_index, lane );
+        return retval;
+    }
+
+    __device__ inline
+    void swap( int& l, int& r )
+    {
+        int m = r;
+        r = l;
+        l = m;
+    }
+};
+} // namespace popsift
+} // namespace BitonicSort
+
diff --git a/src/popsift/regression/test_radix_sort.cu b/src/popsift/regression/test_radix_sort.cu
new file mode 100644
index 00000000..b6252f9e
--- /dev/null
+++ b/src/popsift/regression/test_radix_sort.cu
@@ -0,0 +1,52 @@
+#include <cuda_runtime.h>
+#include "../common/assist.h"
+#include "bitosort.h"
+#include "test_radix_sort.h"
+
+namespace TestRadix
+{
+__device__ __managed__ int buffer[64];
+
+__shared__ int sh_val[64];
+
+__host__ void push( int* b )
+{
+    for( int i=0; i<64; i++ )
+        buffer[i] = b[i];
+}
+
+__host__ void pull( int* b )
+{
+    for( int i=0; i<64; i++ )
+        b[i] = buffer[i];
+}
+
+__global__ void gpuCallSort( )
+{
+    int x = threadIdx.x;
+
+    sh_val[x]    = buffer[x];
+    sh_val[x+32] = buffer[x+32];
+    __syncthreads();
+
+    int2 best_index = make_int2( threadIdx.x, threadIdx.x + 32 );
+
+    popsift::BitonicSort::Warp32<int> sorter( sh_val );
+    sorter.sort64( best_index );
+    // sorter.sort32( threadIdx.x );
+    __syncthreads();
+
+    buffer[x]    = sh_val[best_index.x];
+    buffer[x+32] = sh_val[best_index.y];
+}
+
+__host__ void callSort( )
+{
+    dim3 block( 32, 1, 1 );
+
+    gpuCallSort<<<1,block>>>( );
+    cudaDeviceSynchronize();
+}
+
+};
+
diff --git a/src/popsift/regression/test_radix_sort.h b/src/popsift/regression/test_radix_sort.h
new file mode 100644
index 00000000..29f1c6e3
--- /dev/null
+++ b/src/popsift/regression/test_radix_sort.h
@@ -0,0 +1,9 @@
+#include <cuda_runtime.h>
+
+namespace TestRadix
+{
+__host__ void push( int* b );
+__host__ void pull( int* b );
+__host__ void callSort( );
+};
+
diff --git a/src/popsift/s_desc_loop.cu b/src/popsift/s_desc_loop.cu
index e6491a95..8581bc1d 100644
--- a/src/popsift/s_desc_loop.cu
+++ b/src/popsift/s_desc_loop.cu
@@ -76,46 +76,49 @@ void ext_desc_loop_sub( const float         ang,
 
     float dpt[9] = { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
 
-    for( int i = threadIdx.x; i < loops; i+=blockDim.x )
+    for( int i = threadIdx.x; popsift::any(i < loops); i+=blockDim.x )
     {
-        const int ii = i / wx + ymin;
-        const int jj = i % wx + xmin;     
-
-        const float2 d = make_float2( jj - ptx, ii - pty );
-
-        // const float nx = crsbp * dx + srsbp * dy;
-        // const float ny = crsbp * dy - srsbp * dx;
-        const float2 n = make_float2( ::fmaf( crsbp, d.x,  srsbp * d.y ),
-                                      ::fmaf( crsbp, d.y, -srsbp * d.x ) );
-        const float2 nn = abs(n);
-        if (nn.x < 1.0f && nn.y < 1.0f) {
-            float mod;
-            float th;
-            get_gradiant( mod, th, jj, ii, layer_tex, level );
-
-            const float2 dn = n + offsetpt;
-            const float  ww = __expf( -scalbnf(dn.x*dn.x + dn.y*dn.y, -3));
-            // const float ww  = __expf(-0.125f * (dnx*dnx + dny*dny)); // speedup !
-            const float2 w  = make_float2( 1.0f - nn.x,
+        if( i < loops )
+        {
+            const int ii = i / wx + ymin;
+            const int jj = i % wx + xmin;     
+
+            const float2 d = make_float2( jj - ptx, ii - pty );
+
+            // const float nx = crsbp * dx + srsbp * dy;
+            // const float ny = crsbp * dy - srsbp * dx;
+            const float2 n = make_float2( ::fmaf( crsbp, d.x,  srsbp * d.y ),
+                                          ::fmaf( crsbp, d.y, -srsbp * d.x ) );
+            const float2 nn = abs(n);
+            if (nn.x < 1.0f && nn.y < 1.0f) {
+                float mod;
+                float th;
+                get_gradiant( mod, th, jj, ii, layer_tex, level );
+
+                const float2 dn = n + offsetpt;
+                const float  ww = __expf( -scalbnf(dn.x*dn.x + dn.y*dn.y, -3));
+                // const float ww  = __expf(-0.125f * (dnx*dnx + dny*dny)); // speedup !
+                const float2 w  = make_float2( 1.0f - nn.x,
                                            1.0f - nn.y );
-            const float wgt = ww * w.x * w.y * mod;
-
-            th -= ang;
-            th += ( th <  0.0f  ? M_PI2 : 0.0f ); //  if (th <  0.0f ) th += M_PI2;
-            th -= ( th >= M_PI2 ? M_PI2 : 0.0f ); //  if (th >= M_PI2) th -= M_PI2;
-
-            const float tth  = __fmul_ru( th, M_4RPI ); // th * M_4RPI;
-            const int   fo0  = (int)floorf(tth);
-            const float do0  = tth - fo0;             
-            const float wgt1 = 1.0f - do0;
-            const float wgt2 = do0;
-
-            int fo  = fo0 % DESC_BINS;
-
-                // maf: multiply-add
-                // _ru - round to positive infinity equiv to froundf since always >=0
-            dpt[fo]   = __fmaf_ru( wgt1, wgt, dpt[fo] );   // dpt[fo]   += (wgt1*wgt);
-            dpt[fo+1] = __fmaf_ru( wgt2, wgt, dpt[fo+1] ); // dpt[fo+1] += (wgt2*wgt);
+                const float wgt = ww * w.x * w.y * mod;
+
+                th -= ang;
+                th += ( th <  0.0f  ? M_PI2 : 0.0f ); //  if (th <  0.0f ) th += M_PI2;
+                th -= ( th >= M_PI2 ? M_PI2 : 0.0f ); //  if (th >= M_PI2) th -= M_PI2;
+
+                const float tth  = __fmul_ru( th, M_4RPI ); // th * M_4RPI;
+                const int   fo0  = (int)floorf(tth);
+                const float do0  = tth - fo0;             
+                const float wgt1 = 1.0f - do0;
+                const float wgt2 = do0;
+
+                int fo  = fo0 % DESC_BINS;
+    
+                    // maf: multiply-add
+                    // _ru - round to positive infinity equiv to froundf since always >=0
+                dpt[fo]   = __fmaf_ru( wgt1, wgt, dpt[fo] );   // dpt[fo]   += (wgt1*wgt);
+                dpt[fo+1] = __fmaf_ru( wgt2, wgt, dpt[fo+1] ); // dpt[fo+1] += (wgt2*wgt);
+            }
         }
         __syncthreads();
     }
diff --git a/src/popsift/s_image.cu b/src/popsift/s_image.cu
old mode 100755
new mode 100644
diff --git a/src/popsift/s_image.h b/src/popsift/s_image.h
old mode 100755
new mode 100644
diff --git a/src/popsift/s_orientation.cu b/src/popsift/s_orientation.cu
index 64ac5863..ab11afd4 100644
--- a/src/popsift/s_orientation.cu
+++ b/src/popsift/s_orientation.cu
@@ -71,10 +71,10 @@ void ori_par( const int           octave,
     const int              iext_off =  dobuf.i_ext_off[octave][extremum_index];
     const InitialExtremum* iext     = &dobuf.i_ext_dat[octave][iext_off];
 
-    __shared__ float hist   [ORI_NBINS];
-    __shared__ float sm_hist[ORI_NBINS];
+    __shared__ float _sh_hist   [ORI_NBINS];
+    __shared__ float _sh_sm_hist[ORI_NBINS];
 
-    for( int i = threadIdx.x; i < ORI_NBINS; i += blockDim.x )  hist[i] = 0.0f;
+    for( int i = threadIdx.x; i < ORI_NBINS; i += blockDim.x )  _sh_hist[i] = 0.0f;
 
     /* keypoint fractional geometry */
     const float x     = iext->xpos;
@@ -133,7 +133,7 @@ void ori_par( const int           octave,
 
                 bidx = (bidx == ORI_NBINS) ? 0 : bidx;
 
-                atomicAdd( &hist[bidx], weight );
+                atomicAdd( &_sh_hist[bidx], weight );
             }
         }
         __syncthreads();
@@ -144,18 +144,18 @@ void ori_par( const int           octave,
         for( int bin = threadIdx.x; bin < ORI_NBINS; bin += blockDim.x ) {
             int prev = bin == 0 ? ORI_NBINS-1 : bin-1;
             int next = bin == ORI_NBINS-1 ? 0 : bin+1;
-            sm_hist[bin] = ( hist[prev] + hist[bin] + hist[next] ) / 3.0f;
+            _sh_sm_hist[bin] = ( _sh_hist[prev] + _sh_hist[bin] + _sh_hist[next] ) / 3.0f;
         }
         __syncthreads();
         for( int bin = threadIdx.x; bin < ORI_NBINS; bin += blockDim.x ) {
             int prev = bin == 0 ? ORI_NBINS-1 : bin-1;
             int next = bin == ORI_NBINS-1 ? 0 : bin+1;
-            hist[bin] = ( sm_hist[prev] + sm_hist[bin] + sm_hist[next] ) / 3.0f;
+            _sh_hist[bin] = ( _sh_sm_hist[prev] + _sh_sm_hist[bin] + _sh_sm_hist[next] ) / 3.0f;
         }
         __syncthreads();
     }
     for( int bin = threadIdx.x; bin < ORI_NBINS; bin += blockDim.x ) {
-        sm_hist[bin] = hist[bin];
+        _sh_sm_hist[bin] = _sh_hist[bin];
     }
     __syncthreads();
 #else // not WITH_VLFEAT_SMOOTHING
@@ -168,51 +168,51 @@ void ori_par( const int           octave,
         if( prev1 < 0 )          prev1 += ORI_NBINS;
         if( next1 >= ORI_NBINS ) next1 -= ORI_NBINS;
         if( next2 >= ORI_NBINS ) next2 -= ORI_NBINS;
-        sm_hist[bin] = (   hist[prev2] + hist[next2]
-                         + ( hist[prev1] + hist[next1] ) * 4.0f
-                         +   hist[bin] * 6.0f ) / 16.0f;
+        _sh_sm_hist[bin] = ( _sh_hist[prev2] + _sh_hist[next2]
+                         + ( _sh_hist[prev1] + _sh_hist[next1] ) * 4.0f
+                         +   _sh_hist[bin] * 6.0f ) / 16.0f;
     }
     __syncthreads();
 #endif // not WITH_VLFEAT_SMOOTHING
 
     // sub-cell refinement of the histogram cell index, yielding the angle
     // not necessary to initialize, every cell is computed
-    __shared__ float refined_angle[64];
-    __shared__ float yval         [64];
+    __shared__ float _sh_refined_angle[64];
+    __shared__ float _sh_yval         [64];
 
     for( int bin = threadIdx.x; popsift::any( bin < ORI_NBINS ); bin += blockDim.x ) {
         const int prev = bin == 0 ? ORI_NBINS-1 : bin-1;
         const int next = bin == ORI_NBINS-1 ? 0 : bin+1;
 
-        bool predicate = ( bin < ORI_NBINS ) && ( sm_hist[bin] > max( sm_hist[prev], sm_hist[next] ) );
+        bool predicate = ( bin < ORI_NBINS ) && ( _sh_sm_hist[bin] > max( _sh_sm_hist[prev], _sh_sm_hist[next] ) );
 
-        const float num  = predicate ?   3.0f * sm_hist[prev]
-                                       - 4.0f * sm_hist[bin]
-                                       + 1.0f * sm_hist[next]
+        const float num  = predicate ?   3.0f * _sh_sm_hist[prev]
+                                       - 4.0f * _sh_sm_hist[bin]
+                                       + 1.0f * _sh_sm_hist[next]
                                      : 0.0f;
-        // const float num  = predicate ?   2.0f * sm_hist[prev]
-        //                                - 4.0f * sm_hist[bin]
-        //                                + 2.0f * sm_hist[next]
+        // const float num  = predicate ?   2.0f * _sh_sm_hist[prev]
+        //                                - 4.0f * _sh_sm_hist[bin]
+        //                                + 2.0f * _sh_sm_hist[next]
         //                              : 0.0f;
-        const float denB = predicate ? 2.0f * ( sm_hist[prev] - 2.0f * sm_hist[bin] + sm_hist[next] ) : 1.0f;
+        const float denB = predicate ? 2.0f * ( _sh_sm_hist[prev] - 2.0f * _sh_sm_hist[bin] + _sh_sm_hist[next] ) : 1.0f;
 
         const float newbin = __fdividef( num, denB ); // verified: accuracy OK
 
         predicate   = ( predicate && newbin >= 0.0f && newbin <= 2.0f );
 
-        refined_angle[bin] = predicate ? prev + newbin : -1;
-        yval[bin]          = predicate ?  -(num*num) / (4.0f * denB) + sm_hist[prev] : -INFINITY;
+        _sh_refined_angle[bin] = predicate ? prev + newbin : -1;
+        _sh_yval[bin]          = predicate ?  -(num*num) / (4.0f * denB) + _sh_sm_hist[prev] : -INFINITY;
     }
 
     int2 best_index = make_int2( threadIdx.x, threadIdx.x + 32 );
 
-    BitonicSort::Warp32<float> sorter( yval );
+    BitonicSort::Warp32<float> sorter( _sh_yval );
     sorter.sort64( best_index );
     __syncthreads();
 
     // All threads retrieve the yval of thread 0, the largest
     // of all yvals.
-    const float best_val = yval[best_index.x];
+    const float best_val = _sh_yval[best_index.x];
     const float yval_ref = 0.8f * popsift::shuffle( best_val, 0 );
     const bool  valid    = ( best_val >= yval_ref );
     bool        written  = false;
@@ -221,7 +221,7 @@ void ori_par( const int           octave,
 
     if( threadIdx.x < ORIENTATION_MAX_COUNT ) {
         if( valid ) {
-            float chosen_bin = refined_angle[best_index.x];
+            float chosen_bin = _sh_refined_angle[best_index.x];
             if( chosen_bin >= ORI_NBINS ) chosen_bin -= ORI_NBINS;
             // float th = __fdividef(M_PI2 * chosen_bin , ORI_NBINS) - M_PI;
             float th = ::fmaf( M_PI2 * chosen_bin, 1.0f/ORI_NBINS, - M_PI );
diff --git a/src/popsift/s_pyramid_build.cu b/src/popsift/s_pyramid_build.cu
old mode 100755
new mode 100644
diff --git a/src/popsift/s_pyramid_build_aa.cu b/src/popsift/s_pyramid_build_aa.cu
old mode 100755
new mode 100644
diff --git a/src/popsift/s_pyramid_build_aa.h b/src/popsift/s_pyramid_build_aa.h
old mode 100755
new mode 100644
diff --git a/src/popsift/s_pyramid_build_ai.cu b/src/popsift/s_pyramid_build_ai.cu
old mode 100755
new mode 100644
diff --git a/src/popsift/s_pyramid_build_ai.h b/src/popsift/s_pyramid_build_ai.h
old mode 100755
new mode 100644
diff --git a/src/popsift/s_pyramid_build_ra.cu b/src/popsift/s_pyramid_build_ra.cu
old mode 100755
new mode 100644
diff --git a/src/popsift/s_pyramid_build_ra.h b/src/popsift/s_pyramid_build_ra.h
old mode 100755
new mode 100644
diff --git a/src/popsift/s_pyramid_fixed.cu b/src/popsift/s_pyramid_fixed.cu
old mode 100755
new mode 100644
diff --git a/src/popsift/s_solve.h b/src/popsift/s_solve.h
old mode 100755
new mode 100644
diff --git a/src/popsift/sift_constants.cu b/src/popsift/sift_constants.cu
old mode 100755
new mode 100644
diff --git a/src/popsift/sift_constants.h b/src/popsift/sift_constants.h
old mode 100755
new mode 100644
diff --git a/src/popsift/sift_extremum.cu b/src/popsift/sift_extremum.cu
old mode 100755
new mode 100644
diff --git a/src/popsift/sift_extremum.h b/src/popsift/sift_extremum.h
old mode 100755
new mode 100644
diff --git a/src/popsift/sift_octave.cu b/src/popsift/sift_octave.cu
old mode 100755
new mode 100644
diff --git a/src/popsift/sift_octave.h b/src/popsift/sift_octave.h
old mode 100755
new mode 100644
diff --git a/src/popsift/sift_pyramid.cu b/src/popsift/sift_pyramid.cu
old mode 100755
new mode 100644
diff --git a/src/popsift/sift_pyramid.h b/src/popsift/sift_pyramid.h
old mode 100755
new mode 100644