Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dev/fix new warp #66

Closed
wants to merge 8 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 7 additions & 3 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# CMake below 3.4 does not work with CUDA separable compilation at all
cmake_minimum_required(VERSION 3.4)
cmake_minimum_required(VERSION 3.7)

project(PopSift VERSION 1.0.0)

Expand Down Expand Up @@ -72,7 +72,11 @@ endif()
# Default setting of the CUDA CC versions to compile.
# Shortening the lists saves a lot of compile time.
#
if(CUDA_VERSION_MAJOR GREATER 7)
if(CUDA_VERSION_MAJOR GREATER_EQUAL 10)
set(PopSift_CUDA_CC_LIST_BASIC 30 35 50 52 60 61 62 70 72 75)
elseif(CUDA_VERSION_MAJOR GREATER_EQUAL 9)
set(PopSift_CUDA_CC_LIST_BASIC 30 35 50 52 60 61 62 70 72)
elseif(CUDA_VERSION_MAJOR GREATER_EQUAL 8)
set(PopSift_CUDA_CC_LIST_BASIC 30 35 50 52 60 61 62)
else()
set(PopSift_CUDA_CC_LIST_BASIC 30 35 50 52 )
Expand All @@ -91,7 +95,7 @@ endif()
set(CUDA_SEPARABLE_COMPILATION ON)

if(UNIX AND NOT APPLE)
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};-Xcompiler;-rdynamic;-lineinfo")
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};-Xcompiler;-rdynamic")
# set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};-Xptxas;-v")
# set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};-Xptxas;-warn-double-usage")
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};--keep")
Expand Down
6 changes: 6 additions & 0 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,11 @@ set(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR})

CUDA_INCLUDE_DIRECTORIES(${Boost_INCLUDE_DIRS} ${CMAKE_CURRENT_BINARY_DIR}/popsift)

if(CUDA_VERSION_MAJOR GREATER_EQUAL 9)
# regression test for radix sort
set(REGRESSION_CODE popsift/regression/test_radix_sort.cu popsift/regression/test_radix_sort.h)
endif()

CUDA_ADD_LIBRARY(popsift
popsift/popsift.cpp popsift/popsift.h
popsift/features.cu popsift/features.h
Expand Down Expand Up @@ -31,6 +36,7 @@ CUDA_ADD_LIBRARY(popsift
popsift/s_desc_normalize.h
popsift/s_gradiant.h
popsift/s_solve.h
${REGRESSION_CODE}
popsift/common/assist.cu popsift/common/assist.h
popsift/common/clamp.h
popsift/common/plane_2d.cu popsift/common/plane_2d.h
Expand Down
20 changes: 18 additions & 2 deletions src/application/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
cmake_minimum_required(VERSION 3.0)
project(PopsiftDemo)
cmake_minimum_required(VERSION 3.7)

if(TARGET popsift)
# when compiled in the repository the target is already defined
Expand Down Expand Up @@ -62,6 +61,23 @@ target_link_libraries(popsift-match PUBLIC PopSift::popsift ${PD_LINK_LIBS})

set_target_properties(popsift-match PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}" )

if(CUDA_VERSION_MAJOR GREATER_EQUAL 9)
#############################################################
# test_radix_sort
#############################################################

add_executable(test_radix_sort test_radix_sort.cpp)

set_property(TARGET test_radix_sort PROPERTY CXX_STANDARD 11)

target_compile_options(test_radix_sort PRIVATE ${PD_COMPILE_OPTIONS} )
target_include_directories(test_radix_sort PUBLIC ${PD_INCLUDE_DIRS})
target_compile_definitions(test_radix_sort PRIVATE ${Boost_DEFINITIONS} BOOST_ALL_DYN_LINK BOOST_ALL_NO_LIB)
target_link_libraries(test_radix_sort PUBLIC PopSift::popsift ${PD_LINK_LIBS})

set_target_properties(test_radix_sort PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}" )
endif(CUDA_VERSION_MAJOR GREATER_EQUAL 9)

#############################################################
# installation
#############################################################
Expand Down
46 changes: 46 additions & 0 deletions src/application/test_radix_sort.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
#include <iostream>
#include <vector>
#include <algorithm>
#include "popsift/regression/test_radix_sort.h"
#include "popsift/common/device_prop.h"

std::vector<int> the_list(64);
int buffer[64];

int main()
{
std::cout << "To test a specific NVIDIA card in your system:" << std::endl
<< "export NVIDIA_VISIBLE_DEVICES=1" << std::endl
<< "export CUDA_VISIBLE_DEVICES=<int>" << std::endl
<< std::endl;

popsift::cuda::device_prop_t deviceInfo;
deviceInfo.set( 0, true );
deviceInfo.print( );

for( int i=0; i<64; i++ ) the_list[i] = 100-i;

for( int i=0; i<500; i++ )
std::next_permutation( the_list.begin(), the_list.end() );
std::reverse( the_list.begin(), the_list.end() );

for( int i=0; i<64; i++ )
{
buffer[i] = the_list[i];
std::cout << buffer[i] << " ";
}
std::cout << std::endl;

TestRadix::push( buffer );

TestRadix::callSort();

TestRadix::pull( buffer );

for( int i=0; i<64; i++ )
{
std::cout << buffer[i] << " ";
}
std::cout << std::endl;
}

6 changes: 6 additions & 0 deletions src/popsift/common/assist.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,9 @@ template<typename T> __device__ inline T shuffle_xor ( T variable, int delta ) {
__device__ inline unsigned int ballot( unsigned int pred ) { return __ballot_sync ( 0xffffffff, pred ); }
__device__ inline int any ( unsigned int pred ) { return __any_sync ( 0xffffffff, pred ); }
__device__ inline int all ( unsigned int pred ) { return __all_sync ( 0xffffffff, pred ); }
__device__ inline void syncthreads() { __syncthreads(); }
__device__ inline void syncwarp() { __syncwarp( 0xffffffff ); }
__device__ inline unsigned int activemask() { return __activemask(); }

template<typename T> __device__ inline T shuffle ( T variable, int src , int ws ) { return __shfl_sync ( 0xffffffff, variable, src , ws ); }
template<typename T> __device__ inline T shuffle_up ( T variable, int delta, int ws ) { return __shfl_up_sync ( 0xffffffff, variable, delta, ws ); }
Expand All @@ -47,6 +50,9 @@ template<typename T> __device__ inline T shuffle_xor ( T variable, int delta ) {
__device__ inline unsigned int ballot( unsigned int pred ) { return __ballot ( pred ); }
__device__ inline int any ( unsigned int pred ) { return __any ( pred ); }
__device__ inline int all ( unsigned int pred ) { return __all ( pred ); }
__device__ inline void syncthreads() { __syncthreads(); }
__device__ inline void syncwarp() { }
__device__ inline unsigned int activemask() { return 0xffffffff; }

template<typename T> __device__ inline T shuffle ( T variable, int src , int ws ) { return __shfl ( variable, src , ws ); }
template<typename T> __device__ inline T shuffle_up ( T variable, int delta, int ws ) { return __shfl_up ( variable, delta, ws ); }
Expand Down
4 changes: 2 additions & 2 deletions src/popsift/common/warp_bitonic_sort.h
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,8 @@ class Warp32
: ( my_val < other_val );
const bool must_swap = not ( my_more ^ reverse ^ increasing );

return ( must_swap ? popsift::shuffle_xor( my_index, 1 << shift )
: my_index );
int lane = must_swap ? ( 1 << shift ) : 0;
return popsift::shuffle_xor( my_index, lane );
}

__device__ inline
Expand Down
Empty file modified src/popsift/features.cu
100755 → 100644
Empty file.
Empty file modified src/popsift/features.h
100755 → 100644
Empty file.
Empty file modified src/popsift/gauss_filter.cu
100755 → 100644
Empty file.
Empty file modified src/popsift/gauss_filter.h
100755 → 100644
Empty file.
Empty file modified src/popsift/popsift.cpp
100755 → 100644
Empty file.
Empty file modified src/popsift/popsift.h
100755 → 100644
Empty file.
95 changes: 95 additions & 0 deletions src/popsift/regression/bitosort.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
/*
* Copyright 2016, Simula Research Laboratory
*
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
*/
#pragma once

#include <cuda_runtime.h>
#include <cooperative_groups.h>
#include <iso646.h>
#include <stdio.h>

#include "../common/assist.h"

using namespace cooperative_groups;

namespace popsift {
namespace BitonicSort {

template<class T>
class Warp32
{
T* _array;
public:
__device__ inline
Warp32( T* array ) : _array( array ) { }

__device__ inline
int sort32( int my_index )
{
thread_block_tile<32> tile32 = tiled_partition<32>( this_thread_block() );

for( int outer=0; outer<5; outer++ ) {
for( int inner=outer; inner>=0; inner-- ) {
my_index = shiftit( tile32, my_index, inner, outer+1, false );
}
}
return my_index;
}

__device__ inline
void sort64( int2& my_indeces )
{
thread_block_tile<32> tile32 = tiled_partition<32>( this_thread_block() );

for( int outer=0; outer<5; outer++ ) {
for( int inner=outer; inner>=0; inner-- ) {
my_indeces.x = shiftit( tile32, my_indeces.x, inner, outer+1, false );
my_indeces.y = shiftit( tile32, my_indeces.y, inner, outer+1, true );
}
}

if( _array[my_indeces.x] < _array[my_indeces.y] ) swap( my_indeces.x, my_indeces.y );

for( int outer=0; outer<5; outer++ ) {
for( int inner=outer; inner>=0; inner-- ) {
my_indeces.x = shiftit( tile32, my_indeces.x, inner, outer+1, false );
my_indeces.y = shiftit( tile32, my_indeces.y, inner, outer+1, false );
}
}
}

private:
__device__ inline
int shiftit( thread_block_tile<32>& tile32,
const int my_index,
const int shift, const int direction, const bool increasing )
{
const T my_val = _array[my_index];
const T other_val = tile32.shfl_xor( my_val, 1 << shift ); // popsift::shuffle_xor( my_val, 1 << shift );
const bool reverse = ( threadIdx.x & ( 1 << direction ) );
const bool id_less = ( ( threadIdx.x & ( 1 << shift ) ) == 0 );
const bool my_more = id_less ? ( my_val > other_val )
: ( my_val < other_val );
const bool must_swap = not ( my_more ^ reverse ^ increasing );

// return ( must_swap ? popsift::shuffle_xor( my_index, 1 << shift ) : my_index );
int lane = must_swap ? ( 1 << shift ) : 0;
int retval = tile32.shfl_xor( my_index, lane );
return retval;
}

__device__ inline
void swap( int& l, int& r )
{
int m = r;
r = l;
l = m;
}
};
} // namespace popsift
} // namespace BitonicSort

52 changes: 52 additions & 0 deletions src/popsift/regression/test_radix_sort.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
#include <cuda_runtime.h>
#include "../common/assist.h"
#include "bitosort.h"
#include "test_radix_sort.h"

namespace TestRadix
{
__device__ __managed__ int buffer[64];

__shared__ int sh_val[64];

__host__ void push( int* b )
{
for( int i=0; i<64; i++ )
buffer[i] = b[i];
}

__host__ void pull( int* b )
{
for( int i=0; i<64; i++ )
b[i] = buffer[i];
}

__global__ void gpuCallSort( )
{
int x = threadIdx.x;

sh_val[x] = buffer[x];
sh_val[x+32] = buffer[x+32];
__syncthreads();

int2 best_index = make_int2( threadIdx.x, threadIdx.x + 32 );

popsift::BitonicSort::Warp32<int> sorter( sh_val );
sorter.sort64( best_index );
// sorter.sort32( threadIdx.x );
__syncthreads();

buffer[x] = sh_val[best_index.x];
buffer[x+32] = sh_val[best_index.y];
}

__host__ void callSort( )
{
dim3 block( 32, 1, 1 );

gpuCallSort<<<1,block>>>( );
cudaDeviceSynchronize();
}

};

9 changes: 9 additions & 0 deletions src/popsift/regression/test_radix_sort.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#include <cuda_runtime.h>

namespace TestRadix
{
__host__ void push( int* b );
__host__ void pull( int* b );
__host__ void callSort( );
};

Loading