diff --git a/CHANGES.md b/CHANGES.md index 9608b25d..1ec43666 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -24,10 +24,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Support for Cuda CC 7 cards (RTX 2080) [PR](https://github.com/alicevision/popsift/pull/67) - Support for Boost 1.70 [PR](https://github.com/alicevision/popsift/pull/65) - Support for device selection and multiple GPUs [PR](https://github.com/alicevision/popsift/pull/121) +- Test: adding descriptor comparator [PR](https://github.com/alicevision/popsift/pull/133) +- Support descriptor computation compliant with VLFeat [PR](https://github.com/alicevision/popsift/pull/133) +- Support for ignoring Edge and Peak Thresholds [PR](https://github.com/alicevision/popsift/pull/133) +- Support for grid filtering on CUDA Managed Memory and sorting initial extrema on the CPU side [PR](https://github.com/alicevision/popsift/pull/133) ### Fixed - CMake: fixes to allow building on Windows using vcpkg [PR](https://github.com/alicevision/popsift/pull/92) - Fix race condition [PR](https://github.com/alicevision/popsift/pull/82) +- Fixed a bug in L2 normalization [PR](https://github.com/alicevision/popsift/pull/133) ### Changed - Improved resource releasing [PR](https://github.com/alicevision/popsift/pull/71) diff --git a/CMakeLists.txt b/CMakeLists.txt index f39f2fec..ccb416f5 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -145,6 +145,8 @@ if(MSVC) list(APPEND CUDA_NVCC_FLAGS -Xcompiler ${PopSift_MVSC_LINKER}) endif() +list(APPEND CUDA_NVCC_FLAGS "-lineinfo") + # default stream per-thread implies that each host thread has one non-synchronizing 0-stream # currently, the code requires legacy mode list(APPEND CUDA_NVCC_FLAGS "--default-stream;legacy") @@ -181,9 +183,11 @@ else() endif() if(CUDA_VERSION VERSION_GREATER_EQUAL "9.0") - set(HAVE_SHFL_DOWN_SYNC 1) + set(PopSift_HAVE_SHFL_DOWN_SYNC 1) + set(PopSift_HAVE_COOPERATIVE_GROUPS 1) else() - set(HAVE_SHFL_DOWN_SYNC 0) + set(PopSift_HAVE_SHFL_DOWN_SYNC 0) + set(PopSift_HAVE_COOPERATIVE_GROUPS 0) endif() if(NOT PopSift_USE_GRID_FILTER) diff --git a/cmake/sift_config.h.in b/cmake/sift_config.h.in index 427cfe42..beeed700 100644 --- a/cmake/sift_config.h.in +++ b/cmake/sift_config.h.in @@ -8,11 +8,13 @@ #pragma once -#define POPSIFT_IS_DEFINED(F) F() == 1 +#define POPSIFT_IS_DEFINED(F) F() == 1 +#define POPSIFT_IS_UNDEFINED(F) F() == 0 -#define POPSIFT_HAVE_SHFL_DOWN_SYNC() @HAVE_SHFL_DOWN_SYNC@ -#define POPSIFT_HAVE_NORMF() @PopSift_HAVE_NORMF@ -#define POPSIFT_DISABLE_GRID_FILTER() @DISABLE_GRID_FILTER@ -#define POPSIFT_USE_NVTX() @PopSift_USE_NVTX@ +#define POPSIFT_HAVE_SHFL_DOWN_SYNC() @PopSift_HAVE_SHFL_DOWN_SYNC@ +#define POPSIFT_HAVE_NORMF() @PopSift_HAVE_NORMF@ +#define POPSIFT_DISABLE_GRID_FILTER() @DISABLE_GRID_FILTER@ +#define POPSIFT_USE_NVTX() @PopSift_USE_NVTX@ +#define POPSIFT_HAVE_COOPERATIVE_GROUPS() @PopSift_HAVE_COOPERATIVE_GROUPS@ diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 0380dd41..a6385fb8 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -5,6 +5,8 @@ CUDA_ADD_LIBRARY(popsift popsift/sift_constants.cu popsift/sift_constants.h popsift/sift_conf.cu popsift/sift_conf.h popsift/gauss_filter.cu popsift/gauss_filter.h + popsift/filtergrid.cu popsift/filtergrid.h + popsift/initial_extremum.h popsift/s_image.cu popsift/s_image.h popsift/sift_pyramid.cu popsift/sift_pyramid.h popsift/sift_octave.cu popsift/sift_octave.h @@ -16,17 +18,18 @@ CUDA_ADD_LIBRARY(popsift popsift/sift_extremum.h popsift/sift_extremum.cu popsift/s_extrema.cu popsift/s_orientation.cu - popsift/s_filtergrid.cu popsift/sift_desc.cu popsift/s_desc_loop.cu popsift/s_desc_loop.h popsift/s_desc_iloop.cu popsift/s_desc_iloop.h popsift/s_desc_grid.cu popsift/s_desc_grid.h popsift/s_desc_igrid.cu popsift/s_desc_igrid.h popsift/s_desc_notile.cu popsift/s_desc_notile.h + popsift/s_desc_vlfeat.cu popsift/s_desc_vlfeat.h popsift/s_desc_norm_rs.h popsift/s_desc_norm_l2.h popsift/s_desc_normalize.h popsift/s_gradiant.h + popsift/s_gradiant_cuda9plus.h popsift/s_solve.h popsift/common/assist.cu popsift/common/assist.h popsift/common/clamp.h @@ -149,3 +152,7 @@ if(PopSift_BUILD_EXAMPLES) add_subdirectory(application) endif() +if(PopSift_USE_TEST_CMD) + add_subdirectory(compareSiftFiles) +endif() + diff --git a/src/application/CMakeLists.txt b/src/application/CMakeLists.txt index 3b28cec8..737d472c 100755 --- a/src/application/CMakeLists.txt +++ b/src/application/CMakeLists.txt @@ -68,7 +68,7 @@ endif(PopSift_USE_NVTX_PROFILING) # popsift-demo ############################################################# -add_executable(popsift-demo main.cpp pgmread.cpp pgmread.h) +add_executable(popsift-demo main.cpp pgmread.cpp pgmread.h common_opt.cpp) set_property(TARGET popsift-demo PROPERTY CXX_STANDARD 11) @@ -82,7 +82,7 @@ target_link_libraries(popsift-demo PUBLIC PopSift::popsift ${PD_LINK_LIBS}) # popsift-match ############################################################# -add_executable(popsift-match match.cpp pgmread.cpp pgmread.h) +add_executable(popsift-match match.cpp pgmread.cpp pgmread.h common_opt.cpp) set_property(TARGET popsift-match PROPERTY CXX_STANDARD 11) diff --git a/src/application/common_opt.cpp b/src/application/common_opt.cpp new file mode 100755 index 00000000..229d35f2 --- /dev/null +++ b/src/application/common_opt.cpp @@ -0,0 +1,111 @@ +/* + * Copyright 2021, University of Oslo + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ +#include "common_opt.h" + +// #include +// #include +// #include +// #include +// #include + +// #include + +// #include +// #include +// #include +// #include +// #include +// #include +// #include +// #include +#include + +using namespace std; +using namespace boost::program_options; + +void option_init_parameters( popsift::Config& config, options_description& parameters ) +{ + parameters.add_options() + ("octaves", + value(&config.octaves)->default_value(config.getOctaves()), + "Number of octaves") + ("levels", + value(&config.levels)->default_value(config.getLevels()), + "Number of levels per octave") + ("sigma", + value()->notifier([&](float f) { config.setSigma(f); })->default_value(config.getSigma()), + "Initial sigma value") + ("threshold", + value()->notifier([&](float f) { config.setThreshold(f); })->default_value(config.getThreshold()), + popsift::Config::getPeakThreshUsage().c_str() ) + ("edge-threshold", + value()->notifier([&](float f) { config.setEdgeLimit(f); })->default_value(config.getEdgeLimit()), + popsift::Config::getEdgeThreshUsage().c_str() ) + ("edge-limit", + value()->notifier([&](float f) { config.setEdgeLimit(f); }), + "synonym to --edge-threshold" ) + ("downsampling", + value()->notifier([&](float f) { config.setDownsampling(f); })->default_value(config.getDownsampling()), + "Downscale width and height of input by 2^N") + ("initial-blur", + value()->notifier([&](float f) {config.setInitialBlur(f); })->default_value(config.getInitialBlur()), + "Assume initial blur, subtract when blurring first time"); +} + +void option_init_modes( popsift::Config& config, options_description& modes ) +{ + modes.add_options() + ( "gauss-mode", + value()->notifier([&](const std::string& s) { config.setGaussMode(s); }), + popsift::Config::getGaussModeUsage() ) + // "Choice of span (1-sided) for Gauss filters. Default is VLFeat-like computation depending on sigma. " + // "Options are: vlfeat, relative, relative-all, opencv, fixed9, fixed15" + ( "desc-mode", + value()->notifier([&](const std::string& s) { config.setDescMode(s); }), + popsift::Config::getDescModeUsage() ) + ( "popsift-mode", + bool_switch()->notifier([&](bool b) { if(b) config.setMode(popsift::Config::PopSift); }), + "During the initial upscale, shift pixels by 1. In extrema refinement, steps up to 0.6, do not reject points when reaching max iterations, " + "first contrast threshold is .8 * peak thresh. Shift feature coords octave 0 back to original pos.") + ( "vlfeat-mode", + bool_switch()->notifier([&](bool b) { if(b) config.setMode(popsift::Config::VLFeat); }), + "During the initial upscale, shift pixels by 1. That creates a sharper upscaled image. " + "In extrema refinement, steps up to 0.6, levels remain unchanged, " + "do not reject points when reaching max iterations, " + "first contrast threshold is .8 * peak thresh.") + ( "opencv-mode", + bool_switch()->notifier([&](bool b) { if(b) config.setMode(popsift::Config::OpenCV); }), + "During the initial upscale, shift pixels by 0.5. " + "In extrema refinement, steps up to 0.5, " + "reject points when reaching max iterations, " + "first contrast threshold is floor(.5 * peak thresh). " + "Computed filter width are lower than VLFeat/PopSift") + ( "direct-scaling", + bool_switch()->notifier([&](bool b) { if(b) config.setScalingMode(popsift::Config::ScaleDirect); }), + "Direct each octave from upscaled orig instead of blurred level.") + ( "norm-multi", + value()->notifier([&](int i) {config.setNormalizationMultiplier(i); }), + "Multiply the descriptor by pow(2,).") + ( "norm-mode", + value()->notifier([&](const std::string& s) { config.setNormMode(s); }), + popsift::Config::getNormModeUsage() ) + ( "root-sift", + bool_switch()->notifier([&](bool b) { if(b) config.setNormMode(popsift::Config::RootSift); }), + "synonym to --norm-mode=RootSift" ) + ( "filter-max-extrema", + value()->notifier([&](int f) {config.setFilterMaxExtrema(f); }), + "Approximate max number of extrema.") + ( "filter-grid", + value()->notifier([&](int f) {config.setFilterGridSize(f); }), + "Grid edge length for extrema filtering (ie. value 4 leads to a 4x4 grid)") + ( "filter-sort", + value()->notifier([&](const std::string& s) {config.setFilterSorting(s); }), + popsift::Config::getFilterGridModeUsage() ); + +} + diff --git a/src/application/common_opt.h b/src/application/common_opt.h new file mode 100755 index 00000000..1da06100 --- /dev/null +++ b/src/application/common_opt.h @@ -0,0 +1,18 @@ +/* + * Copyright 2021, University of Oslo + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ +#pragma once + +#include + +#include + +void option_init_parameters( popsift::Config& config, + boost::program_options::options_description& parameters ); +void option_init_modes( popsift::Config& config, + boost::program_options::options_description& modes ); + diff --git a/src/application/main.cpp b/src/application/main.cpp index 0eec1c22..31385992 100755 --- a/src/application/main.cpp +++ b/src/application/main.cpp @@ -29,6 +29,7 @@ #include #endif #include "pgmread.h" +#include "common_opt.h" #if POPSIFT_IS_DEFINED(POPSIFT_USE_NVTX) #include @@ -42,6 +43,7 @@ using namespace std; static bool print_dev_info = false; static bool print_time_info = false; static bool write_as_uchar = false; +static bool write_with_ori = false; static bool dont_write = false; static bool pgmread_loading = false; static bool float_mode = false; @@ -59,57 +61,13 @@ static void parseargs(int argc, char** argv, popsift::Config& config, string& in ("input-file,i", value(&inputFile)->required(), "Input file"); } + options_description parameters("Parameters"); - { - parameters.add_options() - ("octaves", value(&config.octaves), "Number of octaves") - ("levels", value(&config.levels), "Number of levels per octave") - ("sigma", value()->notifier([&](float f) { config.setSigma(f); }), "Initial sigma value") - - ("threshold", value()->notifier([&](float f) { config.setThreshold(f); }), "Contrast threshold") - ("edge-threshold", value()->notifier([&](float f) { config.setEdgeLimit(f); }), "On-edge threshold") - ("edge-limit", value()->notifier([&](float f) { config.setEdgeLimit(f); }), "On-edge threshold") - ("downsampling", value()->notifier([&](float f) { config.setDownsampling(f); }), "Downscale width and height of input by 2^N") - ("initial-blur", value()->notifier([&](float f) {config.setInitialBlur(f); }), "Assume initial blur, subtract when blurring first time"); - } options_description modes("Modes"); - { - modes.add_options() - ( "gauss-mode", value()->notifier([&](const std::string& s) { config.setGaussMode(s); }), - popsift::Config::getGaussModeUsage() ) - // "Choice of span (1-sided) for Gauss filters. Default is VLFeat-like computation depending on sigma. " - // "Options are: vlfeat, relative, relative-all, opencv, fixed9, fixed15" - ("desc-mode", value()->notifier([&](const std::string& s) { config.setDescMode(s); }), - "Choice of descriptor extraction modes:\n" - "loop, iloop, grid, igrid, notile\n" - "Default is loop\n" - "loop is OpenCV-like horizontal scanning, computing only valid points, grid extracts only useful points but rounds them, iloop uses linear texture and rotated gradiant fetching. igrid is grid with linear interpolation. notile is like igrid but avoids redundant gradiant fetching.") - ("popsift-mode", bool_switch()->notifier([&](bool b) { if(b) config.setMode(popsift::Config::PopSift); }), - "During the initial upscale, shift pixels by 1. In extrema refinement, steps up to 0.6, do not reject points when reaching max iterations, " - "first contrast threshold is .8 * peak thresh. Shift feature coords octave 0 back to original pos.") - ("vlfeat-mode", bool_switch()->notifier([&](bool b) { if(b) config.setMode(popsift::Config::VLFeat); }), - "During the initial upscale, shift pixels by 1. That creates a sharper upscaled image. " - "In extrema refinement, steps up to 0.6, levels remain unchanged, " - "do not reject points when reaching max iterations, " - "first contrast threshold is .8 * peak thresh.") - ("opencv-mode", bool_switch()->notifier([&](bool b) { if(b) config.setMode(popsift::Config::OpenCV); }), - "During the initial upscale, shift pixels by 0.5. " - "In extrema refinement, steps up to 0.5, " - "reject points when reaching max iterations, " - "first contrast threshold is floor(.5 * peak thresh). " - "Computed filter width are lower than VLFeat/PopSift") - ("direct-scaling", bool_switch()->notifier([&](bool b) { if(b) config.setScalingMode(popsift::Config::ScaleDirect); }), - "Direct each octave from upscaled orig instead of blurred level.") - ("norm-multi", value()->notifier([&](int i) {config.setNormalizationMultiplier(i); }), "Multiply the descriptor by pow(2,).") - ( "norm-mode", value()->notifier([&](const std::string& s) { config.setNormMode(s); }), - popsift::Config::getNormModeUsage() ) - ( "root-sift", bool_switch()->notifier([&](bool b) { if(b) config.setNormMode(popsift::Config::RootSift); }), - popsift::Config::getNormModeUsage() ) - ("filter-max-extrema", value()->notifier([&](int f) {config.setFilterMaxExtrema(f); }), "Approximate max number of extrema.") - ("filter-grid", value()->notifier([&](int f) {config.setFilterGridSize(f); }), "Grid edge length for extrema filtering (ie. value 4 leads to a 4x4 grid)") - ("filter-sort", value()->notifier([&](const std::string& s) {config.setFilterSorting(s); }), "Sort extrema in each cell by scale, either random (default), up or down"); - } + option_init_parameters( config, parameters ); + option_init_modes( config, modes ); + options_description informational("Informational"); { informational.add_options() @@ -118,6 +76,7 @@ static void parseargs(int argc, char** argv, popsift::Config& config, string& in ("print-time-info", bool_switch(&print_time_info)->default_value(false), "A debug output printing image processing time after load()") ("write-as-uchar", bool_switch(&write_as_uchar)->default_value(false), "Output descriptors rounded to int.\n" "Scaling to sensible ranges is not automatic, should be combined with --norm-multi=9 or similar") + ("write-with-ori", bool_switch(&write_with_ori)->default_value(false), "Output points are written with sigma and orientation.\n") ("dont-write", bool_switch(&dont_write)->default_value(false), "Suppress descriptor output") ("pgmread-loading", bool_switch(&pgmread_loading)->default_value(false), "Use the old image loader instead of LibDevIL") ("float-mode", bool_switch(&float_mode)->default_value(false), "Upload image to GPU as float instead of byte") @@ -254,7 +213,7 @@ void read_job( SiftJob* job, bool really_write ) nvtxRangePushA( "Writing features to disk" ); std::ofstream of( "output-features.txt" ); - feature_list->print( of, write_as_uchar ); + feature_list->print( of, write_as_uchar, write_with_ori ); } delete feature_list; @@ -273,6 +232,8 @@ int main(int argc, char **argv) std::cout << "PopSift version: " << POPSIFT_VERSION_STRING << std::endl; + config.setDescMode( popsift::Config::VLFeat_Desc ); + try { parseargs( argc, argv, config, inputFile ); // Parse command line std::cout << inputFile << std::endl; @@ -327,4 +288,3 @@ int main(int argc, char **argv) return EXIT_SUCCESS; } - diff --git a/src/application/match.cpp b/src/application/match.cpp index 852d9b62..400b6c7b 100755 --- a/src/application/match.cpp +++ b/src/application/match.cpp @@ -29,6 +29,7 @@ #include #endif #include "pgmread.h" +#include "common_opt.h" #if POPSIFT_IS_DEFINED(POPSIFT_USE_NVTX) #include @@ -45,7 +46,8 @@ static bool write_as_uchar {false}; static bool dont_write {false}; static bool pgmread_loading {false}; -static void parseargs(int argc, char** argv, popsift::Config& config, string& lFile, string& rFile) { +static void parseargs(int argc, char** argv, popsift::Config& config, string& lFile, string& rFile) +{ using namespace boost::program_options; options_description options("Options"); @@ -59,55 +61,13 @@ static void parseargs(int argc, char** argv, popsift::Config& config, string& lF ("right,r", value(&rFile)->required(), "\"Right\" input file"); } + options_description parameters("Parameters"); - { - parameters.add_options() - ("octaves", value(&config.octaves), "Number of octaves") - ("levels", value(&config.levels), "Number of levels per octave") - ("sigma", value()->notifier([&](float f) { config.setSigma(f); }), "Initial sigma value") - - ("threshold", value()->notifier([&](float f) { config.setThreshold(f); }), "Contrast threshold") - ("edge-threshold", value()->notifier([&](float f) { config.setEdgeLimit(f); }), "On-edge threshold") - ("edge-limit", value()->notifier([&](float f) { config.setEdgeLimit(f); }), "On-edge threshold") - ("downsampling", value()->notifier([&](float f) { config.setDownsampling(f); }), "Downscale width and height of input by 2^N") - ("initial-blur", value()->notifier([&](float f) {config.setInitialBlur(f); }), "Assume initial blur, subtract when blurring first time"); - } options_description modes("Modes"); - { - modes.add_options() - ( "gauss-mode", value()->notifier([&](const std::string& s) { config.setGaussMode(s); }), - popsift::Config::getGaussModeUsage() ) - ("desc-mode", value()->notifier([&](const std::string& s) { config.setDescMode(s); }), - "Choice of descriptor extraction modes:\n" - "loop, iloop, grid, igrid, notile\n" - "Default is loop\n" - "loop is OpenCV-like horizontal scanning, computing only valid points, grid extracts only useful points but rounds them, iloop uses linear texture and rotated gradiant fetching. igrid is grid with linear interpolation. notile is like igrid but avoids redundant gradiant fetching.") - ("popsift-mode", bool_switch()->notifier([&](bool b) { if(b) config.setMode(popsift::Config::PopSift); }), - "During the initial upscale, shift pixels by 1. In extrema refinement, steps up to 0.6, do not reject points when reaching max iterations, " - "first contrast threshold is .8 * peak thresh. Shift feature coords octave 0 back to original pos.") - ("vlfeat-mode", bool_switch()->notifier([&](bool b) { if(b) config.setMode(popsift::Config::VLFeat); }), - "During the initial upscale, shift pixels by 1. That creates a sharper upscaled image. " - "In extrema refinement, steps up to 0.6, levels remain unchanged, " - "do not reject points when reaching max iterations, " - "first contrast threshold is .8 * peak thresh.") - ("opencv-mode", bool_switch()->notifier([&](bool b) { if(b) config.setMode(popsift::Config::OpenCV); }), - "During the initial upscale, shift pixels by 0.5. " - "In extrema refinement, steps up to 0.5, " - "reject points when reaching max iterations, " - "first contrast threshold is floor(.5 * peak thresh). " - "Computed filter width are lower than VLFeat/PopSift") - ("direct-scaling", bool_switch()->notifier([&](bool b) { if(b) config.setScalingMode(popsift::Config::ScaleDirect); }), - "Direct each octave from upscaled orig instead of blurred level.") - ("norm-multi", value()->notifier([&](int i) {config.setNormalizationMultiplier(i); }), "Multiply the descriptor by pow(2,).") - ( "norm-mode", value()->notifier([&](const std::string& s) { config.setNormMode(s); }), - popsift::Config::getNormModeUsage() ) - ( "root-sift", bool_switch()->notifier([&](bool b) { if(b) config.setNormMode(popsift::Config::RootSift); }), - popsift::Config::getNormModeUsage() ) - ("filter-max-extrema", value()->notifier([&](int f) {config.setFilterMaxExtrema(f); }), "Approximate max number of extrema.") - ("filter-grid", value()->notifier([&](int f) {config.setFilterGridSize(f); }), "Grid edge length for extrema filtering (ie. value 4 leads to a 4x4 grid)") - ("filter-sort", value()->notifier([&](const std::string& s) {config.setFilterSorting(s); }), "Sort extrema in each cell by scale, either random (default), up or down"); - } + option_init_parameters( config, parameters ); + option_init_modes( config, modes ); + options_description informational("Informational"); { informational.add_options() @@ -227,6 +187,8 @@ int main(int argc, char **argv) std::cout << "PopSift version: " << POPSIFT_VERSION_STRING << std::endl; + config.setDescMode( popsift::Config::VLFeat_Desc ); + try { parseargs( argc, argv, config, lFile, rFile ); // Parse command line std::cout << lFile << " <-> " << rFile << std::endl; @@ -267,7 +229,23 @@ int main(int argc, char **argv) cout << "Number of features: " << rFeatures->getFeatureCount() << endl; cout << "Number of descriptors: " << rFeatures->getDescriptorCount() << endl; - lFeatures->match( rFeatures ); + int3* matches = lFeatures->matchAndReturn( rFeatures ); + cudaDeviceSynchronize(); + + for( int i=0; igetDescriptorCount(); i++ ) + { + int3& match = matches[i]; + if( match.z ) + { + const popsift::Feature* l_f = lFeatures->getFeatureForDescriptor( i ); + const popsift::Feature* r_f = rFeatures->getFeatureForDescriptor( match.x ); + cout << setprecision(5) << showpoint + << "point (" << l_f->xpos << "," << l_f->ypos << ") in l matches " + << "point (" << r_f->xpos << "," << r_f->ypos << ") in r" << endl; + } + } + + cudaFree( matches ); delete lFeatures; delete rFeatures; diff --git a/src/compareSiftFiles/CMakeLists.txt b/src/compareSiftFiles/CMakeLists.txt new file mode 100755 index 00000000..1eb1d625 --- /dev/null +++ b/src/compareSiftFiles/CMakeLists.txt @@ -0,0 +1,38 @@ +if(NOT CMAKE_SOURCE_DIR STREQUAL PROJECT_SOURCE_DIR) + # I am top-level project, i.e. I am not being include by another project + cmake_minimum_required(VERSION 3.12) + project(PopSiftCompareDescriptors LANGUAGES CXX) + + include(GNUInstallDirs) + + set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/${CMAKE_SYSTEM_NAME}-${CMAKE_SYSTEM_PROCESSOR}") + set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/${CMAKE_SYSTEM_NAME}-${CMAKE_SYSTEM_PROCESSOR}") +endif() + +if(PopSift_BOOST_USE_STATIC_LIBS) + set(Boost_USE_STATIC_LIBS ON) +endif() +find_package(Boost 1.53.0 REQUIRED COMPONENTS filesystem program_options system) +if(WIN32) + add_definitions("-DBOOST_ALL_NO_LIB") +endif(WIN32) + +############################################################# +# compareSiftFiles +############################################################# + +add_executable(popsift-compareSiftFiles + compareSiftFiles.cpp + csf_feat.cpp csf_feat.h + csf_options.cpp csf_options.h) + +target_include_directories(popsift-compareSiftFiles PUBLIC ${Boost_INCLUDE_DIRS}) +target_link_libraries(popsift-compareSiftFiles PUBLIC ${Boost_LIBRARIES}) + +set_property(TARGET popsift-compareSiftFiles PROPERTY CXX_STANDARD 14) + +############################################################# +# installation +############################################################# + +install(TARGETS popsift-compareSiftFiles DESTINATION ${CMAKE_INSTALL_BINDIR}) diff --git a/src/compareSiftFiles/compareSiftFiles.cpp b/src/compareSiftFiles/compareSiftFiles.cpp new file mode 100755 index 00000000..b52ad9a7 --- /dev/null +++ b/src/compareSiftFiles/compareSiftFiles.cpp @@ -0,0 +1,131 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "csf_feat.h" +#include "csf_options.h" + +using namespace std; + +typedef vector desc_t; + +/** Read a file containing SIFT descriptors + */ +void readFile( vector& vec, const std::string& filename ); + +/** Write the average descriptor differences to file + * @param file The output file + * @param stats A 128-float vector containing the sum of differences + * @param sz The number of samples that have been collected in stats + */ +void writeSummary( ostream& file, const vector& stats, int sz ); + +/** Open a new stream or return the default stream. The default stream is + * returned if the name is empty or opening the stream fails. + * @param name A string containing the name of the file to open or empty + * @param default_stream The default stream to return + */ +ostream* openOutputFile( const string& name, ostream* default_stream ); + +int main( int argc, char* argv[] ) +{ + Parameters param; + + parseargs( argc, argv, param ); + + vector l_one; + vector l_two; + + readFile( l_one, param.input[0] ); + readFile( l_two, param.input[1] ); + + ostream* outfile = openOutputFile( param.outfile_name, &cout ); + ostream* descfile = openOutputFile( param.descfile_name, nullptr ); + + int len = l_one.size(); + int ct = 0; + float nextpercent = 10; + + vector desc_stats( 128, 0.0f ); + + ostream* print_dists = param.descfile_verbose ? descfile : 0; + + for( auto l : l_one ) + { + l.compareBestMatch( *outfile, print_dists, l_two, desc_stats, param.briefinfo ); + ct++; + if( float(ct * 100) / len >= float(nextpercent) ) + { + cerr << nextpercent << "% " << ct << endl; + nextpercent += 10; + } + } + + if( descfile ) + { + writeSummary( *descfile, desc_stats, l_one.size() ); + } + + if( ! param.outfile_name.empty() ) + { + delete outfile; + } +} + +void writeSummary( ostream& descfile, const vector& desc_stats, int sz ) +{ + descfile << "========== Summary Stats ==========" << endl + << "Average values:" << endl + << setprecision(3); + for( int i=0; i<128; i++ ) + { + if( i%32==0 ) descfile << "X=0 | "; + if( i%32==8 ) descfile << "X=1 | "; + if( i%32==16 ) descfile << "X=2 | "; + if( i%32==24 ) descfile << "X=3 | "; + float d = desc_stats[i] / sz; + descfile << setw(8) << d << " "; + if( i%8==7 ) descfile << endl; + } + descfile << endl; +} + +void readFile( vector& vec, const std::string& filename ) +{ + ifstream infile( filename ); + + if( ! infile.good() ) + { + cerr << "File " << filename << " is not open." << endl; + exit( -1 ); + } + + int lines_read = readFeats( vec, infile ); + cerr << "Read " << lines_read << " lines from " << filename << endl; +} + +ostream* openOutputFile( const string& outfile_name, ostream* default_stream ) +{ + ostream* outfile = default_stream; + if( outfile_name.empty() ) return outfile; + + ostream* o = new ofstream( outfile_name ); + if( o->good() ) + { + outfile = o; + } + else + { + delete o; + } + + return outfile; +} + diff --git a/src/compareSiftFiles/csf_feat.cpp b/src/compareSiftFiles/csf_feat.cpp new file mode 100755 index 00000000..1a3758c8 --- /dev/null +++ b/src/compareSiftFiles/csf_feat.cpp @@ -0,0 +1,272 @@ +#include +#include +#include +#include + +#include "csf_feat.h" + +#if __cplusplus >= 201703L +#define PAR_UNSEQ std::execution::par_unseq, +#else +#define PAR_UNSEQ +#endif + +#undef HAVE_STD_TRANSFORM_REDUCE + +using namespace std; + +const float M_PI2 = 2.0f * 3.14159265358979323846f; + +bool feat_t::_use_l2_distance = true; + +inline static float sum_of_squares( const feat_t& l, const feat_t& r ) +{ +#ifdef HAVE_STD_TRANSFORM_REDUCE +/* From C++17 numeric: +template +T transform_reduce(ExecutionPolicy&& policy, + ForwardIt1 first1, ForwardIt1 last1, ForwardIt2 first2, + T init, BinaryOp1 binary_op1, BinaryOp2 binary_op2); +*/ + const float sum = std::transform_reduce( + PAR_UNSEQ + l.desc, l.desc+DESC_SIZE, + r.desc, + 0.0f, + std::plus<>(), + [](float l, float r){ float v=l-r; return (v*v); }, + std::plus ); +#else + float sum = 0.0f; + + for( int i=0; i(), + [](float l, float r){ float v=l-r; return fabsf(v); }, + std::plus ); +#else + float sum = 0.0f; + + for( int i=0; i& l_one, ifstream& f_one ) +{ + char buffer[1024]; + int lines_read; + + lines_read = 0; + while( f_one.good() ) + { + f_one.getline( buffer, 1024 ); + if( f_one.good() ) + { + bool success = addFeat( l_one, buffer ); + if( success ) + { + lines_read++; + } + } + } + return lines_read; +} + +bool addFeat( vector& features, char* line ) +{ + vector values(5+DESC_SIZE); // 4 or 5 values followed by DESC_SIZE desc values + + int i = 0; + istringstream s( line ); + while( s >> values[i] ) + { + i++; + } + + // cerr << "Found " << i << " floats in line" << endl; + features.emplace_back( i, values ); + + if( i == 0 ) return false; + return true; +} + +feat_t::feat_t( int num, const vector& input ) + : desc(DESC_SIZE) +{ + auto it = input.begin(); + auto to = desc.begin(); + if( num == DESC_SIZE+4 ) + { + x = *it++; + y = *it++; + sigma = *it++; + ori = *it++; + for( int i=0; i::infinity() }; + float min2 { std::numeric_limits::infinity() }; + + int idx1; + int idx2; + + inline void update( const int& idx, const float& val ) + { + if( val < min1 ) + { + min2 = min1; + idx2 = idx1; + min1 = val; + idx1 = idx; + } + else if( val < min2 ) + { + min2 = val; + idx2 = idx; + } + } + + void takeSqrt( ) + { + min1 = sqrtf( min1 ); + min2 = sqrtf( min2 ); + } +}; + +void feat_t::compareBestMatch( ostream& ostr, ostream* dstr, const vector& l_one, vector& desc_stats, bool minOnly ) const +{ + const int l_one_sz = l_one.size(); + + if( !minOnly ) ostr << "==========" << endl; + + const feat_t& left( *this ); + + if( minOnly ) + { + Find2Best find2best; + + for( int j=0; j distances( l_one_sz ); + + for( int j=0; j +#include +#include + +#define DESC_SIZE 128 + +typedef std::vector desc_t; + +class feat_t +{ +public: + static bool _use_l2_distance; + + float x; + float y; + float sigma; + float ori; + desc_t desc; + + feat_t( int num, const std::vector& input ); + + void print( std::ostream& ostr ) const; + + void compareBestMatch( std::ostream& ostr, + std::ostream* dstr, + const std::vector& l_one, + std::vector& desc_stats, + bool minOnly ) const; + + static void setL2Distance( bool onoff ); +}; + +int readFeats( std::vector& l_one, + std::ifstream& f_one ); +bool addFeat( std::vector& features, char* line ); + diff --git a/src/compareSiftFiles/csf_options.cpp b/src/compareSiftFiles/csf_options.cpp new file mode 100755 index 00000000..922916ea --- /dev/null +++ b/src/compareSiftFiles/csf_options.cpp @@ -0,0 +1,100 @@ +#include + +#include + +#include "csf_options.h" + +using namespace std; + +void parseargs( int argc, char** argv, Parameters& param ) +{ + using namespace boost::program_options; + + bool longinfo = false; + + positional_options_description positional; + positional.add( "input", -1 ); + + options_description hidden_options("Hidden options"); + options_description options("Options"); + + { + hidden_options.add_options() + ("input,i", value >()->multitoken()->composing(), + "A descriptor file. 2 descriptor files must be passed"); + + options.add_options() + ("help,h", "Print usage") + ("verbose,v", value( &longinfo )->default_value(false), + "verbose, longer output") + ("output,o", value(¶m.outfile_name), + "print essential diff info to (default is cout)") + ("desc,d", value(¶m.descfile_name), + "print descriptor distance per cell to ") + ("dv", value(¶m.descfile_verbose)->default_value(false), + "prints every comparison into descfile, instead of summary only") + ("l1", bool_switch()->notifier( [&](bool b) + { + feat_t::setL2Distance( !b ); + } ) + ->default_value( false ), + "use L1 for distance computation instead of L2" ) + ; + } + + options_description all("Allowed options"); + + all.add(hidden_options) + .add(options); + + variables_map vm; + + try + { + + store( command_line_parser(argc, argv).options( all ) + .positional( positional ) + .run( ), + vm ); + + param.input = vm["input"].as >(); + + if( vm.count("help") ) + { + usage( argv[0], options ); + exit(EXIT_SUCCESS); + } + + if( param.input.size() != 2 ) + { + cerr << "Error: Exactly 2 input file for comparison must be provided." << endl; + usage( argv[0], options ); + exit(EXIT_FAILURE); + } + + notify(vm); // Notify does processing (e.g., raise exceptions if required args are missing) + } + catch(boost::program_options::error& e) + { + std::cerr << "Error: " << e.what() << std::endl << std::endl; + usage( argv[0], options ); + exit(EXIT_FAILURE); + } + + param.briefinfo = !longinfo; +} + +void usage( char* name, const boost::program_options::options_description& options ) +{ + cerr << endl + << "Usage: " << name << " [Options] " << endl + << endl + << "Description\n" + " Compute the L1 or L2 distance between the descriptors of closest\n" + " coordinate pairs.\n" + " When a coordinate has several orientations, the closest distance\n" + " is reported. Summary information at the end." << endl + << endl + << options << endl; +} + diff --git a/src/compareSiftFiles/csf_options.h b/src/compareSiftFiles/csf_options.h new file mode 100755 index 00000000..a82ae9c4 --- /dev/null +++ b/src/compareSiftFiles/csf_options.h @@ -0,0 +1,21 @@ +#pragma once + +#include +#include + +#include + +#include "csf_feat.h" + +struct Parameters +{ + std::vector input; + std::string outfile_name; + std::string descfile_name; + bool descfile_verbose; + bool briefinfo; +}; + +void parseargs( int argc, char** argv, Parameters& param ); +void usage( char* name, const boost::program_options::options_description& all ); + diff --git a/src/popsift/common/debug_macros.cu b/src/popsift/common/debug_macros.cu index cf4cd735..69acaa5a 100755 --- a/src/popsift/common/debug_macros.cu +++ b/src/popsift/common/debug_macros.cu @@ -62,6 +62,23 @@ void malloc_hst( void** ptr, int sz, } } } +namespace popsift { namespace cuda { +void malloc_mgd( void** ptr, int sz, + const char* file, int line ) +{ + cudaError_t err; + err = cudaMallocManaged( ptr, sz ); + if( err != cudaSuccess ) { + std::cerr << file << ":" << line << std::endl + << " cudaMallocManaged failed: " << cudaGetErrorString(err) << std::endl; + exit( -__LINE__ ); + } +#ifdef DEBUG_INIT_DEVICE_ALLOCATIONS + memset( *ptr, 0, sz ); +#endif // NDEBUG +} +} } + namespace popsift { namespace cuda { void memcpy_async( void* dst, const void* src, size_t sz, cudaMemcpyKind type, cudaStream_t stream, diff --git a/src/popsift/common/debug_macros.h b/src/popsift/common/debug_macros.h index dada7d3b..26a92904 100755 --- a/src/popsift/common/debug_macros.h +++ b/src/popsift/common/debug_macros.h @@ -37,6 +37,9 @@ void malloc_dev( void** ptr, int sz, void malloc_hst( void** ptr, int sz, const char* file, int line ); +void malloc_mgd( void** ptr, int sz, + const char* file, int line ); + template T* malloc_devT( int num, const char* file, int line ) { void* ptr; @@ -51,6 +54,17 @@ T* malloc_hstT( int num, const char* file, int line ) { return (T*)ptr; } +template +T* malloc_mgdT( int num, const char* file, int line ) { + void* ptr; + malloc_mgd( &ptr, num*sizeof(T), file, line ); + return (T*)ptr; +} + +inline void free_mgd( void* ptr ) { + cudaFree( ptr ); +} + void memcpy_sync( void* dst, const void* src, size_t sz, cudaMemcpyKind type, const char* file, size_t line ); diff --git a/src/popsift/features.cu b/src/popsift/features.cu index 023279ff..43837716 100755 --- a/src/popsift/features.cu +++ b/src/popsift/features.cu @@ -107,16 +107,16 @@ void FeaturesHost::unpin( ) cudaHostUnregister( _ori ); } -void FeaturesHost::print( std::ostream& ostr, bool write_as_uchar ) const +void FeaturesHost::print( std::ostream& ostr, bool write_as_uchar, bool with_orientation ) const { for( int i=0; i ( num_ext, __FILE__, __LINE__ ); - _ori = popsift::cuda::malloc_devT( num_ori, __FILE__, __LINE__ ); - _rev = popsift::cuda::malloc_devT ( num_ori, __FILE__, __LINE__ ); + _ext = popsift::cuda::malloc_mgdT ( num_ext, __FILE__, __LINE__ ); + _ori = popsift::cuda::malloc_mgdT( num_ori, __FILE__, __LINE__ ); + _rev = popsift::cuda::malloc_mgdT ( num_ori, __FILE__, __LINE__ ); setFeatureCount( num_ext ); setDescriptorCount( num_ori ); @@ -240,22 +240,30 @@ show_distance( int3* match_matrix, const float4* lptr = (const float4*)( &l_ori[i] ); const float4* rptr1 = (const float4*)( &r_ori[match_matrix[i].x] ); const float4* rptr2 = (const float4*)( &r_ori[match_matrix[i].y] ); - float d1 = l2_in_t0( lptr, rptr1 ); - float d2 = l2_in_t0( lptr, rptr2 ); - if( threadIdx.x == 0 ) + float d1 = l2_in_t0( lptr, rptr1 ); + float d2 = l2_in_t0( lptr, rptr2 ); + if( threadIdx.x == 0 ) { if( match_matrix[i].z ) - printf( "accept feat %4d [%4d] matches feat %4d [%4d] ( 2nd feat %4d [%4d] ) dist %.3f vs %.3f\n", + { + Feature* lx = &l_ext[l_fem[i]]; + Feature* rx = &r_ext[r_fem[match_matrix[i].x]]; + printf( "accept feat %4d [%4d] matches feat %4d [%4d] ( 2nd feat %4d [%4d] ) dist %.3f vs %.3f" + " (%.1f,%.1f)-(%.1f,%.1f)\n", l_fem[i], i, r_fem[match_matrix[i].x], match_matrix[i].x, r_fem[match_matrix[i].y], match_matrix[i].y, - d1, d2 ); - else + d1, d2, + lx->xpos, lx->ypos, rx->xpos, rx->ypos ); + } + else + { printf( "reject feat %4d [%4d] matches feat %4d [%4d] ( 2nd feat %4d [%4d] ) dist %.3f vs %.3f\n", l_fem[i], i, r_fem[match_matrix[i].x], match_matrix[i].x, r_fem[match_matrix[i].y], match_matrix[i].y, d1, d2 ); + } } __syncthreads(); } @@ -300,17 +308,77 @@ void FeaturesDev::match( FeaturesDev* other ) cudaFree( match_matrix ); } +int3* FeaturesDev::matchAndReturn( FeaturesDev* other ) +{ + int l_len = getDescriptorCount( ); + int r_len = other->getDescriptorCount( ); + + int3* match_matrix = popsift::cuda::malloc_mgdT( l_len, __FILE__, __LINE__ ); + + dim3 grid; + grid.x = l_len; + grid.y = 1; + grid.z = 1; + dim3 block; + block.x = 32; + block.y = 1; + block.z = 1; + + compute_distance + <<>> + ( match_matrix, getDescriptors(), l_len, other->getDescriptors(), r_len ); + + return match_matrix; +} + +Descriptor* FeaturesDev::getDescriptor( int descIndex ) +{ + return &_ori[descIndex]; +} + +const Descriptor* FeaturesDev::getDescriptor( int descIndex ) const +{ + return &_ori[descIndex]; +} + +Feature* FeaturesDev::getFeatureForDescriptor( int descIndex ) +{ + return &_ext[_rev[descIndex]]; +} + +const Feature* FeaturesDev::getFeatureForDescriptor( int descIndex ) const +{ + return &_ext[_rev[descIndex]]; +} + /************************************************************* * Feature *************************************************************/ -void Feature::print( std::ostream& ostr, bool write_as_uchar ) const +void Feature::print( std::ostream& ostr, bool write_as_uchar, bool with_orientation ) const { float sigval = 1.0f / ( sigma * sigma ); for( int ori=0; orifeatures[i]) << " "; @@ -328,7 +396,7 @@ void Feature::print( std::ostream& ostr, bool write_as_uchar ) const std::ostream& operator<<( std::ostream& ostr, const Feature& feature ) { - feature.print( ostr, false ); + feature.print( ostr, false, false ); return ostr; } diff --git a/src/popsift/features.h b/src/popsift/features.h index 3b16f954..b2c2e967 100755 --- a/src/popsift/features.h +++ b/src/popsift/features.h @@ -33,7 +33,7 @@ struct Feature float orientation[ORIENTATION_MAX_COUNT]; Descriptor* desc[ORIENTATION_MAX_COUNT]; - void print( std::ostream& ostr, bool write_as_uchar ) const; + void print( std::ostream& ostr, bool write_as_uchar, bool with_orientation ) const; }; std::ostream& operator<<( std::ostream& ostr, const Feature& feature ); @@ -91,7 +91,7 @@ class FeaturesHost : public FeaturesBase inline Feature* getFeatures() { return _ext; } inline Descriptor* getDescriptors() { return _ori; } - void print( std::ostream& ostr, bool write_as_uchar ) const; + void print( std::ostream& ostr, bool write_as_uchar, bool with_orientation ) const; protected: friend class Pyramid; @@ -103,9 +103,9 @@ std::ostream& operator<<( std::ostream& ostr, const FeaturesHost& feature ); class FeaturesDev : public FeaturesBase { - Feature* _ext; - Descriptor* _ori; - int* _rev; // the reverse map from descriptors to extrema + Feature* _ext; // array of extrema + Descriptor* _ori; // array of desciptors + int* _rev; // the reverse map from descriptors to extrema public: FeaturesDev( ); @@ -114,11 +114,35 @@ class FeaturesDev : public FeaturesBase void reset( int num_ext, int num_ori ); + /** This function performs one-directional brute force matching on + * the GPU between the Descriptors in this objects and the object + * other. + * The resulting matches are printed. + */ void match( FeaturesDev* other ); + /** This function performs one-directional brute force matching on + * the GPU between the Descriptors in this objects and the object + * other. + * The resulting matches are returned in an array of int3 that must + * be released with a call to cudaFree(). + * The length of the array is this->getDescriptorCount(). + * For each element at position i + * i is the index of a descriptor in this->getDescriptors() + * int3.x is the index of the best match in other->getDescriptors() + * int3.y is the index of the second best match in other->getDescriptors() + * int3.z indicates if the match is valid (non-zero) or not (zero) + */ + int3* matchAndReturn( FeaturesDev* other ); + inline Feature* getFeatures() { return _ext; } inline Descriptor* getDescriptors() { return _ori; } inline int* getReverseMap() { return _rev; } + + Descriptor* getDescriptor( int descIndex ); + const Descriptor* getDescriptor( int descIndex ) const; + Feature* getFeatureForDescriptor( int descIndex ); + const Feature* getFeatureForDescriptor( int descIndex ) const; }; } // namespace popsift diff --git a/src/popsift/filtergrid.cu b/src/popsift/filtergrid.cu new file mode 100755 index 00000000..f74ea8b5 --- /dev/null +++ b/src/popsift/filtergrid.cu @@ -0,0 +1,382 @@ +/* + * Copyright 2016-2017, Simula Research Laboratory + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ +#include "filtergrid.h" + +#include "sift_config.h" +#include "sift_extremum.h" +#include "sift_pyramid.h" +#include "common/debug_macros.h" +#include "common/assist.h" + +#if ! POPSIFT_IS_DEFINED(POPSIFT_DISABLE_GRID_FILTER) + +#include + +#define PREFER_CPU 0 + +#if PREFER_CPU == 0 +#include +#include +#include +#include +#include +#endif + +namespace popsift +{ + +FilterGrid::FilterGrid( const Config& conf ) + : _slots( conf.getFilterGridSize() * conf.getFilterGridSize() ) + , _alloc_size( 0 ) + , _sorting_index( 0 ) + , _cells( 0 ) + , _scales( 0 ) + , _initial_extrema_pointers( 0 ) +{ + /* refresh the exclsuive prefix sum over the per-octave extrema counts */ + _histogram = popsift::cuda::malloc_mgdT( MAX_OCTAVES*_slots, __FILE__, __LINE__); + _histogram_full = popsift::cuda::malloc_mgdT( _slots, __FILE__, __LINE__); + _histogram_eps = popsift::cuda::malloc_mgdT( _slots+1, __FILE__, __LINE__); + + // initialize histogram to zero, implicitly move to device + popcuda_memset_sync( _histogram, 0, MAX_OCTAVES * _slots * sizeof(int) ); + popcuda_memset_sync( _histogram_full, 0, _slots * sizeof(int) ); +} + +FilterGrid::~FilterGrid( ) +{ + popsift::cuda::free_mgd( _histogram_eps ); + popsift::cuda::free_mgd( _histogram_full ); + popsift::cuda::free_mgd( _histogram ); + popsift::cuda::free_mgd( _sorting_index ); + popsift::cuda::free_mgd( _cells ); + popsift::cuda::free_mgd( _scales ); + popsift::cuda::free_mgd( _initial_extrema_pointers ); +} + +__global__ void +fg_init( const int octave, + ExtremaCounters* ct, + ExtremaBuffers* buf, + int* indices, + int* cells, + float* scales, + InitialExtremum** ext_compact ) +{ + const int my_idx_in_octave = blockIdx.x * blockDim.x + threadIdx.x; + const int extrema_in_octave = ct->ext_ct[octave]; + + if( my_idx_in_octave > extrema_in_octave ) return; + + const int base_index = ct->getExtremaBase( octave ); + const int my_idx = base_index + my_idx_in_octave; + + /* This is the array we are going to use for sorting and filtering and so on. */ + indices[my_idx] = my_idx; + + InitialExtremum* ie = &buf->i_ext_dat[octave][my_idx_in_octave]; + + cells[my_idx] = ie->getCell(); + scales[my_idx] = ie->getScale(); + + /* Keep pointers in this pointer array. The indirection is not elegant for CUDA, + * we can improve that later by making additional arrays for .cell and .scale + * values. + */ + ext_compact[my_idx] = ie; +} + +void FilterGrid::init( ExtremaBuffers* buf, ExtremaCounters* ct ) +{ + _buf = buf; + _ct = ct; + + const int extrema_ct_total = _ct->getTotalExtrema(); + + /* Allocate or reallocate memory to hold all indices */ + if( _alloc_size < extrema_ct_total ) + { + if( _alloc_size > 0 ) + { + popsift::cuda::free_mgd( _sorting_index ); + popsift::cuda::free_mgd( _cells ); + popsift::cuda::free_mgd( _scales ); + popsift::cuda::free_mgd( _initial_extrema_pointers ); + } + _alloc_size = extrema_ct_total; + _sorting_index = popsift::cuda::malloc_mgdT( _alloc_size, __FILE__, __LINE__); + _cells = popsift::cuda::malloc_mgdT( _alloc_size, __FILE__, __LINE__); + _scales = popsift::cuda::malloc_mgdT( _alloc_size, __FILE__, __LINE__); + _initial_extrema_pointers = popsift::cuda::malloc_mgdT( _alloc_size, __FILE__, __LINE__); + } + + popcuda_memset_sync( _histogram_full, 0, _slots * sizeof(int) ); + + for( int o=0; ogetExtremaCount(o); + const int extrema_base_in_octave = _ct->getExtremaBase(o); + + if( extrema_ct_in_octave == 0 ) continue; + + dim3 block( 32 ); + dim3 grid( grid_divide( extrema_ct_in_octave, block.x ) ); + fg_init + <<>> + ( o, + _ct, + _buf, + _sorting_index, + _cells, + _scales, + _initial_extrema_pointers ); + } +} + +__global__ void +fg_countcells( const int ext_total, + const int* cells, + const int cell_histogram_size, + int* cell_histogram ) +{ + // The size of this shared memory region is computed from _slots in the + // calling host code. + extern __shared__ int cellcounts[]; + + const int tid = threadIdx.x; + const int idx = blockIdx.x * blockDim.x + tid; + + for( int c=tid; c= ext_total ) return; + + const int cell = cells[idx]; + + atomicAdd( &cellcounts[cell], 1 ); + __syncthreads(); + + for( int c=tid; cgetTotalExtrema(); + + dim3 block( 32 ); + dim3 grid( grid_divide( total_extrema_count, block.x ) ); + + fg_countcells + <<>> + ( total_extrema_count, + _cells, + _slots, + _histogram_full ); +} + +void FilterGrid::make_histogram_prefix_sum( ) +{ + // We could to this on the device ! +#if PREFER_CPU == 0 + popcuda_memset_sync( _histogram_eps, 0, (_slots+1) * sizeof(int) ); + + thrust::device_ptr src_begin = thrust::device_pointer_cast( &_histogram_full[0] ); + thrust::device_ptr src_end = thrust::device_pointer_cast( &_histogram_full[_slots] ); + thrust::device_ptr dst_begin = thrust::device_pointer_cast( &_histogram_eps[1] ); + + thrust::inclusive_scan( src_begin, src_end, dst_begin ); +#else + cudaDeviceSynchronize(); + + _histogram_eps[0] = 0; + + for( int i=1; i<=_slots; i++ ) + { + _histogram_eps[i] = _histogram_eps[i-1] + _histogram_full[i-1]; + } +#endif +} + + +class CellScaleCompare +{ + int* _cells; + float* _scales; + +public: + CellScaleCompare( int* cell, float* scales ) + : _cells( cell ) + , _scales( scales ) + { } + + __host__ __device__ + inline bool operator()( int left, int right ) const + { + if( _cells[left] < _cells[right] ) return true; + if( ( _cells[left] == _cells[right] ) && + ( _scales[left] < _scales[right] ) ) return true; + return false; + } +}; + +void FilterGrid::sort_by_cell_and_scale( ) +{ + /* Thanks to managed memory, we can sort our indeces either on the + * host or on the device. Currently, we must use thrust for compiling + * for the device. With the HPC SDK, we will be able to use OpenACC + * and never know the difference. + */ + CellScaleCompare tc( _cells, _scales ); +#if PREFER_CPU == 0 + thrust::device_ptr ptr = thrust::device_pointer_cast( _sorting_index ); + thrust::sort( ptr, ptr + _ct->getTotalExtrema(), tc ); +#else + cudaDeviceSynchronize(); + int* ptr = _sorting_index; + std::sort( ptr, ptr + _ct->getTotalExtrema(), tc ); +#endif +} + +void FilterGrid::level_histogram( const int max_extrema ) +{ + int avg_in_cell = max_extrema / _slots; + + bool stable; + do + { + stable = true; + + int limited_slots = 0; + int sum = 0; + + for( int i=0; i<_slots; i++ ) + { + if( _histogram_full[i] > avg_in_cell ) + { + sum += avg_in_cell; + limited_slots += 1; + } + else + { + sum += _histogram_full[i]; + } + } + + const int diff = max_extrema - sum; + + if( limited_slots > 0 && diff > 0 ) + { + stable = false; + const std::div_t res = std::div( diff, limited_slots ); + const int widen = res.quot + ( res.rem > 0 ? 1 : 0 ); + + + avg_in_cell += widen; + } + } + while( !stable ); + + + for( int i=0; i<_slots; i++ ) + { + _histogram_full[i] = min( _histogram_full[i], avg_in_cell ); + } +} + +void FilterGrid::prune_extrema( GridFilterConfig::Mode mode ) +{ + for( int cell=0; cell<_slots; cell++ ) + { + const int base = _histogram_eps[cell]; + const int have = _histogram_eps[cell+1] - base; + const int want = _histogram_full[cell]; + const int diff = have - want; + if( diff <= 0 ) + { + continue; + } + + if( mode == GridFilterConfig::LargestScaleFirst ) + { + const int base = _histogram_eps[cell]; + for( int d=0; dsetIgnore(); + } + } + else // mode == GridFilterConfig::SmallestScaleFirst + { + const int base = _histogram_eps[cell+1]; + for( int d=0; dsetIgnore(); + } + } + } +} + +/* Discard some extrema that exceed a conf.getFilterMaxExtrema(). + * The caller should ensure that filtering is actually needed. */ +__host__ +int FilterGrid::filter( const Config& conf, ExtremaCounters* ct, ExtremaBuffers* buf ) +{ + /* At this time, we have host-side information about ext_ct[o], the number + * of extrema we have found in octave o, and we have summed it up on the + * host size. However, other values in the hct and dct data structures + * have not been computed yet. + * The extrema are only known in the InitialExtrema structure. We want to + * perform grid filtering before their orientation is computed and they + * are copied into the larger Extrema data structure. + */ + const int max_extrema = conf.getFilterMaxExtrema( ); + + init( buf, ct ); + count_cells( ); + make_histogram_prefix_sum( ); + + sort_by_cell_and_scale( ); + + level_histogram( max_extrema ); + + prune_extrema( conf.getFilterSorting() ); + + for( int o=0; ogetExtremaCount(o); + + int counter = 0; + for( int i=0; ii_ext_dat[o][i]; + if( iext->isIgnored() == false ) + { + buf->i_ext_off[o][counter] = i; + counter += 1; + } + } + ct->ext_ct[o] = counter; + } + + ct->make_extrema_prefix_sums(); + + return ct->getTotalExtrema(); +} +}; // namespace popsift + +#endif // not defined(DISABLE_GRID_FILTER) + diff --git a/src/popsift/filtergrid.h b/src/popsift/filtergrid.h new file mode 100755 index 00000000..e1b19255 --- /dev/null +++ b/src/popsift/filtergrid.h @@ -0,0 +1,103 @@ +/* + * Copyright 2021, University of Oslo + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ +#pragma once + +#include "sift_config.h" +#include "sift_conf.h" + +#if ! POPSIFT_IS_DEFINED(POPSIFT_DISABLE_GRID_FILTER) + +namespace popsift +{ + +class ExtremaBuffers; +class ExtremaCounters; +class InitialExtremum; + +class FilterGrid +{ + int* _histogram; + int* _histogram_full; + int* _histogram_eps; // exclusive prefix sum; + + const int _slots; + ExtremaBuffers* _buf; + ExtremaCounters* _ct; + + int _alloc_size; + int* _sorting_index; + int* _cells; + float* _scales; + InitialExtremum** _initial_extrema_pointers; + +public: + FilterGrid( const Config& conf ); + + ~FilterGrid( ); + + int filter( const Config& conf, ExtremaCounters* ct, ExtremaBuffers* buf ); + +private: + /** Initialize the FilterGrid structure with the current counters + * and buffers. + */ + void init( ExtremaBuffers* buf, ExtremaCounters* ct ); + + /** We are filling the _histogram_full by counting how frequent + * extrema appear in every cell. + * Since order is irrelevant for this operation, we ignore + * the indirection through the _sorting_index while we collect + * the histogram. + */ + void count_cells( ); + + /** To get the offsets for each cell into the sorted array + * _sorting_index, we can simple compute the exclusive prefix + * sum over _histogram_full. + * For convenience, it has an extra slot for the final sum. + */ + void make_histogram_prefix_sum( ); + + /** We sort the index of extrema (not the extrema themselves). + * The end result is in _sorting_index, which gives us indirect + * access (sorted by priority) to the arrays _cells, _scales + * and _initial_extrema_pointers. + */ + void sort_by_cell_and_scale( ); + + /** Compute a maximum nunmber of entries in every cell and + * update _histogram_full. + * Each cell has an equal share in the maximum number of + * extrema, but when a cell does not use its share fully, + * other cells can have it. The total number of extrema + * assigned can slightly exceed the given max value. + */ + void level_histogram( const int max_extrema ); + + void prune_extrema( GridFilterConfig::Mode mode ); +}; + +}; // namespace popsift + +#else // not defined(DISABLE_GRID_FILTER) + +namespace popsift +{ +class FilterGrid +{ +public: + FilterGrid( const Config& ) { } + + int filter( const Config&, ExtremaCounters*, ExtremaBuffers*, int ext_total ) { + return ext_total; + } +}; +}; // namespace popsift + +#endif // not defined(DISABLE_GRID_FILTER) + diff --git a/src/popsift/initial_extremum.h b/src/popsift/initial_extremum.h new file mode 100755 index 00000000..84e26cfa --- /dev/null +++ b/src/popsift/initial_extremum.h @@ -0,0 +1,94 @@ +/* + * Copyright 2021, University of Oslo + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ +#pragma once + +// #include "sift_constants.h" + +namespace popsift { + +/** + * @brief This is an internal data structure. + * Separated from the final Extremum data structure to implement + * grid filtering in a space-efficient manner. In grid filtering, + * extrema are first found, after that some may be discarded in + * some spatial regions of the image. Avoid waste of space by + * allocating Extremum structures only for the remaining ones. + */ +class InitialExtremum +{ +public: + __device__ inline + void set( const int& oct, const float& x, const float& y, const float& z, const int& cellidx ) + { + ignore = false; + octave = oct; + xpos = x; + ypos = y; + if( z == 0 ) + { + lpos = 0; + sigma = 0.0f; + } + else + { + lpos = (int)roundf(z); + sigma = d_consts.sigma0 * pow(d_consts.sigma_k, z); + } + + cell = cellidx; + } + + __device__ inline const int& getOctave( ) const { return octave; } + __device__ inline const float& getX( ) const { return xpos; } + __device__ inline const float& getY( ) const { return ypos; } + __device__ inline const int& getLevel( ) const { return lpos; } + __device__ inline const float& getSigma( ) const { return sigma; } + __device__ inline const int& getCell( ) const { return cell; } + + /** Compute the accurate floating-point level within the image + * pyramid. It does not compensate for up- and downscaling of the + * input image but returns the value relative to the first octave + * that is actually computed. + * note: Online used once in grid filtering. Therefore not pre-computed. + */ + __device__ inline float getScale( ) const { + return sigma * powf( 2.0f, octave ); + } + + /** Set the ignore flag. Only used in grid filtering. */ + __host__ __device__ inline void setIgnore( ) { ignore = true; } + + /** Test the ignore flag. Only used in grid filtering. */ + __host__ __device__ inline bool isIgnored( ) const { return ignore; } + +private: + /// true if this extremum has been filtered + bool ignore; + + /** The octave where this point was found. */ + int octave; + + float xpos; + float ypos; + + /** This is the closest level of the octave for this initial + * extremum. It is rounded from the computed continuous level. + */ + int lpos; + + /** This is the accurate floating-point level within this octave. + * It requires a transformation of the "z" value that is derived + * from refining the z-position in the octave. + */ + float sigma; + + /// index into the grid for grid-based extrema filtering + int cell; +}; + +} // namespace popsift diff --git a/src/popsift/s_desc_grid.cu b/src/popsift/s_desc_grid.cu index 099c4709..c1525166 100644 --- a/src/popsift/s_desc_grid.cu +++ b/src/popsift/s_desc_grid.cu @@ -121,15 +121,20 @@ void ext_desc_grid_sub( const int ix, } } -__global__ void ext_desc_grid(int octave, cudaTextureObject_t layer_tex) +/* + * We assume that this is started with + * block = 16,4,4 + * grid = nunmber of orientations + */ +__global__ void ext_desc_grid( ExtremaBuffers* buf, int octave, int ori_base_index, cudaTextureObject_t layer_tex) { - const int o_offset = dct.ori_ps[octave] + blockIdx.x; + const int o_offset = ori_base_index + blockIdx.x; const int ix = threadIdx.y; const int iy = threadIdx.z; - Descriptor* desc = &dbuf.desc [o_offset]; - const int ext_idx = dobuf.feat_to_ext_map[o_offset]; - Extremum* ext = dobuf.extrema + ext_idx; + Descriptor* desc = &buf->desc[o_offset]; + const int ext_idx = buf->feat_to_ext_map[o_offset]; + Extremum* ext = buf->extrema + ext_idx; const int ext_base = ext->idx_ori; const int ori_num = o_offset - ext_base; @@ -143,3 +148,33 @@ __global__ void ext_desc_grid(int octave, cudaTextureObject_t layer_tex) layer_tex ); } +namespace popsift +{ + +bool start_ext_desc_grid( const ExtremaCounters* ct, ExtremaBuffers* buf, const int octave, Octave& oct_obj ) +{ + dim3 block; + dim3 grid; + grid.x = ct->ori_ct[octave]; + grid.y = 1; + grid.z = 1; + + if( grid.x == 0 ) return false; + + block.x = 16; + block.y = 4; + block.z = 4; + + ext_desc_grid + <<>> + ( buf, + octave, + ct->getOrientationBase( octave ), + oct_obj.getDataTexPoint( ) ); + + POP_SYNC_CHK; + + return true; +} + +}; // namespace popsift diff --git a/src/popsift/s_desc_grid.h b/src/popsift/s_desc_grid.h index c0919806..704ee2ad 100644 --- a/src/popsift/s_desc_grid.h +++ b/src/popsift/s_desc_grid.h @@ -12,39 +12,10 @@ #include "sift_octave.h" #include "sift_pyramid.h" -/* - * We assume that this is started with - * block = 16,4,4 - * grid = nunmber of orientations - */ -__global__ void ext_desc_grid(int octave, cudaTextureObject_t layer_tex); - namespace popsift { -inline static bool start_ext_desc_grid( const int octave, Octave& oct_obj ) -{ - dim3 block; - dim3 grid; - grid.x = hct.ori_ct[octave]; - grid.y = 1; - grid.z = 1; - - if( grid.x == 0 ) return false; - - block.x = 16; - block.y = 4; - block.z = 4; - - ext_desc_grid - <<>> - ( octave, - oct_obj.getDataTexPoint( ) ); - - POP_SYNC_CHK; - - return true; -} +bool start_ext_desc_grid( const ExtremaCounters* ct, ExtremaBuffers* buf, const int octave, Octave& oct_obj ); }; // namespace popsift diff --git a/src/popsift/s_desc_igrid.cu b/src/popsift/s_desc_igrid.cu index 9f77f12f..3424a968 100644 --- a/src/popsift/s_desc_igrid.cu +++ b/src/popsift/s_desc_igrid.cu @@ -13,6 +13,8 @@ #include +#define IGRID_NUMLINES 1 + using namespace popsift; __device__ static inline @@ -74,17 +76,23 @@ void ext_desc_igrid_sub( const float x, const float y, const int level, } } -__global__ void ext_desc_igrid(int octave, cudaTextureObject_t texLinear) +/* + * We assume that this is started with + * block = 16,4,4 or with 32,4,4, depending on macros + * grid = nunmber of orientations + */ +__global__ void ext_desc_igrid( ExtremaBuffers* buf, const int ori_base_index, + cudaTextureObject_t texLinear) { - const int num = dct.ori_ct[octave]; + const int num = gridDim.x; const int offset = blockIdx.x * blockDim.z + threadIdx.z; - const int o_offset = dct.ori_ps[octave] + offset; + const int o_offset = ori_base_index + offset; if( offset >= num ) return; - Descriptor* desc = &dbuf.desc [o_offset]; - const int ext_idx = dobuf.feat_to_ext_map[o_offset]; - Extremum* ext = dobuf.extrema + ext_idx; + Descriptor* desc = &buf->desc[o_offset]; + const int ext_idx = buf->feat_to_ext_map[o_offset]; + Extremum* ext = buf->extrema + ext_idx; if( ext->sigma == 0 ) return; const float SBP = fabsf( DESC_MAGNIFY * ext->sigma ); @@ -104,3 +112,33 @@ __global__ void ext_desc_igrid(int octave, cudaTextureObject_t texLinear) texLinear ); } +namespace popsift +{ + +bool start_ext_desc_igrid( const ExtremaCounters* ct, ExtremaBuffers* buf, const int octave, Octave& oct_obj ) +{ + dim3 block; + dim3 grid; + grid.x = ct->ori_ct[octave]; + grid.y = 1; + grid.z = 1; + + if( grid.x == 0 ) return false; + + block.x = 16; + block.y = 16; + block.z = IGRID_NUMLINES; + + ext_desc_igrid + <<>> + ( buf, + ct->getOrientationBase( octave ), + oct_obj.getDataTexLinear( ).tex ); + + POP_SYNC_CHK; + + return true; +} + +}; // namespace popsift + diff --git a/src/popsift/s_desc_igrid.h b/src/popsift/s_desc_igrid.h index 8980a4bc..963c8daa 100644 --- a/src/popsift/s_desc_igrid.h +++ b/src/popsift/s_desc_igrid.h @@ -11,40 +11,9 @@ #include "sift_octave.h" #include "sift_pyramid.h" -/* - * We assume that this is started with - * block = 16,4,4 or with 32,4,4, depending on macros - * grid = nunmber of orientations - */ -__global__ void ext_desc_igrid(int octave, cudaTextureObject_t texLinear); - namespace popsift { -#define IGRID_NUMLINES 1 - -inline static bool start_ext_desc_igrid( const int octave, Octave& oct_obj ) -{ - dim3 block; - dim3 grid; - grid.x = hct.ori_ct[octave]; - grid.y = 1; - grid.z = 1; - - if( grid.x == 0 ) return false; - - block.x = 16; - block.y = 16; - block.z = IGRID_NUMLINES; - - ext_desc_igrid - <<>> - ( octave, - oct_obj.getDataTexLinear( ).tex ); - - POP_SYNC_CHK; - - return true; -} +bool start_ext_desc_igrid( const ExtremaCounters* ct, ExtremaBuffers* buf, const int octave, Octave& oct_obj ); }; // namespace popsift diff --git a/src/popsift/s_desc_iloop.cu b/src/popsift/s_desc_iloop.cu index 84673a20..3abd517e 100644 --- a/src/popsift/s_desc_iloop.cu +++ b/src/popsift/s_desc_iloop.cu @@ -128,12 +128,12 @@ void ext_desc_iloop_sub( const float ang, } } -__global__ void ext_desc_iloop(int octave, cudaTextureObject_t layer_tex, int w, int h) +__global__ void ext_desc_iloop( ExtremaBuffers* buf, int octave, const int ori_base_index, cudaTextureObject_t layer_tex, int w, int h) { - const int o_offset = dct.ori_ps[octave] + blockIdx.x; - Descriptor* desc = &dbuf.desc [o_offset]; - const int ext_idx = dobuf.feat_to_ext_map[o_offset]; - Extremum* ext = dobuf.extrema + ext_idx; + const int o_offset = ori_base_index + blockIdx.x; + Descriptor* desc = &buf->desc[o_offset]; + const int ext_idx = buf->feat_to_ext_map[o_offset]; + Extremum* ext = buf->extrema + ext_idx; const int ext_base = ext->idx_ori; const int ori_num = o_offset - ext_base; @@ -147,3 +147,35 @@ __global__ void ext_desc_iloop(int octave, cudaTextureObject_t layer_tex, int w, h ); } +namespace popsift +{ + +bool start_ext_desc_iloop( const ExtremaCounters* ct, ExtremaBuffers* buf, const int octave, Octave& oct_obj ) +{ + dim3 block; + dim3 grid; + grid.x = ct->ori_ct[octave]; + grid.y = 1; + grid.z = 1; + + if( grid.x == 0 ) return false; + + block.x = 32; + block.y = 1; + block.z = 16; + + ext_desc_iloop + <<>> + ( buf, + octave, + ct->getOrientationBase( octave ), + oct_obj.getDataTexLinear( ).tex, + oct_obj.getWidth(), + oct_obj.getHeight() ); + + POP_SYNC_CHK; + + return true; +} + +}; // namespace popsift diff --git a/src/popsift/s_desc_iloop.h b/src/popsift/s_desc_iloop.h index e69409b6..ac86b9e1 100644 --- a/src/popsift/s_desc_iloop.h +++ b/src/popsift/s_desc_iloop.h @@ -12,36 +12,10 @@ #include "sift_octave.h" #include "sift_pyramid.h" -__global__ void ext_desc_iloop(int octave, cudaTextureObject_t layer_tex, int width, int height); - namespace popsift { -inline static bool start_ext_desc_iloop( const int octave, Octave& oct_obj ) -{ - dim3 block; - dim3 grid; - grid.x = hct.ori_ct[octave]; - grid.y = 1; - grid.z = 1; - - if( grid.x == 0 ) return false; - - block.x = 32; - block.y = 1; - block.z = 16; - - ext_desc_iloop - <<>> - ( octave, - oct_obj.getDataTexLinear( ).tex, - oct_obj.getWidth(), - oct_obj.getHeight() ); - - POP_SYNC_CHK; - - return true; -} +bool start_ext_desc_iloop( const ExtremaCounters* ct, ExtremaBuffers* buf, const int octave, Octave& oct_obj ); }; // namespace popsift diff --git a/src/popsift/s_desc_loop.cu b/src/popsift/s_desc_loop.cu index 4c5f46c2..99b7790f 100644 --- a/src/popsift/s_desc_loop.cu +++ b/src/popsift/s_desc_loop.cu @@ -13,6 +13,8 @@ #include +#undef BLOCK_3_DIMS + using namespace popsift; __device__ static inline @@ -138,12 +140,12 @@ void ext_desc_loop_sub( const float ang, } } -__global__ void ext_desc_loop(int octave, cudaTextureObject_t layer_tex, int w, int h) +__global__ void ext_desc_loop( ExtremaBuffers* buf, int octave, int ori_base_index, cudaTextureObject_t layer_tex, int w, int h) { - const int o_offset = dct.ori_ps[octave] + blockIdx.x; - Descriptor* desc = &dbuf.desc [o_offset]; - const int ext_idx = dobuf.feat_to_ext_map[o_offset]; - Extremum* ext = dobuf.extrema + ext_idx; + const int o_offset = ori_base_index + blockIdx.x; + Descriptor* desc = &buf->desc[o_offset]; + const int ext_idx = buf->feat_to_ext_map[o_offset]; + Extremum* ext = buf->extrema + ext_idx; const int ext_base = ext->idx_ori; const int ori_num = o_offset - ext_base; @@ -157,3 +159,41 @@ __global__ void ext_desc_loop(int octave, cudaTextureObject_t layer_tex, int w, h ); } +namespace popsift +{ + +bool start_ext_desc_loop( const ExtremaCounters* ct, ExtremaBuffers* buf, const int octave, Octave& oct_obj ) +{ + dim3 block; + dim3 grid; + grid.x = ct->ori_ct[octave]; + grid.y = 1; + grid.z = 1; + + if( grid.x == 0 ) return false; + +#ifndef BLOCK_3_DIMS + block.x = 32; + block.y = 4; + block.z = 4; +#else + block.x = 32; + block.y = 1; + block.z = 16; +#endif + + ext_desc_loop + <<>> + ( buf, + octave, + ct->getOrientationBase( octave ), + oct_obj.getDataTexPoint( ), + oct_obj.getWidth(), + oct_obj.getHeight() ); + + POP_SYNC_CHK; + + return true; +} + +}; // namespace popsift diff --git a/src/popsift/s_desc_loop.h b/src/popsift/s_desc_loop.h index 600db498..3c562b45 100644 --- a/src/popsift/s_desc_loop.h +++ b/src/popsift/s_desc_loop.h @@ -12,44 +12,10 @@ #include "sift_octave.h" #include "sift_pyramid.h" -#undef BLOCK_3_DIMS - -__global__ void ext_desc_loop(int octave, cudaTextureObject_t layer_tex, int width, int height); - namespace popsift { -inline static bool start_ext_desc_loop( const int octave, Octave& oct_obj ) -{ - dim3 block; - dim3 grid; - grid.x = hct.ori_ct[octave]; - grid.y = 1; - grid.z = 1; - - if( grid.x == 0 ) return false; - -#ifndef BLOCK_3_DIMS - block.x = 32; - block.y = 4; - block.z = 4; -#else - block.x = 32; - block.y = 1; - block.z = 16; -#endif - - ext_desc_loop - <<>> - ( octave, - oct_obj.getDataTexPoint( ), - oct_obj.getWidth(), - oct_obj.getHeight() ); - - POP_SYNC_CHK; - - return true; -} +bool start_ext_desc_loop( const ExtremaCounters* ct, ExtremaBuffers* buf, const int octave, Octave& oct_obj ); }; // namespace popsift diff --git a/src/popsift/s_desc_norm_l2.h b/src/popsift/s_desc_norm_l2.h index 3a7ed858..96a161d4 100644 --- a/src/popsift/s_desc_norm_l2.h +++ b/src/popsift/s_desc_norm_l2.h @@ -58,15 +58,15 @@ void NormalizeL2::normalize( const float* src_desc, float* dst_desc, const bool float norm; if( threadIdx.x == 0 ) { - norm = normf( 128, src_desc ); + norm = rnormf( 128, src_desc ); // 1/sqrt(sum of squares) } __syncthreads(); norm = popsift::shuffle( norm, 0 ); - descr.x = min( descr.x, 0.2f*norm ); - descr.y = min( descr.y, 0.2f*norm ); - descr.z = min( descr.z, 0.2f*norm ); - descr.w = min( descr.w, 0.2f*norm ); + descr.x = min( descr.x*norm, 0.2f ); + descr.y = min( descr.y*norm, 0.2f ); + descr.z = min( descr.z*norm, 0.2f ); + descr.w = min( descr.w*norm, 0.2f ); norm = descr.x * descr.x + descr.y * descr.y @@ -96,14 +96,14 @@ void NormalizeL2::normalize( const float* src_desc, float* dst_desc, const bool norm += popsift::shuffle_down( norm, 2 ); norm += popsift::shuffle_down( norm, 1 ); if( threadIdx.x == 0 ) { - norm = __fsqrt_rn( norm ); + norm = __frsqrt_rn( norm ); } norm = popsift::shuffle( norm, 0 ); - descr.x = min( descr.x, 0.2f*norm ); - descr.y = min( descr.y, 0.2f*norm ); - descr.z = min( descr.z, 0.2f*norm ); - descr.w = min( descr.w, 0.2f*norm ); + descr.x = min( descr.x*norm, 0.2f ); + descr.y = min( descr.y*norm, 0.2f ); + descr.z = min( descr.z*norm, 0.2f ); + descr.w = min( descr.w*norm, 0.2f ); norm = descr.x * descr.x + descr.y * descr.y diff --git a/src/popsift/s_desc_normalize.h b/src/popsift/s_desc_normalize.h index a87d0710..5e18b7a4 100644 --- a/src/popsift/s_desc_normalize.h +++ b/src/popsift/s_desc_normalize.h @@ -13,22 +13,19 @@ template __global__ -void normalize_histogram( ) +void normalize_histogram( Descriptor* descs, const int num_orientations ) { - Descriptor* descs = dbuf.desc; - const int num_orientations = dct.ori_total; - - int offset = blockIdx.x * 32 + threadIdx.y; + int offset = blockIdx.x * blockDim.y + threadIdx.y; // all of these threads are useless - if( blockIdx.x * 32 >= num_orientations ) return; + if( blockIdx.x * blockDim.y >= num_orientations ) return; + + offset = min( offset, num_orientations-1 ); - offset = ( offset < num_orientations ) ? offset - : num_orientations-1; - Descriptor* desc = &descs[offset]; + float* features = descs[offset].features; bool ignoreme = ( offset >= num_orientations ); - T::normalize( desc->features, ignoreme ); + T::normalize( features, ignoreme ); } diff --git a/src/popsift/s_desc_notile.cu b/src/popsift/s_desc_notile.cu index a336898b..992b0810 100644 --- a/src/popsift/s_desc_notile.cu +++ b/src/popsift/s_desc_notile.cu @@ -95,19 +95,19 @@ __global__ // __launch_bounds__(192) // 56/threads // no -- __launch_bounds__(128) // 63/thread // no -- no launch bound // 64/thread/thread -void ext_desc_notile( const int octave, +void ext_desc_notile( ExtremaBuffers* buf, + const int ori_count, + const int ori_base_index, cudaTextureObject_t texLinear ) { - const int num = dct.ori_ct[octave]; - const int offset = blockIdx.x * BLOCK_Z_NOTILE + threadIdx.z; - const int o_offset = dct.ori_ps[octave] + offset; - if( offset >= num ) return; + const int o_offset = ori_base_index + offset; + if( offset >= ori_count ) return; - Descriptor* desc = &dbuf.desc [o_offset]; - const int ext_idx = dobuf.feat_to_ext_map[o_offset]; - Extremum* ext = dobuf.extrema + ext_idx; + Descriptor* desc = &buf->desc[o_offset]; + const int ext_idx = buf->feat_to_ext_map[o_offset]; + Extremum* ext = buf->extrema + ext_idx; if( ext->sigma == 0 ) return; const float SBP = fabsf( DESC_MAGNIFY * ext->sigma ); @@ -130,7 +130,7 @@ void ext_desc_notile( const int octave, namespace popsift { -bool start_ext_desc_notile( int octave, Octave& oct_obj ) +bool start_ext_desc_notile( const ExtremaCounters* ct, ExtremaBuffers* buf, int octave, Octave& oct_obj ) { dim3 block; dim3 grid; @@ -139,7 +139,7 @@ bool start_ext_desc_notile( int octave, Octave& oct_obj ) block.y = 4; block.z = BLOCK_Z_NOTILE; - grid.x = grid_divide( hct.ori_ct[octave], block.z ); + grid.x = grid_divide( ct->ori_ct[octave], block.z ); grid.y = 1; grid.z = 1; @@ -147,7 +147,9 @@ bool start_ext_desc_notile( int octave, Octave& oct_obj ) ext_desc_notile <<>> - ( octave, + ( buf, + ct->ori_ct[octave], + ct->getOrientationBase( octave ), oct_obj.getDataTexLinear( ).tex ); cudaDeviceSynchronize(); cudaError_t err = cudaGetLastError( ); diff --git a/src/popsift/s_desc_notile.h b/src/popsift/s_desc_notile.h index 93a91cc3..2474f63c 100644 --- a/src/popsift/s_desc_notile.h +++ b/src/popsift/s_desc_notile.h @@ -14,6 +14,6 @@ namespace popsift { -bool start_ext_desc_notile( int octave, Octave& oct_obj ); +bool start_ext_desc_notile( const ExtremaCounters* ct, ExtremaBuffers* buf, int octave, Octave& oct_obj ); }; // namespace popsift diff --git a/src/popsift/s_desc_vlfeat.cu b/src/popsift/s_desc_vlfeat.cu new file mode 100644 index 00000000..ba85e8c4 --- /dev/null +++ b/src/popsift/s_desc_vlfeat.cu @@ -0,0 +1,258 @@ +/* + * Copyright 2016-2017, Simula Research Laboratory + * 2018-2020, University of Oslo + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ +#include "popsift/sift_config.h" + +#include "common/assist.h" +#include "common/debug_macros.h" +#include "common/vec_macros.h" +#include "s_desc_vlfeat.h" +#include "s_gradiant.h" +#include "sift_constants.h" +#include "sift_pyramid.h" + +#include + +using namespace popsift; + +__device__ static inline +void ext_desc_vlfeat_sub( const float ang, + const Extremum* ext, + float* __restrict__ features, + cudaTextureObject_t layer_tex, + const int width, + const int height ) +{ + const float x = ext->xpos; + const float y = ext->ypos; + const int level = ext->lpos; // old_level; + const float sig = ext->sigma; + const float SBP = fabsf(DESC_MAGNIFY * sig); + + if( SBP == 0 ) { + return; + } + + float cos_t; + float sin_t; + __sincosf( ang, &sin_t, &cos_t ); + + const float csbp = cos_t * SBP; + const float ssbp = sin_t * SBP; + const float crsbp = cos_t / SBP; + const float srsbp = sin_t / SBP; + + // We have 4x4*16 bins. + // There centers have the offsets -1.5, -0.5, 0.5, 1.5 from the + // keypoint. The points that support them stretch from -2 to 2 + const float2 maxdist = make_float2( -2.0f, -2.0f ); + + // We rotate the corner of the maximum range by the keypoint orientation. + // const float ptx = csbp * maxdist - ssbp * maxdist; + // const float pty = csbp * maxdist + ssbp * maxdist; + const float ptx = fabsf( ::fmaf( csbp, maxdist.x, -ssbp * maxdist.y ) ); + const float pty = fabsf( ::fmaf( csbp, maxdist.y, ssbp * maxdist.x ) ); + + const float bsz = 2.0f * ( fabsf(csbp) + fabsf(ssbp) ); + + const int xmin = max(1, (int)floorf(x - ptx - bsz)); + const int ymin = max(1, (int)floorf(y - pty - bsz)); + const int xmax = min(width - 2, (int)floorf(x + ptx + bsz)); + const int ymax = min(height - 2, (int)floorf(y + pty + bsz)); + + __shared__ float dpt[128]; + +#if POPSIFT_IS_DEFINED(POPSIFT_HAVE_COOPERATIVE_GROUPS) + cg::thread_block block = cg::this_thread_block(); + cg::thread_block_tile<32> tile = cg::tiled_partition<32>( block ); + + for( int i=tile.thread_rank(); i<128; i+=tile.size() ) + { + dpt[i] = 0.0f; + } + + tile.sync(); +#else + for( int i=threadIdx.x; i<128; i+=blockDim.x ) + { + dpt[i] = 0.0f; + } + + __syncthreads(); +#endif + + for( int pix_y = ymin; pix_y <= ymax; pix_y += 1 ) + { + for( int base_x = xmin; base_x <= xmax; base_x += 32 ) + { + float mod; + float th; + +#if POPSIFT_IS_DEFINED(POPSIFT_HAVE_COOPERATIVE_GROUPS) + get_gradiant32( tile, mod, th, base_x, pix_y, layer_tex, level ); +#else + get_gradiant32( mod, th, base_x, pix_y, layer_tex, level ); +#endif + + mod /= 2.0f; // Our mod is double that of vlfeat. Huh. + + th -= ang; + while( th > M_PI2 ) th -= M_PI2; + while( th < 0.0f ) th += M_PI2; +#if POPSIFT_IS_DEFINED(POPSIFT_HAVE_COOPERATIVE_GROUPS) + tile.sync(); + + const int pix_x = base_x + tile.thread_rank(); +#else + __syncthreads(); + + const int pix_x = base_x + threadIdx.x; +#endif + + if( ( pix_y <= ymax ) && ( pix_x <= xmax ) ) + { +#if POPSIFT_IS_DEFINED(POPSIFT_HAVE_COOPERATIVE_GROUPS) + cg::coalesced_group co_tile = cg::coalesced_threads(); +#endif + + // d : distance from keypoint + const float2 d = make_float2( pix_x - x, pix_y - y ); + + // n : normalized distance from keypoint + const float2 n = make_float2( ::fmaf( crsbp, d.x, srsbp * d.y ), + ::fmaf( crsbp, d.y, -srsbp * d.x ) ); + + const float ww = __expf( -scalbnf(n.x*n.x + n.y*n.y, -3)); + + const float nt = 8.0f * th / M_PI2; + + // neighbouring tile on the lower side: -2, -1, 0 or 1 + // (must use floorf because casting rounds towards zero + const int3 t0 = make_int3( (int)floorf(n.x - 0.5f), + (int)floorf(n.y - 0.5f), + (int)nt ); + const float wgt_x = - ( n.x - ( t0.x + 0.5f ) ); + const float wgt_y = - ( n.y - ( t0.y + 0.5f ) ); + const float wgt_t = - ( nt - t0.z ); + + for( int tx=0; tx<2; tx++ ) + { + for( int ty=0; ty<2; ty++ ) + { + for( int tt=0; tt<2; tt++ ) + { + if( ( t0.y + ty >= -2 ) && + ( t0.y + ty < 2 ) && + ( t0.x + tx >= -2 ) && + ( t0.x + tx < 2 ) ) + { + float i_wgt_x = ( tx == 0 ) ? 1.0f + wgt_x : wgt_x; + float i_wgt_y = ( ty == 0 ) ? 1.0f + wgt_y : wgt_y; + float i_wgt_t = ( tt == 0 ) ? 1.0f + wgt_t : wgt_t; + + i_wgt_x = fabsf( i_wgt_x ); + i_wgt_y = fabsf( i_wgt_y ); + i_wgt_t = fabsf( i_wgt_t ); + + const float val = ww + * mod + * i_wgt_x + * i_wgt_y + * i_wgt_t; + + const int offset = 80 + + ( t0.y + ty ) * 32 + + ( t0.x + tx ) * 8 + + ( t0.z + tt ) % 8; + + atomicAdd( &dpt[offset], val ); + } + +#if POPSIFT_IS_DEFINED(POPSIFT_HAVE_COOPERATIVE_GROUPS) + co_tile.sync(); +#else + // cannot be done before CUDA 9 +#endif + } + } + } + } +#if POPSIFT_IS_DEFINED(POPSIFT_HAVE_COOPERATIVE_GROUPS) + tile.sync(); +#else + __syncthreads(); +#endif + } + } + +#if POPSIFT_IS_DEFINED(POPSIFT_HAVE_COOPERATIVE_GROUPS) + for( int i=tile.thread_rank(); i<128; i+=tile.size() ) + { + features[i] = dpt[i]; + } +#else + for( int i=threadIdx.x; i<128; i+=blockDim.x ) + { + features[i] = dpt[i]; + } +#endif +} + +__global__ void ext_desc_vlfeat( ExtremaBuffers* buf, const int ori_base_index, cudaTextureObject_t layer_tex, int w, int h) +{ + const int o_offset = ori_base_index + blockIdx.x; + Descriptor* desc = &buf->desc[o_offset]; + const int ext_idx = buf->feat_to_ext_map[o_offset]; + Extremum* ext = buf->extrema + ext_idx; + + const int ext_base = ext->idx_ori; + const int ori_num = o_offset - ext_base; + const float ang = ext->orientation[ori_num]; + + ext_desc_vlfeat_sub( ang, + ext, + desc->features, + layer_tex, + w, + h ); +} + +namespace popsift +{ + +bool start_ext_desc_vlfeat( const ExtremaCounters* ct, ExtremaBuffers* buf, const int octave, Octave& oct_obj ) +{ + dim3 block; + dim3 grid; + grid.x = ct->ori_ct[octave]; + grid.y = 1; + grid.z = 1; + + if( grid.x == 0 ) return false; + + block.x = 32; + block.y = 1; + block.z = 1; + + size_t shared_size = 4 * 128 * sizeof(float); + + ext_desc_vlfeat + <<>> + ( buf, + ct->getOrientationBase( octave ), + oct_obj.getDataTexPoint( ), + oct_obj.getWidth(), + oct_obj.getHeight() ); + + POP_SYNC_CHK; + + return true; +} + +}; // namespace popsift + diff --git a/src/popsift/s_desc_vlfeat.h b/src/popsift/s_desc_vlfeat.h new file mode 100644 index 00000000..2dca9c04 --- /dev/null +++ b/src/popsift/s_desc_vlfeat.h @@ -0,0 +1,19 @@ +/* + * Copyright 2016-2017, Simula Research Laboratory + * 2018-2020, University of Oslo + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ +#pragma once +#include "sift_octave.h" +#include "sift_pyramid.h" + +namespace popsift +{ + +bool start_ext_desc_vlfeat( const ExtremaCounters* ct, ExtremaBuffers* buf, const int octave, Octave& oct_obj ); + +}; // namespace popsift + diff --git a/src/popsift/s_extrema.cu b/src/popsift/s_extrema.cu index 5c1acc44..d79daf15 100644 --- a/src/popsift/s_extrema.cu +++ b/src/popsift/s_extrema.cu @@ -299,7 +299,7 @@ public: template __device__ inline bool find_extrema_in_dog_sub(cudaTextureObject_t dog, - int debug_octave, + int octave, int width, int height, uint32_t maxlevel, @@ -308,10 +308,7 @@ __device__ inline bool find_extrema_in_dog_sub(cudaTextureObject_t dog, int grid_width, InitialExtremum& ec) { - ec.xpos = 0.0f; - ec.ypos = 0.0f; - ec.lpos = 0; - ec.sigma = 0.0f; + ec.set( octave, 0.0f, 0.0f, 0.0f, 0 ); /* * First consideration: extrema cannot be found on any outermost edge, @@ -344,7 +341,7 @@ __device__ inline bool find_extrema_in_dog_sub(cudaTextureObject_t dog, if( ! f.first_contrast_ok( val ) ) return false; if( ! is_extremum( dog, x-1, y-1, level-1 ) ) { - // if( debug_octave==0 && level==2 && x==14 && y==73 ) printf("But I fail\n"); + // if( octave==0 && level==2 && x==14 && y==73 ) printf("But I fail\n"); return false; } @@ -482,22 +479,26 @@ __device__ inline bool find_extrema_in_dog_sub(cudaTextureObject_t dog, /* accept-reject extremum */ // if( fabsf(contr) < (d_consts.threshold*2.0f) ) - if( fabsf(contr) < scalbnf( d_consts.threshold, 1 ) ) + if( d_consts.threshold > 0.0f ) { - return false; + if( fabsf(contr) < scalbnf( d_consts.threshold, 1 ) ) + { + return false; + } } - /* reject condition: tr(H)^2/det(H) < (r+1)^2/r */ - if( edgeval >= (d_consts.edge_limit+1.0f)*(d_consts.edge_limit+1.0f)/d_consts.edge_limit ) { - return false; + if( d_consts.edge_limit > 0.0f ) + { + /* reject condition: tr(H)^2/det(H) < (r+1)^2/r */ + if( edgeval >= (d_consts.edge_limit+1.0f)*(d_consts.edge_limit+1.0f)/d_consts.edge_limit ) + { + return false; + } } - ec.xpos = xn; - ec.ypos = yn; - ec.lpos = (int)roundf(sn); - ec.sigma = d_consts.sigma0 * pow(d_consts.sigma_k, sn); // * 2; - ec.cell = floorf( yn / h_grid_divider ) * grid_width + floorf( xn / w_grid_divider ); - // const float sigma_k = powf(2.0f, 1.0f / levels ); + ec.set( octave, + xn, yn, sn, + floorf( yn / h_grid_divider ) * grid_width + floorf( xn / w_grid_divider ) ); return true; } @@ -510,6 +511,8 @@ void find_extrema_in_dog( cudaTextureObject_t dog, int width, int height, const uint32_t maxlevel, + ExtremaCounters* ct, + ExtremaBuffers* buf, int* d_number_of_blocks, int number_of_blocks, const float w_grid_divider, @@ -517,7 +520,6 @@ void find_extrema_in_dog( cudaTextureObject_t dog, const int grid_width ) { InitialExtremum ec; - ec.ignore = false; bool indicator = find_extrema_in_dog_sub( dog, octave, @@ -529,13 +531,12 @@ void find_extrema_in_dog( cudaTextureObject_t dog, grid_width, ec ); - uint32_t write_index = extrema_count( indicator, &dct.ext_ct[octave] ); + uint32_t write_index = extrema_count( indicator, &ct->ext_ct[octave] ); - InitialExtremum* d_extrema = dobuf.i_ext_dat[octave]; - int* d_ext_off = dobuf.i_ext_off[octave]; + InitialExtremum* d_extrema = buf->i_ext_dat[octave]; + int* d_ext_off = buf->i_ext_off[octave]; if( indicator && write_index < d_consts.max_extrema ) { - ec.write_index = write_index; // store the initial extremum in an array d_extrema[write_index] = ec; @@ -549,10 +550,13 @@ void find_extrema_in_dog( cudaTextureObject_t dog, __syncthreads(); if( threadIdx.x == 0 && threadIdx.y == 0 ) { - int ct = atomicAdd( d_number_of_blocks, 1 ); - if( ct >= number_of_blocks-1 ) { - int num_ext = atomicMin( &dct.ext_ct[octave], d_consts.max_extrema ); - // printf( "Block %d,%d,%d num ext %d\n", blockIdx.x, blockIdx.y, blockIdx.z, dct.ext_ct[octave] ); + // This works as a global barrier. Only thread 0 of the last thread block + // that finishes will call the atomicMin function. + // The atomicMin enforces an upper limit on the octave's number of extrema. + int count = atomicAdd( d_number_of_blocks, 1 ); + if( count >= number_of_blocks-1 ) { + int num_ext = atomicMin( &ct->ext_ct[octave], d_consts.max_extrema ); + // printf( "Block %d,%d,%d num ext %d\n", blockIdx.x, blockIdx.y, blockIdx.z, ct->ext_ct[octave] ); } } } @@ -595,6 +599,8 @@ void Pyramid::find_extrema( const Config& conf ) cols, rows, _levels-1, + _ct, + _buf, num_blocks, grid.x * grid.y, oct_obj.getWGridDivider(), @@ -610,6 +616,8 @@ void Pyramid::find_extrema( const Config& conf ) cols, rows, _levels-1, + _ct, + _buf, num_blocks, grid.x * grid.y, oct_obj.getWGridDivider(), @@ -625,6 +633,8 @@ void Pyramid::find_extrema( const Config& conf ) cols, rows, _levels-1, + _ct, + _buf, num_blocks, grid.x * grid.y, oct_obj.getWGridDivider(), @@ -639,5 +649,15 @@ void Pyramid::find_extrema( const Config& conf ) } } +void Pyramid::finalize_extrema( ) +{ + /* Complete the extrema computation by computing the prefix sum of + * extrema counts for all octaves. + */ + readDescCountersFromDevice( ); + + _ct->make_extrema_prefix_sums(); +} + } // namespace popsift diff --git a/src/popsift/s_filtergrid.cu b/src/popsift/s_filtergrid.cu deleted file mode 100644 index 301c6a96..00000000 --- a/src/popsift/s_filtergrid.cu +++ /dev/null @@ -1,339 +0,0 @@ -/* - * Copyright 2016-2017, Simula Research Laboratory - * - * This Source Code Form is subject to the terms of the Mozilla Public - * License, v. 2.0. If a copy of the MPL was not distributed with this - * file, You can obtain one at http://mozilla.org/MPL/2.0/. - */ -#include "sift_config.h" -#include "sift_extremum.h" -#include "sift_pyramid.h" - -#if POPSIFT_IS_DEFINED(POPSIFT_USE_NVTX) -#include -#else -#define nvtxRangePushA(a) -#define nvtxRangePop() -#endif - -#if ! POPSIFT_IS_DEFINED(POPSIFT_DISABLE_GRID_FILTER) - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace popsift -{ - -struct FunctionSort_IncCell_DecScale -{ - __device__ - inline bool operator()( const thrust::tuple& l, const thrust::tuple& r ) const - { - return ( ( thrust::get<0>(l) < thrust::get<0>(r) ) || - ( thrust::get<0>(l) == thrust::get<0>(r) && thrust::get<1>(l) > thrust::get<1>(r) ) ); - } -}; - -struct FunctionSort_IncCell_IncScale -{ - __device__ - inline bool operator()( const thrust::tuple& l, const thrust::tuple& r ) const - { - return ( ( thrust::get<0>(l) < thrust::get<0>(r) ) || - ( thrust::get<0>(l) == thrust::get<0>(r) && thrust::get<1>(l) < thrust::get<1>(r) ) ); - } -}; - -struct FunctionExtractCell -{ - __device__ - inline thrust::tuple operator()( const thrust::tuple& val) const - { - /* During the filter stage, the i_ext_dat array is still compact (all intial - * extrema do still have ignore==false), so that we can access every entry - * directly). - */ - const int octave = thrust::get<0>(val); - const int idx = thrust::get<1>(val); - InitialExtremum& e = dobuf.i_ext_dat[octave][idx]; - - return thrust::make_tuple( e.cell, e.sigma * powf( 2.0f, octave ) ); - } -}; - -struct FunctionIsAbove -{ - int _limit; - explicit FunctionIsAbove( int limit ) : _limit(limit) { } - - __host__ __device__ - inline bool operator()( int val ) const - { - return val > _limit; - } -}; - -struct FunctionDisableExtremum -{ - __device__ - inline void operator()( const thrust::tuple& val) const - { - const int octave = thrust::get<0>(val); - const int idx = thrust::get<1>(val); - InitialExtremum& e = dobuf.i_ext_dat[octave][idx]; - e.ignore = true; - } -}; - -struct FunctionExtractIgnored -{ - __device__ - inline int operator()( int idx, int octave ) const - { - InitialExtremum& e = dobuf.i_ext_dat[octave][idx]; - if( e.ignore ) - return 0; - else - return 1; - } -}; - -/* discard extrema that exceed a conf.getFilterMaxExtrema() */ -__host__ -int Pyramid::extrema_filter_grid( const Config& conf, int ext_total ) -{ - /* At this time, we have host-side information about ext_ct[o], the number - * of extrema we have found in octave o, and we have summed it up on the - * host size. However, other values in the hct and dct data structures - * have not been computed yet. - * The extrema are only known in the InitialExtrema structure. We want to - * perform grid filtering before their orientation is computed and they - * are copied into the larger Extrema data structure. - */ - const int slots = conf.getFilterGridSize(); - - thrust::device_vector octave_index( ext_total ); - thrust::device_vector iext_index ( ext_total ); - thrust::device_vector cell_values ( ext_total ); - thrust::device_vector scale_values( ext_total ); - thrust::device_vector cell_counts ( slots * slots ); - thrust::device_vector cell_offsets( slots * slots ); - - int sum = 0; - for( int o=0; o 0 ) { - cudaStream_t oct_str = _octaves[o].getStream(); - - // fill a continuous device array with octave of all initial extrema - thrust::fill( thrust::cuda::par.on(oct_str), - octave_index.begin() + sum, - octave_index.begin() + sum + ocount, - o ); - // fill a continuous device array with index within octave of all initial extrema - thrust::sequence( thrust::cuda::par.on(oct_str), - iext_index.begin() + sum, - iext_index.begin() + sum + ocount ); - sum += ocount; - } - } - - cudaDeviceSynchronize(); - - // extract cell and scale value for all initial extrema - FunctionExtractCell fun_extract_cell; - - thrust::transform( thrust::make_zip_iterator( thrust::make_tuple( octave_index.begin(), - iext_index.begin() ) ), - thrust::make_zip_iterator( thrust::make_tuple( octave_index.end(), - iext_index.end() ) ), - thrust::make_zip_iterator( thrust::make_tuple( cell_values.begin(), - scale_values.begin() ) ), - fun_extract_cell ); - if( conf.getFilterSorting() == Config::LargestScaleFirst ) - { - FunctionSort_IncCell_DecScale fun_sort; - thrust::sort_by_key( - thrust::make_zip_iterator( thrust::make_tuple( cell_values.begin(), - scale_values.begin() ) ), - thrust::make_zip_iterator( thrust::make_tuple( cell_values.end(), - scale_values.end() ) ), - thrust::make_zip_iterator( thrust::make_tuple( octave_index.begin(), - iext_index. begin() ) ), - fun_sort ); - } - else if( conf.getFilterSorting() == Config::SmallestScaleFirst ) - { - FunctionSort_IncCell_IncScale fun_sort; - thrust::sort_by_key( - thrust::make_zip_iterator( thrust::make_tuple( cell_values.begin(), - scale_values.begin() ) ), - thrust::make_zip_iterator( thrust::make_tuple( cell_values.end(), - scale_values.end() ) ), - thrust::make_zip_iterator( thrust::make_tuple( octave_index.begin(), - iext_index. begin() ) ), - fun_sort ); - } - else - { - // sort (octave,index,scale) tuples by their cell values (in which cell are they located) - thrust::sort_by_key( - cell_values.begin(), - cell_values.end(), - thrust::make_zip_iterator( thrust::make_tuple( octave_index.begin(), - iext_index. begin(), - scale_values.begin() ) ) ); - } - - // count the number of entries in all cells (in one operation instead of several reduce_if) - thrust::reduce_by_key( cell_values.begin(), cell_values.end(), - thrust::make_constant_iterator(1), - thrust::make_discard_iterator(), - cell_counts.begin() ); - - // compute the offsets from cell_values start for each of the (pre-sorted) cell values - thrust::exclusive_scan( cell_counts.begin(), cell_counts.end(), cell_offsets.begin() ); - - const int n = slots * slots; - thrust::host_vector h_cell_counts ( n ); - thrust::host_vector h_cell_permute ( n ); - thrust::host_vector h_cell_offsets ( n ); - thrust::host_vector h_cell_limits ( n ); - thrust::host_vector cell_count_prefix_sums( n ); - thrust::host_vector cell_count_sumup ( n ); - - // move to host code - computing the limits on the GPU is too wasteful - h_cell_counts = cell_counts; - - // offset to beginning of each cell value - recompute faster than copy - thrust::exclusive_scan( h_cell_counts.begin(), h_cell_counts.end(), h_cell_offsets.begin() ); - - // offset to end indeces of each cell value - could shift h_cell_offsets and sum up counts - thrust::inclusive_scan( h_cell_counts.begin(), h_cell_counts.end(), h_cell_limits .begin() ); - - // the cell filter algorithm requires the cell counts in increasing order, cell_permute - // maps new position to original index - thrust::sequence( h_cell_permute.begin(), h_cell_permute.end() ); - thrust::sort_by_key( h_cell_counts.begin(), h_cell_counts.end(), h_cell_permute.begin() ); - - // several steps to find the cells that must loose extrema - - // inclusive prefix sum - thrust::inclusive_scan( h_cell_counts.begin(), h_cell_counts.end(), cell_count_prefix_sums.begin() ); - - thrust::host_vector h_reverse_index(n); - thrust::sequence( h_reverse_index.begin(), h_reverse_index.end(), - n-1, - -1 ); - - // sumup[i] = prefix sum[i] + sum( cell[i] copied into remaining cells ) - thrust::transform( h_cell_counts.begin(), h_cell_counts.end(), - h_reverse_index.begin(), - cell_count_sumup.begin(), - thrust::multiplies() ); - thrust::transform( cell_count_sumup.begin(), cell_count_sumup.end(), - cell_count_prefix_sums.begin(), - cell_count_sumup.begin(), - thrust::plus() ); - - FunctionIsAbove function_is_above( conf.getFilterMaxExtrema() ); - - // count cells that are above the extrema limit after the summing. Those must share the - // reduction of extrema - int ct = thrust::count_if( cell_count_sumup.begin(), cell_count_sumup.end(), - function_is_above ); - - float tailaverage = float( thrust::reduce( &h_cell_counts[n-ct], &h_cell_counts[n] ) ) / ct; - - int newlimit = ::ceilf( tailaverage - ( ext_total - conf.getFilterMaxExtrema() ) / ct ); - - // clamp all cells to the computed limit - the total is now less than n extrema off - thrust::transform( h_cell_counts.begin(), h_cell_counts.end(), - thrust::make_constant_iterator(newlimit), - h_cell_counts.begin(), - thrust::minimum() ); - - // back to original order - thrust::sort_by_key( h_cell_permute.begin(), h_cell_permute.end(), h_cell_counts.begin() ); - - // transfer counts back to device - cell_counts = h_cell_counts; - - for( int i=0; i grid( ext_total ); - - int ret_ext_total = 0; - - for( int o=0; o 0 ) { - FunctionExtractIgnored fun_extract_ignore; - thrust::identity fun_id; - - grid.resize( ocount ); - - thrust::transform( - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(ocount), - thrust::make_constant_iterator(o), - grid.begin(), - fun_extract_ignore ); - - thrust::device_ptr off_ptr = thrust::device_pointer_cast( dobuf_shadow.i_ext_off[o] ); - - thrust::copy_if( thrust::make_counting_iterator(0), - thrust::make_counting_iterator(ocount), - grid.begin(), - off_ptr, - fun_id ); - - hct.ext_ct[o] = thrust::reduce( grid.begin(), grid.end() ); - - ret_ext_total += hct.ext_ct[o]; - } - } - - nvtxRangePushA( "writing back count" ); - writeDescCountersToDevice( ); - nvtxRangePop( ); - - return ret_ext_total; -} -}; // namespace popsift - -#else // not defined(DISABLE_GRID_FILTER) - -namespace popsift -{ -/* do nothing unless we have CUDA v 8 or newer */ -__host__ -int Pyramid::extrema_filter_grid( const Config& conf, int ext_total ) -{ - return ext_total; -} -}; // namespace popsift - -#endif // not defined(DISABLE_GRID_FILTER) - diff --git a/src/popsift/s_gradiant.h b/src/popsift/s_gradiant.h index aaec9e2d..3f648cf5 100644 --- a/src/popsift/s_gradiant.h +++ b/src/popsift/s_gradiant.h @@ -14,6 +14,10 @@ #include #include +#include "popsift/sift_config.h" + +#include "s_gradiant_cuda9plus.h" // functions that work with CUDA SDK 9 and later + namespace popsift { /* @@ -68,6 +72,39 @@ void get_gradiant( float& grad, theta = atan2f(dy, dx); } +#if POPSIFT_IS_UNDEFINED(POPSIFT_HAVE_COOPERATIVE_GROUPS) +/* A version of get_gradiant that works for a (32,1,1) threadblock + * and pulls data to shared memory before computing. Data is pulled + * less frequently, meaning that we do not rely on the L1 cache. + */ +__device__ static inline +void get_gradiant32( float& grad, + float& theta, + const int x, + const int y, + cudaTextureObject_t layer, + const int level ) +{ + const int idx = threadIdx.x; + + __shared__ float x_array[34]; + + for( int i=idx; i<34; i += blockDim.x ) + { + x_array[i] = readTex( layer, x+i-1.0f, y, level ); + } + __syncthreads(); + + const float dx = x_array[idx+2] - x_array[idx]; + + const float dy = readTex( layer, x+idx, y+1.0f, level ) + - readTex( layer, x+idx, y-1.0f, level ); + + grad = hypotf( dx, dy ); // __fsqrt_rz(dx*dx + dy*dy); + theta = atan2f(dy, dx); +} +#endif + __device__ static inline void get_gradiant( float& grad, float& theta, diff --git a/src/popsift/s_gradiant_cuda9plus.h b/src/popsift/s_gradiant_cuda9plus.h new file mode 100644 index 00000000..6fe24623 --- /dev/null +++ b/src/popsift/s_gradiant_cuda9plus.h @@ -0,0 +1,54 @@ +/* + * Copyright 2021, University of Oslo + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ +#pragma once + +#if POPSIFT_IS_DEFINED(POPSIFT_HAVE_COOPERATIVE_GROUPS) + +#include + +namespace cg = cooperative_groups; + +namespace popsift +{ + +/* A version of get_gradiant that works for a (32,1,1) threadblock + * and pulls data to shared memory before computing. Data is pulled + * less frequently, meaning that we do not rely on the L1 cache. + */ +__device__ static inline +void get_gradiant32( cg::thread_block_tile<32>& tile, + float& grad, + float& theta, + const int x, + const int y, + cudaTextureObject_t layer, + const int level ) +{ + const int idx = tile.thread_rank(); + + __shared__ float x_array[34]; + + for( int i=idx; i<34; i += tile.size() ) + { + x_array[i] = readTex( layer, x+i-1.0f, y, level ); + } + tile.sync(); + + const float dx = x_array[idx+2] - x_array[idx]; + + const float dy = readTex( layer, x+idx, y+1.0f, level ) + - readTex( layer, x+idx, y-1.0f, level ); + + grad = hypotf( dx, dy ); // __fsqrt_rz(dx*dx + dy*dy); + theta = atan2f(dy, dx); +} + +}; // namespace popsift + +#endif // POPSIFT_HAVE_COOPERATIVE_GROUPS + diff --git a/src/popsift/s_orientation.cu b/src/popsift/s_orientation.cu index f6b36fcd..401a897c 100644 --- a/src/popsift/s_orientation.cu +++ b/src/popsift/s_orientation.cu @@ -28,36 +28,14 @@ using namespace popsift; using namespace std; -/* Smoothing like VLFeat is the default mode. - * If you choose to undefine it, you get the smoothing approach taken by OpenCV - */ -#define WITH_VLFEAT_SMOOTHING - namespace popsift { -__device__ -inline float compute_angle( int bin, float hc, float hn, float hp ) -{ - /* interpolate */ - float di = bin + 0.5f * (hn - hp) / (hc+hc-hn-hp); - - /* clamp */ - di = (di < 0) ? - (di + ORI_NBINS) : - ((di >= ORI_NBINS) ? (di - ORI_NBINS) : (di)); - - float th = __fdividef( M_PI2 * di, ORI_NBINS ) - M_PI; - // float th = ((M_PI2 * di) / ORI_NBINS); - return th; -} - /* * Histogram smoothing helper */ -template -__device__ -inline static float smoothe( const float* const src, const int bin ) +__device__ inline static +float smoothe( const float* const src, const int bin ) { const int prev = (bin == 0) ? ORI_NBINS-1 : bin-1; const int next = (bin == ORI_NBINS-1) ? 0 : bin+1; @@ -74,17 +52,19 @@ inline static float smoothe( const float* const src, const int bin ) */ __global__ void ori_par( const int octave, - const int ext_ct_prefix_sum, + ExtremaCounters* ct, + ExtremaBuffers* buf, cudaTextureObject_t layer, const int w, const int h ) { - const int extremum_index = blockIdx.x * blockDim.y; + const int& ext_ct_prefix_sum = ct->getExtremaBase( octave ); + const int extremum_index = blockIdx.x * blockDim.y; - if( popsift::all( extremum_index >= dct.ext_ct[octave] ) ) return; // a few trailing warps + if( popsift::all( extremum_index >= ct->ext_ct[octave] ) ) return; // a few trailing warps - const int iext_off = dobuf.i_ext_off[octave][extremum_index]; - const InitialExtremum* iext = &dobuf.i_ext_dat[octave][iext_off]; + const int iext_off = buf->i_ext_off[octave][extremum_index]; + const InitialExtremum* iext = &buf->i_ext_dat[octave][iext_off]; __shared__ float hist [64]; __shared__ float sm_hist [64]; @@ -95,14 +75,14 @@ void ori_par( const int octave, hist[threadIdx.x+32] = 0.0f; /* keypoint fractional geometry */ - const float x = iext->xpos; - const float y = iext->ypos; - const int level = iext->lpos; // old_level; - const float sig = iext->sigma; + const float& x = iext->getX(); + const float& y = iext->getY(); + const int& level = iext->getLevel(); + const float& sig = iext->getSigma(); /* orientation histogram radius */ const float sigw = ORI_WINFACTOR * sig; - const int32_t rad = (int)roundf((3.0f * sigw)); + const int32_t rad = max( (int)floorf((3.0f * sigw)), 1 ); const float factor = __fdividef( -0.5f, (sigw * sigw) ); const int sq_thres = rad * rad; @@ -111,10 +91,10 @@ void ori_par( const int octave, // int xmax = min(w - 2, (int)floor(x + rad)); // int ymin = max(1, (int)floor(y - rad)); // int ymax = min(h - 2, (int)floor(y + rad)); - int xmin = max(1, (int)roundf(x) - rad); - int xmax = min(w - 2, (int)roundf(x) + rad); - int ymin = max(1, (int)roundf(y) - rad); - int ymax = min(h - 2, (int)roundf(y) + rad); + int xmin = max(0, (int)roundf(x) - rad); + int xmax = min(w - 1, (int)roundf(x) + rad); + int ymin = max(0, (int)roundf(y) - rad); + int ymax = min(h - 1, (int)roundf(y) + rad); int wx = xmax - xmin + 1; int hy = ymax - ymin + 1; @@ -136,25 +116,21 @@ void ori_par( const int octave, layer, level ); + grad /= 2; // our grad is twice that of VLFeat - weird + if( theta < 0 ) theta += M_PI2; + float dx = xx - x; float dy = yy - y; - int sq_dist = dx * dx + dy * dy; - if (sq_dist <= sq_thres) + float sq_dist = dx * dx + dy * dy; + if (sq_dist <= sq_thres + 0.6f) { float weight = grad * expf(sq_dist * factor); - // int bidx = (int)rintf( __fdividef( ORI_NBINS * (theta + M_PI), M_PI2 ) ); int bidx = (int)roundf( __fdividef( float(ORI_NBINS) * (theta + M_PI), M_PI2 ) ); - if( bidx > ORI_NBINS ) { - printf("Crashing: bin %d theta %f :-)\n", bidx, theta); - } - if( bidx < 0 ) { - printf("Crashing: bin %d theta %f :-)\n", bidx, theta); - } - - bidx = (bidx == ORI_NBINS) ? 0 : bidx; + while( bidx < 0 ) bidx += ORI_NBINS; + while( bidx >= ORI_NBINS ) bidx -= ORI_NBINS; atomicAdd( &hist[bidx], weight ); } @@ -162,62 +138,52 @@ void ori_par( const int octave, } __syncthreads(); -#ifdef WITH_VLFEAT_SMOOTHING for( int i=0; i<3 ; i++ ) { - sm_hist[threadIdx.x+ 0] = smoothe<0>( hist, threadIdx.x+ 0 ); - sm_hist[threadIdx.x+32] = smoothe<1>( hist, threadIdx.x+32 ); + sm_hist[threadIdx.x+ 0] = smoothe( hist, threadIdx.x+ 0 ); + sm_hist[threadIdx.x+32] = smoothe( hist, threadIdx.x+32 ); __syncthreads(); - hist[threadIdx.x+ 0] = smoothe<2>( sm_hist, threadIdx.x+ 0 ); - hist[threadIdx.x+32] = smoothe<3>( sm_hist, threadIdx.x+32 ); + hist[threadIdx.x+ 0] = smoothe( sm_hist, threadIdx.x+ 0 ); + hist[threadIdx.x+32] = smoothe( sm_hist, threadIdx.x+32 ); __syncthreads(); } sm_hist[threadIdx.x+ 0] = hist[threadIdx.x+ 0]; sm_hist[threadIdx.x+32] = hist[threadIdx.x+32]; __syncthreads(); -#else // not WITH_VLFEAT_SMOOTHING - for( int bin = threadIdx.x; bin < ORI_NBINS; bin += blockDim.x ) { - int prev2 = bin - 2; - int prev1 = bin - 1; - int next1 = bin + 1; - int next2 = bin + 2; - if( prev2 < 0 ) prev2 += ORI_NBINS; - if( prev1 < 0 ) prev1 += ORI_NBINS; - if( next1 >= ORI_NBINS ) next1 -= ORI_NBINS; - if( next2 >= ORI_NBINS ) next2 -= ORI_NBINS; - sm_hist[bin] = ( hist[prev2] + hist[next2] - + ( hist[prev1] + hist[next1] ) * 4.0f - + hist[bin] * 6.0f ) / 16.0f; - } - __syncthreads(); -#endif // not WITH_VLFEAT_SMOOTHING + + if( threadIdx.x+32 >= ORI_NBINS ) sm_hist[threadIdx.x+32] = -INFINITY; + float maxval = fmaxf( sm_hist[threadIdx.x+ 0], sm_hist[threadIdx.x+32] ); + float neigh; + neigh = popsift::shuffle_down( maxval, 16 ); maxval = fmaxf( maxval, neigh ); + neigh = popsift::shuffle_down( maxval, 8 ); maxval = fmaxf( maxval, neigh ); + neigh = popsift::shuffle_down( maxval, 4 ); maxval = fmaxf( maxval, neigh ); + neigh = popsift::shuffle_down( maxval, 2 ); maxval = fmaxf( maxval, neigh ); + neigh = popsift::shuffle_down( maxval, 1 ); maxval = fmaxf( maxval, neigh ); + maxval = popsift::shuffle( maxval, 0 ); // sub-cell refinement of the histogram cell index, yielding the angle // not necessary to initialize, every cell is computed - for( int bin = threadIdx.x; popsift::any( bin < ORI_NBINS ); bin += blockDim.x ) { - const int prev = bin == 0 ? ORI_NBINS-1 : bin-1; - const int next = bin == ORI_NBINS-1 ? 0 : bin+1; + for( int bin = threadIdx.x; popsift::any( bin < ORI_NBINS ); bin += blockDim.x ) + { + const int prev = ( bin - 1 + ORI_NBINS ) % ORI_NBINS; + const int next = ( bin + 1 ) % ORI_NBINS; - bool predicate = ( bin < ORI_NBINS ) && ( sm_hist[bin] > max( sm_hist[prev], sm_hist[next] ) ); + bool predicate = ( bin < ORI_NBINS ) && + ( sm_hist[bin] > max( sm_hist[prev], sm_hist[next] ) ) && + ( sm_hist[bin] > 0.8f * maxval ); const float num = predicate ? 3.0f * sm_hist[prev] - 4.0f * sm_hist[bin] + 1.0f * sm_hist[next] : 0.0f; - // const float num = predicate ? 2.0f * sm_hist[prev] - // - 4.0f * sm_hist[bin] - // + 2.0f * sm_hist[next] - // : 0.0f; const float denB = predicate ? 2.0f * ( sm_hist[prev] - 2.0f * sm_hist[bin] + sm_hist[next] ) : 1.0f; const float newbin = __fdividef( num, denB ); // verified: accuracy OK - predicate = ( predicate && newbin >= 0.0f && newbin <= 2.0f ); - refined_angle[bin] = predicate ? prev + newbin : -1; - yval[bin] = predicate ? -(num*num) / (4.0f * denB) + sm_hist[prev] : -INFINITY; + yval[bin] = predicate ? -(num*num) / (4.0f * denB) + hist[prev] : -INFINITY; } __syncthreads(); @@ -229,19 +195,19 @@ void ori_par( const int octave, // All threads retrieve the yval of thread 0, the largest // of all yvals. - const float best_val = yval[best_index.x]; - const float yval_ref = 0.8f * popsift::shuffle( best_val, 0 ); - const bool valid = ( best_val >= yval_ref ); + const bool valid = ( yval[best_index.x] > 0 ); bool written = false; - Extremum* ext = &dobuf.extrema[ext_ct_prefix_sum + extremum_index]; + Extremum* ext = &buf->extrema[ext_ct_prefix_sum + extremum_index]; if( threadIdx.x < ORIENTATION_MAX_COUNT ) { if( valid ) { float chosen_bin = refined_angle[best_index.x]; if( chosen_bin >= ORI_NBINS ) chosen_bin -= ORI_NBINS; - // float th = __fdividef(M_PI2 * chosen_bin , ORI_NBINS) - M_PI; - float th = ::fmaf( M_PI2 * chosen_bin, 1.0f/ORI_NBINS, - M_PI ); + if( chosen_bin < 0 ) chosen_bin += ORI_NBINS; + float th = __fdividef(M_PI2 * chosen_bin , ORI_NBINS); // - M_PI; + th -= M_PI; + if( th < 0.0f ) th += M_PI2; ext->orientation[threadIdx.x] = th; written = true; } @@ -249,11 +215,11 @@ void ori_par( const int octave, int angles = __popc( popsift::ballot( written ) ); if( threadIdx.x == 0 ) { - ext->xpos = iext->xpos; - ext->ypos = iext->ypos; - ext->lpos = iext->lpos; - ext->sigma = iext->sigma; - ext->octave = octave; + ext->xpos = iext->getX(); + ext->ypos = iext->getY(); + ext->lpos = iext->getLevel(); + ext->sigma = iext->getSigma(); + ext->octave = iext->getOctave(); ext->num_ori = angles; } } @@ -318,78 +284,52 @@ public: }; __global__ -void ori_prefix_sum( const int total_ext_ct, const int num_octaves ) +void ori_prefix_sum( ExtremaCounters* ct, ExtremaBuffers* buf ) { int total_ori = 0; - Extremum* extremum = dobuf.extrema; - int* feat_to_ext_map = dobuf.feat_to_ext_map; + Extremum* extremum = buf->extrema; + int* feat_to_ext_map = buf->feat_to_ext_map; ExtremaRead r( extremum ); ExtremaWrt w( extremum ); ExtremaTot t( total_ori ); - ExtremaWrtMap wrtm( feat_to_ext_map, max( d_consts.max_orientations, dbuf.ori_allocated ) ); - ExclusivePrefixSum::Block( total_ext_ct, r, w, t, wrtm ); + ExtremaWrtMap wrtm( feat_to_ext_map, max( d_consts.max_orientations, buf->ori_allocated ) ); + ExclusivePrefixSum::Block( ct->getTotalExtrema(), r, w, t, wrtm ); __syncthreads(); if( threadIdx.x == 0 && threadIdx.y == 0 ) { - dct.ext_ps[0] = 0; - for( int o=1; omake_extrema_prefix_sums( ); for( int o=0; oext_ct[o] == 0 ) { + ct->ori_ct[o] = 0; } else { - int fe = dct.ext_ps[o ]; /* first extremum for this octave */ - int le = dct.ext_ps[o+1]-1; /* last extremum for this octave */ - int lo_ori_index = dobuf.extrema[fe].idx_ori; - int num_ori = dobuf.extrema[le].num_ori; - int hi_ori_index = dobuf.extrema[le].idx_ori + num_ori; - dct.ori_ct[o] = hi_ori_index - lo_ori_index; + int fe = ct->getExtremaBase(o ); /* first extremum for this octave */ + int le = ct->getExtremaBase(o+1)-1; /* last extremum for this octave */ + int lo_ori_index = buf->extrema[fe].idx_ori; + int num_ori = buf->extrema[le].num_ori; + int hi_ori_index = buf->extrema[le].idx_ori + num_ori; + ct->ori_ct[o] = hi_ori_index - lo_ori_index; } } - dct.ori_ps[0] = 0; - for( int o=1; omake_orientation_prefix_sums( ); } } __host__ void Pyramid::orientation( const Config& conf ) { - readDescCountersFromDevice( ); - - int ext_total = 0; - for(int o : hct.ext_ct) - { - if( o > 0 ) - { - ext_total += o; - } - } - // Filter functions are only called if necessary. They are very expensive, // therefore add 10% slack. - if( conf.getFilterMaxExtrema() > 0 && int(conf.getFilterMaxExtrema()*1.1) < ext_total ) + if( conf.getFilterMaxExtrema() > 0 && int(conf.getFilterMaxExtrema()*1.1) < _ct->getTotalExtrema() ) { - ext_total = extrema_filter_grid( conf, ext_total ); + _fg.filter( conf, _ct, _buf ); } - reallocExtrema( ext_total ); - - int ext_ct_prefix_sum = 0; - for( int octave=0; octave<_num_octaves; octave++ ) { - hct.ext_ps[octave] = ext_ct_prefix_sum; - ext_ct_prefix_sum += hct.ext_ct[octave]; - } - hct.ext_total = ext_ct_prefix_sum; + reallocExtrema( _ct->getTotalExtrema() ); cudaStream_t oct_0_str = _octaves[0].getStream(); @@ -400,7 +340,7 @@ void Pyramid::orientation( const Config& conf ) cudaStream_t oct_str = oct_obj.getStream(); - int num = hct.ext_ct[octave]; + int num = _ct->ext_ct[octave]; if( num > 0 ) { dim3 block; @@ -413,7 +353,8 @@ void Pyramid::orientation( const Config& conf ) ori_par <<>> ( octave, - hct.ext_ps[octave], + _ct, + _buf, oct_obj.getDataTexPoint( ), oct_obj.getWidth( ), oct_obj.getHeight( ) ); @@ -434,7 +375,7 @@ void Pyramid::orientation( const Config& conf ) grid.x = 1; ori_prefix_sum <<>> - ( ext_ct_prefix_sum, _num_octaves ); + ( _ct, _buf ); POP_SYNC_CHK; cudaDeviceSynchronize(); diff --git a/src/popsift/s_pyramid_build.cu b/src/popsift/s_pyramid_build.cu index 8873ca5c..c8a3a3ee 100755 --- a/src/popsift/s_pyramid_build.cu +++ b/src/popsift/s_pyramid_build.cu @@ -279,7 +279,7 @@ inline void Pyramid::horiz_from_prev_level( int octave, int level, cudaStream_t grid.x = grid_divide( width, 32 ); grid.y = grid_divide( height, block.y ); - gauss::absoluteSource::horiz + as_horiz <<>> ( oct_obj.getDataTexPoint( ), oct_obj.getIntermediateSurface( ), @@ -345,7 +345,7 @@ inline void Pyramid::vert_from_interm( int octave, int level, cudaStream_t strea grid.x = (unsigned int)grid_divide( width, block.x ); grid.y = (unsigned int)grid_divide( height, block.y ); - gauss::absoluteSource::vert + as_vert <<>> ( oct_obj.getIntermDataTexPoint( ), oct_obj.getDataSurface( ), diff --git a/src/popsift/s_pyramid_build_aa.cu b/src/popsift/s_pyramid_build_aa.cu index c026a8b7..bc8382ed 100755 --- a/src/popsift/s_pyramid_build_aa.cu +++ b/src/popsift/s_pyramid_build_aa.cu @@ -10,24 +10,20 @@ #include "s_pyramid_build_aa.h" #include "sift_constants.h" -namespace popsift { -namespace gauss { -namespace absoluteSource { - -__global__ void horiz(cudaTextureObject_t src_point_texture, cudaSurfaceObject_t dst_data, int dst_level) +__global__ void as_horiz(cudaTextureObject_t src_point_texture, cudaSurfaceObject_t dst_data, int dst_level) { const int src_level = dst_level - 1; - const int span = d_gauss.inc.span[dst_level]; - const float* filter = &d_gauss.inc.filter[dst_level*GAUSS_ALIGN]; + const int span = popsift::d_gauss.inc.span[dst_level]; + const float* filter = &popsift::d_gauss.inc.filter[dst_level*GAUSS_ALIGN]; const int off_x = blockIdx.x * blockDim.x + threadIdx.x; const int off_y = blockIdx.y * blockDim.y + threadIdx.y; float out = 0.0f; - float A = readTex( src_point_texture, off_x - span, off_y, src_level ); - float B = readTex( src_point_texture, off_x + span, off_y, src_level ); - float C = readTex( src_point_texture, off_x , off_y, src_level ); + float A = popsift::readTex( src_point_texture, off_x - span, off_y, src_level ); + float B = popsift::readTex( src_point_texture, off_x + span, off_y, src_level ); + float C = popsift::readTex( src_point_texture, off_x , off_y, src_level ); float g = filter[0]; out += C * g; g = filter[span]; @@ -49,10 +45,10 @@ __global__ void horiz(cudaTextureObject_t src_point_texture, cudaSurfaceObject_t surf2DLayeredwrite( out, dst_data, off_x*4, off_y, dst_level, cudaBoundaryModeZero ); } -__global__ void vert(cudaTextureObject_t src_point_texture, cudaSurfaceObject_t dst_data, int dst_level) +__global__ void as_vert(cudaTextureObject_t src_point_texture, cudaSurfaceObject_t dst_data, int dst_level) { - const int span = d_gauss.inc.span[dst_level]; - const float* filter = &d_gauss.inc.filter[dst_level*GAUSS_ALIGN]; + const int span = popsift::d_gauss.inc.span[dst_level]; + const float* filter = &popsift::d_gauss.inc.filter[dst_level*GAUSS_ALIGN]; int block_x = blockIdx.x * blockDim.x; int block_y = blockIdx.y * blockDim.y; int idx = threadIdx.x; @@ -66,17 +62,17 @@ __global__ void vert(cudaTextureObject_t src_point_texture, cudaSurfaceObject_t g = filter[offset]; idy = threadIdx.y - offset; - val = readTex( src_point_texture, block_x + idx, block_y + idy, dst_level ); + val = popsift::readTex( src_point_texture, block_x + idx, block_y + idy, dst_level ); out += ( val * g ); idy = threadIdx.y + offset; - val = readTex( src_point_texture, block_x + idx, block_y + idy, dst_level ); + val = popsift::readTex( src_point_texture, block_x + idx, block_y + idy, dst_level ); out += ( val * g ); } g = filter[0]; idy = threadIdx.y; - val = readTex( src_point_texture, block_x + idx, block_y + idy, dst_level ); + val = popsift::readTex( src_point_texture, block_x + idx, block_y + idy, dst_level ); out += ( val * g ); idx = block_x+threadIdx.x; @@ -85,6 +81,10 @@ __global__ void vert(cudaTextureObject_t src_point_texture, cudaSurfaceObject_t surf2DLayeredwrite( out, dst_data, idx*4, idy, dst_level, cudaBoundaryModeZero ); } +namespace popsift { +namespace gauss { +namespace absoluteSource { + __global__ void vert_abs0(cudaTextureObject_t src_point_texture, cudaSurfaceObject_t dst_data, int dst_level) { const int span = d_gauss.abs_o0.span[dst_level]; diff --git a/src/popsift/s_pyramid_build_aa.h b/src/popsift/s_pyramid_build_aa.h index 4d3423cf..c09d0ef2 100755 --- a/src/popsift/s_pyramid_build_aa.h +++ b/src/popsift/s_pyramid_build_aa.h @@ -7,14 +7,14 @@ */ #include "common/plane_2d.h" +__global__ void as_horiz(cudaTextureObject_t src_point_texture, cudaSurfaceObject_t dst_data, int dst_level); + +__global__ void as_vert(cudaTextureObject_t src_point_texture, cudaSurfaceObject_t dst_data, int dst_level); + namespace popsift { namespace gauss { namespace absoluteSource { -__global__ void horiz(cudaTextureObject_t src_point_texture, cudaSurfaceObject_t dst_data, int dst_level); - -__global__ void vert(cudaTextureObject_t src_point_texture, cudaSurfaceObject_t dst_data, int dst_level); - __global__ void vert_abs0(cudaTextureObject_t src_point_texture, cudaSurfaceObject_t dst_data, int dst_level); __global__ void vert_all_abs0(cudaTextureObject_t src_point_texture, diff --git a/src/popsift/sift_conf.cu b/src/popsift/sift_conf.cu index 251f58ff..8b944156 100644 --- a/src/popsift/sift_conf.cu +++ b/src/popsift/sift_conf.cu @@ -9,9 +9,18 @@ #include "sift_conf.h" #include +#include +#include using namespace std; +static bool stringIsame( string l, string r ) +{ + std::for_each( l.begin(), l.end(), [](char& c) { c = ::tolower(c); }); + std::for_each( r.begin(), r.end(), [](char& c) { c = ::tolower(c); }); + return l == r; +} + namespace popsift { @@ -20,19 +29,16 @@ Config::Config( ) , octaves( -1 ) , levels( 3 ) , sigma( 1.6f ) - , _edge_limit( 10.0f ) - , _threshold( 0.04 ) // ( 10.0f / 256.0f ) + , _edge_limit( getEdgeThreshDefault( ) ) + , _threshold( getPeakThreshDefault() ) , _gauss_mode( getGaussModeDefault() ) , _sift_mode( Config::PopSift ) , _log_mode( Config::None ) , _scaling_mode( Config::ScaleDefault ) , _desc_mode( Config::Loop ) - , _grid_filter_mode( Config::RandomScale ) , verbose( false ) // , _max_extrema( 20000 ) , _max_extrema( 100000 ) - , _filter_max_extrema( -1 ) - , _filter_grid_size( 2 ) , _assume_initial_blur( true ) , _initial_blur( 0.5f ) , _normalization_mode( getNormModeDefault() ) @@ -72,6 +78,8 @@ void Config::setDescMode( const std::string& text ) setDescMode( Config::IGrid ); else if( text == "notile" ) setDescMode( Config::NoTile ); + else if( text == "vlfeat" ) + setDescMode( Config::VLFeat_Desc ); else POP_FATAL( "specified descriptor extraction mode must be one of loop, grid or igrid" ); } @@ -81,6 +89,25 @@ void Config::setDescMode( Config::DescMode m ) _desc_mode = m; } +const char* Config::getDescModeUsage( ) +{ + return "Choice of descriptor extraction modes:\n" + "loop, iloop, grid, igrid, notile, vlfeat\n" + "Default is loop\n" + "loop is OpenCV-like horizontal scanning, sampling every pixel in a radius around the " + "centers or the 16 tiles arond the keypoint. Each sampled point contributes to two " + "histogram bins." + "iloop is like loop but samples all constant 1-pixel distances from the keypoint, " + "using the CUDA texture engine for interpolation. " + "grid is like loop but works on rotated, normalized tiles, relying on CUDA 2D cache " + "to replace the manual data aligment idea of loop. " + "igrid iloop and grid. " + "notile is like igrid but handles all 16 tiles at once.\n" + "vlfeat is VLFeat-like horizontal scanning, sampling every pixel in a radius around " + "keypoint itself, using the 16 tile centers only for weighting. Every sampled point " + "contributes to up to eight historgram bins."; +} + void Config::setGaussMode( const std::string& m ) { if( m == "vlfeat" ) @@ -120,6 +147,16 @@ const char* Config::getGaussModeUsage( ) "relative (synonym for vlfeat-hw-interpolated)"; } +void Config::setFilterMaxExtrema( int ext ) +{ + _grid_filter._max_extrema = ext; +} + +void Config::setFilterGridSize( int sz ) +{ + _grid_filter._size = sz; +} + bool Config::getCanFilterExtrema() const { #if __CUDACC_VER_MAJOR__ >= 8 @@ -129,21 +166,36 @@ bool Config::getCanFilterExtrema() const #endif } +const char* Config::getFilterGridModeUsage( ) +{ + return "l/largest (default) or s/smallest. " + "largest: keep the extrema with the largest scales from each grid-filtered cell. " + "smallest: keep the extrema with the smallest scales. " + "The first octave has the smallest scales."; +} + +GridFilterConfig::Mode Config::getFilterGridModeDefault( ) +{ + return GridFilterConfig::LargestScaleFirst; +} + void Config::setFilterSorting( const std::string& text ) { - if( text == "up" ) - _grid_filter_mode = Config::SmallestScaleFirst; - else if( text == "down" ) - _grid_filter_mode = Config::LargestScaleFirst; - else if( text == "random" ) - _grid_filter_mode = Config::RandomScale; + if( stringIsame( text, "smallest" ) ) + _grid_filter._mode = GridFilterConfig::SmallestScaleFirst; + if( stringIsame( text, "s" ) ) + _grid_filter._mode = GridFilterConfig::SmallestScaleFirst; + else if( stringIsame( text, "largest" ) ) + _grid_filter._mode = GridFilterConfig::LargestScaleFirst; + else if( stringIsame( text, "l" ) ) + _grid_filter._mode = GridFilterConfig::LargestScaleFirst; else - POP_FATAL( "filter sorting mode must be one of up, down or random" ); + POP_FATAL( string("bad grid filter mode, ") + getFilterGridModeUsage() ); } -void Config::setFilterSorting( Config::GridFilterMode m ) +void Config::setFilterSorting( GridFilterConfig::Mode m ) { - _grid_filter_mode = m; + _grid_filter._mode = m; } void Config::setVerbose( bool on ) @@ -196,15 +248,25 @@ void Config::setNormMode( Config::NormMode m ) void Config::setNormMode( const std::string& m ) { - if( m == "RootSift" ) setNormMode( Config::RootSift ); - else if( m == "classic" ) setNormMode( Config::Classic ); + if( stringIsame( m, "RootSift" ) ) + { + setNormMode( Config::RootSift ); + } + else if( stringIsame( m, "L2" ) ) + { + setNormMode( Config::Classic ); + } + else if( stringIsame( m, "Classic" ) ) + { + setNormMode( Config::Classic ); + } else POP_FATAL( string("Bad Normalization mode.\n") + getGaussModeUsage() ); } Config::NormMode Config::getNormModeDefault( ) { - return Config::RootSift; + return Config::NormDefault; } const char* Config::getNormModeUsage( ) @@ -232,15 +294,54 @@ int Config::getNormalizationMultiplier( ) const return _normalization_multiplier; } -void Config::setDownsampling( float v ) { _upscale_factor = -v; } +void Config::setDownsampling( float v ) { _upscale_factor = -v; } +float Config::getDownsampling( ) const { return -_upscale_factor; } + void Config::setOctaves( int v ) { octaves = v; } +int Config::getOctaves( ) const { return octaves; } + void Config::setLevels( int v ) { levels = v; } -void Config::setSigma( float v ) { sigma = v; } -void Config::setEdgeLimit( float v ) { _edge_limit = v; } -void Config::setThreshold( float v ) { _threshold = v; } +int Config::getLevels( ) const { return levels; } + +void Config::setSigma( float v ) { sigma = v; } +float Config::getSigma( ) const { return sigma; } + +void Config::setEdgeLimit( float v ) { _edge_limit = v; } +float Config::getEdgeLimit( ) const { return _edge_limit; } +float Config::getEdgeThreshDefault( ) +{ + return 10.0f; +} +std::string Config::getEdgeThreshUsage( ) +{ + std::ostringstream ostr; + ostr << "Edge Threshold: eliminates peaks of the DoG scale space whose curvature is too small." << endl + << "Default: " << getEdgeThreshDefault() << endl + << "Set to a value <= 0 to disable." << endl; + return ostr.str(); +} + +void Config::setThreshold( float v ) { _threshold = v; } +float Config::getThreshold( ) const { return _threshold; } +float Config::getPeakThreshold() const +{ + return ( _threshold * 0.5f * 255.0f / levels ); +} +float Config::getPeakThreshDefault( ) +{ + return 0.04f; // ( 10.0f / 256.0f ) +} +std::string Config::getPeakThreshUsage( ) +{ + std::ostringstream ostr; + ostr << "Peak Threshold: eliminates peaks of the DoG scale space that are too small" < +#include "sift_config.h" + #ifndef INF #define INF (1<<29) #endif diff --git a/src/popsift/sift_desc.cu b/src/popsift/sift_desc.cu index b0eb0bd1..ecda3571 100644 --- a/src/popsift/sift_desc.cu +++ b/src/popsift/sift_desc.cu @@ -13,6 +13,7 @@ #include "s_desc_loop.h" #include "s_desc_normalize.h" #include "s_desc_notile.h" +#include "s_desc_vlfeat.h" #include "s_gradiant.h" #include "sift_config.h" #include "sift_constants.h" @@ -64,19 +65,21 @@ void Pyramid::descriptors( const Config& conf ) for( int octave=_num_octaves-1; octave>=0; octave-- ) // for( int octave=0; octave<_num_octaves; octave++ ) { - if( hct.ori_ct[octave] != 0 ) { + if( _ct->ori_ct[octave] != 0 ) { Octave& oct_obj = _octaves[octave]; if( conf.getDescMode() == Config::Loop ) { - start_ext_desc_loop( octave, oct_obj ); + start_ext_desc_loop( _ct, _buf, octave, oct_obj ); } else if( conf.getDescMode() == Config::ILoop ) { - start_ext_desc_iloop( octave, oct_obj ); + start_ext_desc_iloop( _ct, _buf, octave, oct_obj ); } else if( conf.getDescMode() == Config::Grid ) { - start_ext_desc_grid( octave, oct_obj ); + start_ext_desc_grid( _ct, _buf, octave, oct_obj ); } else if( conf.getDescMode() == Config::IGrid ) { - start_ext_desc_igrid( octave, oct_obj ); + start_ext_desc_igrid( _ct, _buf, octave, oct_obj ); } else if( conf.getDescMode() == Config::NoTile ) { - start_ext_desc_notile( octave, oct_obj ); + start_ext_desc_notile( _ct, _buf, octave, oct_obj ); + } else if( conf.getDescMode() == Config::VLFeat_Desc ) { + start_ext_desc_vlfeat( _ct, _buf, octave, oct_obj ); } else { POP_FATAL( "not yet" ); } @@ -85,24 +88,31 @@ void Pyramid::descriptors( const Config& conf ) } } - if( hct.ori_total == 0 ) + if( _ct->getTotalOrientations() == 0 ) { cerr << "Warning: no descriptors extracted" << endl; - return; + return; } dim3 block; - dim3 grid; - grid.x = grid_divide( hct.ori_total, 32 ); block.x = 32; block.y = 32; block.z = 1; + dim3 grid; + grid.x = grid_divide( _ct->getTotalOrientations(), block.y ); + if( conf.getUseRootSift() ) { - normalize_histogram <<>> ( ); + normalize_histogram + <<>> + ( _buf->desc, + _ct->getTotalOrientations() ); POP_SYNC_CHK; } else { - normalize_histogram <<>> ( ); + normalize_histogram + <<>> + ( _buf->desc, + _ct->getTotalOrientations() ); POP_SYNC_CHK; } diff --git a/src/popsift/sift_extremum.h b/src/popsift/sift_extremum.h index 0363a02e..138ef865 100755 --- a/src/popsift/sift_extremum.h +++ b/src/popsift/sift_extremum.h @@ -9,35 +9,11 @@ #include "sift_constants.h" -#include -#include +// #include +// #include namespace popsift { -/** - * @brief This is an internal data structure. - * Separated from the final Extremum data structure to implement - * grid filtering in a space-efficient manner. In grid filtering, - * extrema are first found, after that some may be discarded in - * some spatial regions of the image. Avoid waste of space by - * allocating Extremum structures only for the remaining ones. - */ -struct InitialExtremum -{ - float xpos; - float ypos; - /// extremum refined into this level - int lpos; - /// scale - float sigma; - /// index into the grid for grid-based extrema filtering - int cell; - /// true if this extremum has been filtered - bool ignore; - /// if any initial extrema are ignored, new index for Extremum - int write_index; -}; - /** * @brief This is an internal data structure. * For performance reasons, it would be appropriate to split diff --git a/src/popsift/sift_pyramid.cu b/src/popsift/sift_pyramid.cu index 06060052..9547028a 100644 --- a/src/popsift/sift_pyramid.cu +++ b/src/popsift/sift_pyramid.cu @@ -38,16 +38,6 @@ using namespace std; namespace popsift { -__device__ ExtremaCounters dct; -thread_local ExtremaCounters hct; - -__device__ ExtremaBuffers dbuf; -thread_local ExtremaBuffers dbuf_shadow; // just for managing memories -thread_local ExtremaBuffers hbuf; - -__device__ DevBuffers dobuf; -thread_local DevBuffers dobuf_shadow; // just for managing memories - __global__ void py_print_corner_float(float* img, uint32_t pitch, uint32_t height, uint32_t level) { @@ -112,17 +102,15 @@ Pyramid::Pyramid( const Config& config, , _levels( config.levels + 3 ) , _assume_initial_blur( config.hasInitialBlur() ) , _initial_blur( config.getInitialBlur() ) + , _fg( config ) { _octaves = new Octave[_num_octaves]; int w = width; int h = height; - memset( &hct, 0, sizeof(ExtremaCounters) ); - cudaMemcpyToSymbol( dct, &hct, sizeof(ExtremaCounters), 0, cudaMemcpyHostToDevice ); - - memset( &hbuf, 0, sizeof(ExtremaBuffers) ); - memset( &dbuf_shadow, 0, sizeof(ExtremaBuffers) ); + _ct = popsift::cuda::malloc_mgdT( 1, __FILE__, __LINE__ ); + memset( _ct, 0, sizeof(ExtremaCounters) ); _d_extrema_num_blocks = popsift::cuda::malloc_devT( _num_octaves, __FILE__, __LINE__ ); @@ -133,33 +121,8 @@ Pyramid::Pyramid( const Config& config, h = ceilf(h / 2.0f); } - int sz = _num_octaves * h_consts.max_extrema; - dobuf_shadow.i_ext_dat[0] = popsift::cuda::malloc_devT( sz, __FILE__, __LINE__); - dobuf_shadow.i_ext_off[0] = popsift::cuda::malloc_devT( sz, __FILE__, __LINE__); - for (int o = 1; o<_num_octaves; o++) { - dobuf_shadow.i_ext_dat[o] = dobuf_shadow.i_ext_dat[0] + (o*h_consts.max_extrema); - dobuf_shadow.i_ext_off[o] = dobuf_shadow.i_ext_off[0] + (o*h_consts.max_extrema); - } - for (int o = _num_octaves; o( sz, __FILE__, __LINE__); - dobuf_shadow.features = popsift::cuda::malloc_devT( sz, __FILE__, __LINE__); - hbuf .ext_allocated = sz; - dbuf_shadow.ext_allocated = sz; - - sz = max( 2 * h_consts.max_extrema, h_consts.max_orientations ); - hbuf .desc = popsift::cuda::malloc_hstT( sz, __FILE__, __LINE__); - dbuf_shadow.desc = popsift::cuda::malloc_devT( sz, __FILE__, __LINE__); - dobuf_shadow.feat_to_ext_map = popsift::cuda::malloc_devT( sz, __FILE__, __LINE__); - hbuf .ori_allocated = sz; - dbuf_shadow.ori_allocated = sz; - - cudaMemcpyToSymbol( dbuf, &dbuf_shadow, sizeof(ExtremaBuffers), 0, cudaMemcpyHostToDevice ); - cudaMemcpyToSymbol( dobuf, &dobuf_shadow, sizeof(DevBuffers), 0, cudaMemcpyHostToDevice ); + _buf = popsift::cuda::malloc_mgdT( 1, __FILE__, __LINE__ ); + _buf->init( _num_octaves, h_consts.max_extrema, h_consts.max_orientations ); cudaStreamCreate( &_download_stream ); } @@ -178,48 +141,19 @@ void Pyramid::resetDimensions( const Config& conf, int width, int height ) void Pyramid::reallocExtrema( int numExtrema ) { - if( numExtrema > hbuf.ext_allocated ) { - numExtrema = ( ( numExtrema + 1024 ) & ( ~(1024-1) ) ); - cudaFree( dobuf_shadow.extrema ); - cudaFree( dobuf_shadow.features ); - - int sz = numExtrema; - dobuf_shadow.extrema = popsift::cuda::malloc_devT( sz, __FILE__, __LINE__); - dobuf_shadow.features = popsift::cuda::malloc_devT( sz, __FILE__, __LINE__); - hbuf .ext_allocated = sz; - dbuf_shadow.ext_allocated = sz; - - numExtrema *= 2; - if( numExtrema > hbuf.ori_allocated ) { - cudaFreeHost( hbuf .desc ); - cudaFree( dbuf_shadow.desc ); - cudaFree( dobuf_shadow.feat_to_ext_map ); - - sz = numExtrema; - hbuf .desc = popsift::cuda::malloc_hstT( sz, __FILE__, __LINE__); - dbuf_shadow.desc = popsift::cuda::malloc_devT( sz, __FILE__, __LINE__); - dobuf_shadow.feat_to_ext_map = popsift::cuda::malloc_devT( sz, __FILE__, __LINE__); - hbuf .ori_allocated = sz; - dbuf_shadow.ori_allocated = sz; - } - - cudaMemcpyToSymbol( dbuf, &dbuf_shadow, sizeof(ExtremaBuffers), 0, cudaMemcpyHostToDevice ); - cudaMemcpyToSymbol( dobuf, &dobuf_shadow, sizeof(DevBuffers), 0, cudaMemcpyHostToDevice ); - } + _buf->growBuffers( numExtrema ); } Pyramid::~Pyramid() { cudaStreamDestroy( _download_stream ); - cudaFree( _d_extrema_num_blocks ); - cudaFree( dobuf_shadow.i_ext_dat[0] ); - cudaFree( dobuf_shadow.i_ext_off[0] ); - cudaFree( dobuf_shadow.features ); - cudaFree( dobuf_shadow.extrema ); - cudaFreeHost( hbuf .desc ); - cudaFree( dbuf_shadow .desc ); - cudaFree( dobuf_shadow.feat_to_ext_map ); + _buf->uninit( ); + cudaFree( _buf ); + + cudaFree( _d_extrema_num_blocks ); + + cudaFree( _ct ); delete[] _octaves; } @@ -234,6 +168,8 @@ void Pyramid::step2( const Config& conf ) { find_extrema( conf ); + finalize_extrema( ); + orientation( conf ); descriptors( conf ); @@ -248,12 +184,12 @@ void Pyramid::step2( const Config& conf ) * GPUs are compatible. */ __global__ -void prep_features( Descriptor* descriptor_base, int up_fac ) +void prep_features( Descriptor* descriptor_base, ExtremaCounters* ct, ExtremaBuffers* buf, int up_fac ) { int offset = blockIdx.x * 32 + threadIdx.x; - if( offset >= dct.ext_total ) return; - const Extremum& ext = dobuf.extrema [offset]; - Feature& fet = dobuf.features[offset]; + if( offset >= ct->getTotalExtrema() ) return; + const Extremum& ext = buf->extrema [offset]; + Feature& fet = buf->features[offset]; const int octave = ext.octave; const float xpos = ext.xpos * powf(2.0f, float(octave - up_fac)); @@ -286,30 +222,30 @@ FeaturesHost* Pyramid::get_descriptors( const Config& conf ) readDescCountersFromDevice(); nvtxRangePushA( "download descriptors" ); - FeaturesHost* features = new FeaturesHost( hct.ext_total, hct.ori_total ); + FeaturesHost* features = new FeaturesHost( _ct->getTotalExtrema(), _ct->getTotalOrientations() ); - if( hct.ext_total == 0 || hct.ori_total == 0 ) + if( _ct->getTotalExtrema() == 0 || _ct->getTotalOrientations() == 0 ) { nvtxRangePop(); return features; } - dim3 grid( grid_divide( hct.ext_total, 32 ) ); - prep_features<<>>( features->getDescriptors(), up_fac ); + dim3 grid( grid_divide( _ct->getTotalExtrema(), 32 ) ); + prep_features<<>>( features->getDescriptors(), _ct, _buf, up_fac ); POP_SYNC_CHK; nvtxRangePushA( "register host memory" ); features->pin( ); nvtxRangePop(); popcuda_memcpy_async( features->getFeatures(), - dobuf_shadow.features, - hct.ext_total * sizeof(Feature), + _buf->features, + _ct->getTotalExtrema() * sizeof(Feature), cudaMemcpyDeviceToHost, _download_stream ); popcuda_memcpy_async( features->getDescriptors(), - dbuf_shadow.desc, - hct.ori_total * sizeof(Descriptor), + _buf->desc, + _ct->getTotalOrientations() * sizeof(Descriptor), cudaMemcpyDeviceToHost, _download_stream ); cudaStreamSynchronize( _download_stream ); @@ -325,25 +261,25 @@ void Pyramid::clone_device_descriptors_sub( const Config& conf, FeaturesDev* fea { const float up_fac = conf.getUpscaleFactor(); - dim3 grid( grid_divide( hct.ext_total, 32 ) ); - prep_features<<>>( features->getDescriptors(), up_fac ); + dim3 grid( grid_divide( _ct->getTotalExtrema(), 32 ) ); + prep_features<<>>( features->getDescriptors(), _ct, _buf, up_fac ); POP_SYNC_CHK; popcuda_memcpy_async( features->getFeatures(), - dobuf_shadow.features, - hct.ext_total * sizeof(Feature), + _buf->features, + _ct->getTotalExtrema() * sizeof(Feature), cudaMemcpyDeviceToDevice, _download_stream ); popcuda_memcpy_async( features->getDescriptors(), - dbuf_shadow.desc, - hct.ori_total * sizeof(Descriptor), + _buf->desc, + _ct->getTotalOrientations() * sizeof(Descriptor), cudaMemcpyDeviceToDevice, _download_stream ); popcuda_memcpy_async( features->getReverseMap(), - dobuf_shadow.feat_to_ext_map, - hct.ori_total * sizeof(int), + _buf->feat_to_ext_map, + _ct->getTotalOrientations() * sizeof(int), cudaMemcpyDeviceToDevice, _download_stream ); } @@ -352,7 +288,7 @@ FeaturesDev* Pyramid::clone_device_descriptors( const Config& conf ) { readDescCountersFromDevice(); - FeaturesDev* features = new FeaturesDev( hct.ext_total, hct.ori_total ); + FeaturesDev* features = new FeaturesDev( _ct->getTotalExtrema(), _ct->getTotalOrientations() ); clone_device_descriptors_sub( conf, features ); @@ -363,8 +299,7 @@ FeaturesDev* Pyramid::clone_device_descriptors( const Config& conf ) void Pyramid::reset_extrema_mgmt() { - memset( &hct, 0, sizeof(ExtremaCounters) ); - cudaMemcpyToSymbol( dct, &hct, sizeof(ExtremaCounters), 0, cudaMemcpyHostToDevice ); + memset( _ct, 0, sizeof(ExtremaCounters) ); popcuda_memset_sync( _d_extrema_num_blocks, 0, _num_octaves * sizeof(int) ); @@ -372,22 +307,24 @@ void Pyramid::reset_extrema_mgmt() void Pyramid::readDescCountersFromDevice( ) { - cudaMemcpyFromSymbol( &hct, dct, sizeof(ExtremaCounters), 0, cudaMemcpyDeviceToHost ); + // cudaMemcpyFromSymbol( &hct, dct, sizeof(ExtremaCounters), 0, cudaMemcpyDeviceToHost ); + // we must not copy mgmt memory explicitly, just wait for the device driver + cudaDeviceSynchronize(); } void Pyramid::readDescCountersFromDevice( cudaStream_t s ) { - cudaMemcpyFromSymbolAsync( &hct, dct, sizeof(ExtremaCounters), 0, cudaMemcpyDeviceToHost, s ); + cudaStreamSynchronize( s ); } void Pyramid::writeDescCountersToDevice( ) { - cudaMemcpyToSymbol( dct, &hct, sizeof(ExtremaCounters), 0, cudaMemcpyHostToDevice ); + cudaDeviceSynchronize(); } void Pyramid::writeDescCountersToDevice( cudaStream_t s ) { - cudaMemcpyToSymbolAsync( dct, &hct, sizeof(ExtremaCounters), 0, cudaMemcpyHostToDevice, s ); + cudaStreamSynchronize( s ); } int* Pyramid::getNumberOfBlocks( int octave ) @@ -404,7 +341,7 @@ void Pyramid::writeDescriptor( const Config& conf, ostream& ostr, FeaturesHost* const float up_fac = conf.getUpscaleFactor(); - for( int ext_idx = 0; ext_idxgetTotalExtrema(); ext_idx++ ) { const Feature& ext = features->getFeatures()[ext_idx]; const int octave = ext.debug_octave; const float xpos = ext.xpos * pow(2.0f, octave - up_fac); @@ -417,7 +354,7 @@ void Pyramid::writeDescriptor( const Config& conf, ostream& ostr, FeaturesHost* dom_ori = dom_ori / M_PI2 * 360; if (dom_ori < 0) dom_ori += 360; - const Descriptor& desc = *ext.desc[ori]; // hbuf.desc[ori_idx]; + const Descriptor& desc = *ext.desc[ori]; if( with_orientation ) ostr << setprecision(5) @@ -443,5 +380,60 @@ void Pyramid::writeDescriptor( const Config& conf, ostream& ostr, FeaturesHost* } } +void ExtremaBuffers::init( int num_octaves, int max_extrema, int max_orientations ) +{ + memset( this, 0, sizeof(ExtremaBuffers) ); + + const int sz = num_octaves * max_extrema; + i_ext_dat[0] = popsift::cuda::malloc_mgdT( sz, __FILE__, __LINE__); + i_ext_off[0] = popsift::cuda::malloc_mgdT( sz, __FILE__, __LINE__); + for (int o = 1; o < num_octaves; o++) { + i_ext_dat[o] = i_ext_dat[0] + (o*max_extrema); + i_ext_off[o] = i_ext_off[0] + (o*max_extrema); + } + + extrema = popsift::cuda::malloc_mgdT( max_extrema, __FILE__, __LINE__); + features = popsift::cuda::malloc_mgdT ( max_extrema, __FILE__, __LINE__); + ext_allocated = max_extrema; + + const int osz = max( 2 * max_extrema, max_orientations ); + desc = popsift::cuda::malloc_mgdT( osz, __FILE__, __LINE__); + feat_to_ext_map = popsift::cuda::malloc_mgdT ( osz, __FILE__, __LINE__); + ori_allocated = osz; +} + +void ExtremaBuffers::uninit( ) +{ + cudaFree( feat_to_ext_map ); + cudaFree( desc ); + cudaFree( features ); + cudaFree( extrema ); + cudaFree( i_ext_off[0] ); + cudaFree( i_ext_dat[0] ); +} + +void ExtremaBuffers::growBuffers( int numExtrema ) +{ + numExtrema = ( ( numExtrema + 1024 ) & ( ~(1024-1) ) ); + + if( numExtrema <= ext_allocated ) return; + + cudaFree( extrema ); + cudaFree( features ); + + extrema = popsift::cuda::malloc_mgdT( numExtrema, __FILE__, __LINE__); + features = popsift::cuda::malloc_mgdT ( numExtrema, __FILE__, __LINE__); + ext_allocated = numExtrema; + + numExtrema *= 2; + if( numExtrema > ori_allocated ) { + cudaFree( desc ); + cudaFree( feat_to_ext_map ); + + desc = popsift::cuda::malloc_mgdT ( numExtrema, __FILE__, __LINE__); + feat_to_ext_map = popsift::cuda::malloc_mgdT( numExtrema, __FILE__, __LINE__); + ori_allocated = numExtrema; + } +} } // namespace popsift diff --git a/src/popsift/sift_pyramid.h b/src/popsift/sift_pyramid.h index 837fc3b1..24045114 100755 --- a/src/popsift/sift_pyramid.h +++ b/src/popsift/sift_pyramid.h @@ -8,16 +8,21 @@ #pragma once #include "features.h" +#include "filtergrid.h" #include "s_image.h" #include "sift_conf.h" #include "sift_constants.h" #include "sift_octave.h" +#include "initial_extremum.h" #include #include namespace popsift { +/** Datastructure in managed memory that holds counters for + * initially collected extrema and orientations. + */ struct ExtremaCounters { /* The number of extrema found per octave */ @@ -25,38 +30,114 @@ struct ExtremaCounters /* The number of orientation found per octave */ int ori_ct[MAX_OCTAVES]; +private: /* Exclusive prefix sum of ext_ct */ - int ext_ps[MAX_OCTAVES]; + int ext_prefix_sum[MAX_OCTAVES+1]; /* Exclusive prefix sum of ori_ct */ - int ori_ps[MAX_OCTAVES]; + int ori_prefix_sum[MAX_OCTAVES+1]; - int ext_total; - int ori_total; +public: + /** host and device function helper function that updates the exclusive + * prefix sum for extrema counts, updates ext_total and returns the total + * number of extrema. + * Note that exclusive prefix sum on the device could use several threads + * but we are talking about a constant size of 20. + */ + __device__ __host__ inline + int make_extrema_prefix_sums( ) + { + ext_prefix_sum[0] = 0; + for( int o=1; o<=MAX_OCTAVES; o++ ) { + ext_prefix_sum[o] = ext_prefix_sum[o-1] + ext_ct[o-1]; + } + + return ext_prefix_sum[MAX_OCTAVES]; + } + + /** get total number of extrema */ + __device__ __host__ inline + int getTotalExtrema( ) const + { + return ext_prefix_sum[MAX_OCTAVES]; + } + + __device__ __host__ inline + int getExtremaCount( int octave ) const + { + return ext_ct[octave]; + } + + /** in a sorted array of extrema, get the base index for the entries + * of this octave's extrema */ + __device__ __host__ inline + int getExtremaBase( const int& octave ) const + { + return ext_prefix_sum[octave]; + } + + /** compute the prefix sum and total sum of orientation count per octave */ + __device__ __host__ inline + int make_orientation_prefix_sums( ) + { + ori_prefix_sum[0] = 0; + for( int o=1; o<=MAX_OCTAVES; o++ ) { + ori_prefix_sum[o] = ori_prefix_sum[o-1] + ori_ct[o-1]; + } + + return ori_prefix_sum[MAX_OCTAVES]; + } + + /** get total number of orientations */ + __device__ __host__ inline + int getTotalOrientations( ) const + { + return ori_prefix_sum[MAX_OCTAVES]; + } + + /** in a sorted array of orientations, get the base index for the entries + * of this octave's orientations */ + __device__ __host__ inline + int getOrientationBase( const int& octave ) const + { + return ori_prefix_sum[octave]; + } }; +/** Datastructure in managed memory that allows CPU and GPU to access all + * buffers. + */ struct ExtremaBuffers { + /* This part of the struct deals with the descriptors that are + * finally detected by the algorithm. + */ Descriptor* desc; int ext_allocated; int ori_allocated; -}; -struct DevBuffers -{ + /* This part of the struct deals with intermediate buffers to find + * extrema. + */ InitialExtremum* i_ext_dat[MAX_OCTAVES]; int* i_ext_off[MAX_OCTAVES]; int* feat_to_ext_map; Extremum* extrema; Feature* features; -}; -extern thread_local ExtremaCounters hct; -extern __device__ ExtremaCounters dct; -extern thread_local ExtremaBuffers hbuf; -extern __device__ ExtremaBuffers dbuf; -extern thread_local ExtremaBuffers dbuf_shadow; // just for managing memories -extern __device__ DevBuffers dobuf; -extern thread_local DevBuffers dobuf_shadow; // just for managing memories + /** Allocate buffers that hold intermediate and final information about + * extrema and if applicable, orientations and descriptors. + * Buffers are located in managed memory. + */ + void init( int num_octave, int max_extrema, int max_orientations ); + + /** Release all buffers */ + void uninit( ); + + /** Function that allows to resize buffers when all extrema have been + * detected and the resulting total is too large. + */ + void growBuffers( int numExtrema ); +}; class Pyramid { @@ -76,6 +157,12 @@ class Pyramid /* the download of converted descriptors should be asynchronous */ cudaStream_t _download_stream; + ExtremaCounters* _ct; + ExtremaBuffers* _buf; + + /** A structure that encapsulates everything we need for grid filtering */ + FilterGrid _fg; + public: enum GaussTableChoice { Interpolated_FromPrevious, @@ -144,9 +231,9 @@ class Pyramid void reset_extrema_mgmt( ); void build_pyramid( const Config& conf, ImageBase* base ); void find_extrema( const Config& conf ); + void finalize_extrema( ); void reallocExtrema( int numExtrema ); - int extrema_filter_grid( const Config& conf, int ext_total ); // called at head of orientation void orientation( const Config& conf ); void descriptors( const Config& conf ); diff --git a/testScripts/CMakeLists.txt b/testScripts/CMakeLists.txt index 411a8e30..28185847 100755 --- a/testScripts/CMakeLists.txt +++ b/testScripts/CMakeLists.txt @@ -26,3 +26,6 @@ add_custom_target( DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/testOxfordDataset.sh DEPENDS popsift-demo ) + +add_subdirectory(compareVersions) + diff --git a/testScripts/compareVersions/CMakeLists.txt b/testScripts/compareVersions/CMakeLists.txt new file mode 100755 index 00000000..43af4c8b --- /dev/null +++ b/testScripts/compareVersions/CMakeLists.txt @@ -0,0 +1,46 @@ +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/${CMAKE_SYSTEM_NAME}-${CMAKE_SYSTEM_PROCESSOR}") + +find_program(AWK awk mawk gawk) +find_program(SORT sort) +find_program(GNUPLOT gnuplot) +find_program(CONVERT convert) +find_program(VLFEAT sift PATH_SUFFIXES glnxa64 bin/glnxa64) + +if(GNUPLOT MATCHES ".+-NOTFOUND") + message(WARNING "WARNING: gnupot could not be found (${GNUPLOT}).") +else() + message(STATUS "gnuplot found (${GNUPLOT}).") +endif() + +if(CONVERT MATCHES ".+-NOTFOUND") + message(WARNING "WARNING: ImageMagick's convert could not be found (${CONVERT}).") +else() + message(STATUS "convert found (${CONVERT}).") +endif() + +if(VLFEAT MATCHES ".+-NOTFOUND") + message(WARNING "WARNING: vlfeat could not be found (${VLFEAT}).") +else() + message(STATUS "vlfeat found (${VLFEAT}).") +endif() + +if(SORT MATCHES ".+-NOTFOUND") + message(WARNING "WARNING: sort could not be found (${SORT}). Compare tests will fail.") +else() + message(STATUS "sort found (${SORT}).") +endif() + +if(AWK MATCHES ".+-NOTFOUND") + message(WARNING "WARNING: awk (and mawk and gawk) could not be found (${AWK}). Compare tests will fail") +else() + message(STATUS "awk found (${AWK}).") +endif() + +configure_file( ${CMAKE_CURRENT_SOURCE_DIR}/compareWithVLFeat.sh.in + ${CMAKE_CURRENT_BINARY_DIR}/compareWithVLFeat.sh + @ONLY) + +configure_file( ${CMAKE_CURRENT_SOURCE_DIR}/Compare.mk.in + Compare.mk + @ONLY) + diff --git a/testScripts/compareVersions/Compare.mk.in b/testScripts/compareVersions/Compare.mk.in new file mode 100755 index 00000000..ffa8f517 --- /dev/null +++ b/testScripts/compareVersions/Compare.mk.in @@ -0,0 +1,24 @@ +all: double + +single: @CMAKE_RUNTIME_OUTPUT_DIRECTORY@/popsift-compareSiftFiles + bash ./compareWithVLFeat.sh -1 popsift vlfeat + +double: @CMAKE_RUNTIME_OUTPUT_DIRECTORY@/popsift-compareSiftFiles + bash ./compareWithVLFeat.sh -1 vlfeat default -2 popsift vlfeat + +triple: @CMAKE_RUNTIME_OUTPUT_DIRECTORY@/popsift-compareSiftFiles + bash ./compareWithVLFeat.sh -1 vlfeat default -2 popsift default -3 popsift vlfeat + +clean: + rm -rf popsift-*-popsift-* + rm -rf vlfeat-*-vlfeat-* + rm -rf vlfeat-*-popsift-* + rm -rf popsift-*-vlfeat-* + rm -f sort*.txt + rm -f sort*.png + rm -f *.sift + rm -f descdist*.txt + rm -f descdist*.png + rm -f tmp/*.sift tmp/UML*.txt tmp/sort*.txt tmp/coord*.txt tmp/sum*.txt + rm -f tmp/descdist*.txt + diff --git a/testScripts/compareVersions/awk/compute.awk b/testScripts/compareVersions/awk/compute.awk new file mode 100644 index 00000000..aadd528d --- /dev/null +++ b/testScripts/compareVersions/awk/compute.awk @@ -0,0 +1,18 @@ +{ + printf("%f %f -- ", $1, $2); + total = 0 + for(i=5; i<=NF; i++) { total = total + $i; } + printf("%d -- ", total); + i = 5; + for(r=0; r<16; r++) + { + total = 0 + for(i=0; i<8; i++) + { + idx = 5 + r*8 + i + total = total + $idx; + } + printf("%d ", total/8); + } + printf("\n"); +} diff --git a/testScripts/compareVersions/awk/desc-to-heat.awk b/testScripts/compareVersions/awk/desc-to-heat.awk new file mode 100644 index 00000000..b84e7ef1 --- /dev/null +++ b/testScripts/compareVersions/awk/desc-to-heat.awk @@ -0,0 +1,23 @@ +BEGIN { + x = -1.5; + y = -1.5; +} + +{ + if( NF == 10 ) + { + printf("%.1f %.1f %f\n", x, y, $COL); + x = x + 1; + if( x > 2 ) + { + printf("\n"); + x = -1.5; + y = y + 1; + } + } +} + +END { + printf("\n"); +} + diff --git a/testScripts/compareVersions/compareWithVLFeat.sh.in b/testScripts/compareVersions/compareWithVLFeat.sh.in new file mode 100755 index 00000000..cbabc282 --- /dev/null +++ b/testScripts/compareVersions/compareWithVLFeat.sh.in @@ -0,0 +1,353 @@ +#!/bin/bash + +CMD_VLF=@VLFEAT@ +CMD_POP=@CMAKE_RUNTIME_OUTPUT_DIRECTORY@/popsift-demo +CMD_AWK=@AWK@ +CMD_SORT=@SORT@ +CMD_GP=@GNUPLOT@ +CMD_CONVERT=@CONVERT@ +CMD_COMPARE=@CMAKE_RUNTIME_OUTPUT_DIRECTORY@/popsift-compareSiftFiles +DIR_AWK=@CMAKE_CURRENT_SOURCE_DIR@/awk +DIR_TEST=@PopSift_TESTFILE_PATH@ + +VLFEAT_DEFAULT_PARAMETERS="" + +# VLFeat multiplies the descriptors with 512 before converting to uchar, +# so we use 9 for a 2**9 multiplier as well. +POPSIFT_DEFAULT_PARAMETERS="--pgmread-loading --norm-mode=classic --norm-multi 9 --write-as-uchar --write-with-ori --initial-blur 0.5 --sigma 1.6 --threshold 0 --edge-threshold 10.0" + +# FILES="hash level1 boat" +# FILES="level1" +# FILES="hash" +# FILES="boat1 boat" +# FILES="boat1" +FILES="boat" + +run_all_extract() { + local file="$1" + local testname="$2" + local srcfile="tmp/${file}-${testname}.sift" + local dstfile="tmp/sum-${file}-${testname}.txt" + local coordfile="tmp/coord-${file}-${testname}.txt" + + ${CMD_AWK} -e '{printf("%f %f %f %f\n",$1,$2,$3,$4);}' < ${srcfile} > ${coordfile} + ${CMD_AWK} -f ${DIR_AWK}/compute.awk ${srcfile} > ${dstfile} +} + +run_vlfeat() { + local testname="$1" + local file="$2" + local par="$3" + local dstfile="tmp/${file}-${testname}.sift" + + local PAR0="${VLFEAT_DEFAULT_PARAMETERS}" + + echo ${CMD_VLF} ${par} -v ${file}.pgm + ${CMD_VLF} ${PAR0} ${par} -v ${file}.pgm + ${CMD_SORT} -n ${file}.sift > ${dstfile} + rm ${file}.sift + run_all_extract $file $testname +} + +run_popsift() { + local testname="$1" + local file="$2" + local par="$3" + local dstfile="tmp/${file}-${testname}.sift" + + local PAR0="${POPSIFT_DEFAULT_PARAMETERS}" + + echo ${CMD_POP} ${PAR0} ${par} -i ${file}.pgm + ${CMD_POP} ${PAR0} ${par} -i ${file}.pgm + ${CMD_SORT} -n output-features.txt > ${dstfile} + rm output-features.txt + run_all_extract $file $testname +} + +prepare_testfiles() { + for file in $1 ; do + if [ ! -f ${file}.pgm ] + then + ${CMD_CONVERT} ${DIR_TEST}/$file/img1.ppm ${file}.pgm + if [ $? -ne 0 ] + then + echo " ${CMD_CONVERT} failed, test image not converted" + exit -1 + else + echo " converted ${file} to PGM" + fi + fi + done +} + +parse_commandline() { + local i=0 + while [ $# -ne 0 ] + do + # echo " #: $# p$i: p0=$0 p1=$1 p*=$*" + if [[ "$1" =~ ^\-[0-9]$ ]] ; then + shift 1 + if [ "$1" == "vlfeat" ] ; then + configurations["$i"]="vlfeat-$2" + echo "Valid configuration vlfeat-$2" + shift 1 + elif [ "$1" == "popsift" ] ; then + configurations["$i"]="popsift-$2" + echo "Valid configuration popsift-$2" + shift 1 + else + echo "Invalid configuration $1, skipping" + fi + elif [ $1 == "-h" ] ; then + echo "Usage: $0 [ - ]+" + echo " is a single-digital integer" + echo " is either 'vlfeat ' or 'popsift '" + echo " where for vlfeat is 'default'" + echo " and for popsift is 'default', 'loop' or 'vlfeat'" + exit 0 + fi + i=$((i+1)) + shift 1 + done +} + +compare_configurations() { + local left=$1 + local right=$2 + local dest="$left-$right" + mkdir -p ${dest}/tmp + mkdir -p ${dest} + + for file in ${FILES} ; do + echo " ----------------------------------------------------------------------------" + echo " BEGIN brute force matching ${file} for ${dest}" + echo " comparison tool: ${CMD_COMPARE}" + echo " left input: tmp/${file}-${left}.sift" + echo " right input: tmp/${file}-${right}.sift" + ${CMD_COMPARE} -o ${dest}/tmp/UML-${file}.txt \ + -d ${dest}/tmp/descdist-${file}.txt \ + tmp/${file}-${left}.sift \ + tmp/${file}-${right}.sift + + echo " END brute force matching ${file} for ${dest}" + echo " ----------------------------------------------------------------------------" + done +} + +sort_configuration() { + local dest=$1 + + for file in ${FILES} ; do + echo " Sorting ${dest} for ${file}" + ${CMD_SORT} -k3 -g ${dest}/tmp/UML-${file}.txt > ${dest}/tmp/sort-${file}-by-1st-match.txt + ${CMD_SORT} -k6 -g ${dest}/tmp/UML-${file}.txt > ${dest}/tmp/sort-${file}-by-pixdist.txt + ${CMD_SORT} -k8 -g ${dest}/tmp/UML-${file}.txt > ${dest}/tmp/sort-${file}-by-angle.txt + ${CMD_SORT} -k10 -g ${dest}/tmp/UML-${file}.txt > ${dest}/tmp/sort-${file}-by-2nd-match.txt + done +} + +make_distance_stats() { + local dest=$1 + local awkfile="${DIR_AWK}/desc-to-heat.awk" + local srcfile="${dest}/tmp/descdist-${file}.txt" + local dstfile="${dest}/descdist-${file}.txt" + + for file in ${FILES} ; do + echo " Converting descriptor distance stats" + awk -vCOL=3 -f ${awkfile} ${srcfile} > ${dstfile} + awk -vCOL=4 -f ${awkfile} ${srcfile} >> ${dstfile} + awk -vCOL=5 -f ${awkfile} ${srcfile} >> ${dstfile} + awk -vCOL=6 -f ${awkfile} ${srcfile} >> ${dstfile} + awk -vCOL=7 -f ${awkfile} ${srcfile} >> ${dstfile} + awk -vCOL=8 -f ${awkfile} ${srcfile} >> ${dstfile} + awk -vCOL=9 -f ${awkfile} ${srcfile} >> ${dstfile} + awk -vCOL=10 -f ${awkfile} ${srcfile} >> ${dstfile} + done +} + +make_plots() { + local dest=$1 + + for file in ${FILES} ; do + echo "Calling gnuplot (pixdist)" + echo "set title \"L2 distance between pixels, ${dest}" > cmd.gp + echo "set xlabel \"Keypoint index sorted by closest best match\"" >> cmd.gp + echo "set logscale y" >> cmd.gp + echo "set terminal png" >> cmd.gp + echo "set output \"${dest}/sort-${file}-by-pixdist.png\"" >> cmd.gp + echo "plot \"${dest}/tmp/sort-${file}-by-pixdist.txt\" using (\$6+0.00001) notitle" >> cmd.gp + ${CMD_GP} cmd.gp + + echo "Calling gnuplot (1st dist)" + echo "set title \"L2 distance between descriptors, ${dest}" > cmd.gp + echo "set xlabel \"Keypoint index sorted by closest best match\"" >> cmd.gp + echo "set terminal png" >> cmd.gp + echo "set output \"/dev/null\"" >> cmd.gp + echo "plot \"${dest}/tmp/sort-${file}-by-1st-match.txt\" using 3 title \"best distance\"" >> cmd.gp + echo "replot \"${dest}/tmp/sort-${file}-by-1st-match.txt\" using 10 title \"2nd best distance\"" >> cmd.gp + echo "set output \"${dest}/sort-${file}-by-1st-match.png\"" >> cmd.gp + echo "replot" >> cmd.gp + ${CMD_GP} cmd.gp + + echo "Calling gnuplot for angular diff (1st dist)" + echo "set title \"Distance in degree between orientations, ${dest}" > cmd.gp + echo "set ylabel \"Difference (degree)\"" >> cmd.gp + echo "set xlabel \"Keypoint index sorted by orientation difference\"" >> cmd.gp + echo "set grid" >> cmd.gp + echo "set logscale y" >> cmd.gp + echo "set yrange [0.001:*]" >> cmd.gp + echo "set style data lines" >> cmd.gp + echo "set terminal png" >> cmd.gp + echo "set output \"${dest}/sort-${file}-by-angle.png\"" >> cmd.gp + echo "plot \"${dest}/tmp/sort-${file}-by-angle.txt\" using 8 notitle" >> cmd.gp + ${CMD_GP} cmd.gp + + echo "Calling gnuplot (2nd dist)" + echo "set title \"L2 distance between descriptors, ${dest}" > cmd.gp + echo "set xlabel \"Keypoint index sorted by 2nd best match\"" >> cmd.gp + echo "set terminal png" >> cmd.gp + echo "set output \"/dev/null\"" >> cmd.gp + echo "plot \"${dest}/tmp/sort-${file}-by-2nd-match.txt\" using 3 title \"best distance\"" >> cmd.gp + echo "replot \"${dest}/tmp/sort-${file}-by-2nd-match.txt\" using 10 title \"2nd best distance\"" >> cmd.gp + echo "set output \"${dest}/sort-${file}-by-2nd-match.png\"" >> cmd.gp + echo "replot" >> cmd.gp + ${CMD_GP} cmd.gp + + echo "Calling gnuplot (descriptor summary)" + echo "set view 80, 20, 1, 1.48" > cmd.gp + echo "set xrange [ -1.5 : 1.5 ]" >> cmd.gp + echo "set yrange [ -1.5 : 1.5 ]" >> cmd.gp + echo "set zrange [ -5 : 5 ]" >> cmd.gp + echo "set xtics -1.5,3 offset 0,-0.5" >> cmd.gp + echo "set ytics 1.5,3 offset 0.5" >> cmd.gp + echo "set ztics -3,3" >> cmd.gp + echo "set ticslevel 0" >> cmd.gp + echo "set format cb '%4.1f'" >> cmd.gp + echo "unset colorbox" >> cmd.gp + echo "set pm3d implicit at s" >> cmd.gp + echo "set terminal png size 1080, 250" >> cmd.gp + echo "set output \"${dest}/descdist-${file}.png\"" >> cmd.gp + echo "set multiplot layout 1,8 title '8 bins in each of 16 sections'" >> cmd.gp + echo "set title 'bin 0 '" >> cmd.gp + echo "splot \"${dest}/descdist-${file}.txt\" index 0 using 1:2:3:3 with pm3d notitle" >> cmd.gp + echo "set title 'bin 1 '" >> cmd.gp + echo "splot \"${dest}/descdist-${file}.txt\" index 1 using 1:2:3:3 with pm3d notitle" >> cmd.gp + echo "set title 'bin 2 '" >> cmd.gp + echo "splot \"${dest}/descdist-${file}.txt\" index 2 using 1:2:3:3 with pm3d notitle" >> cmd.gp + echo "set title 'bin 3 '" >> cmd.gp + echo "splot \"${dest}/descdist-${file}.txt\" index 3 using 1:2:3:3 with pm3d notitle" >> cmd.gp + echo "set title 'bin 4 '" >> cmd.gp + echo "splot \"${dest}/descdist-${file}.txt\" index 4 using 1:2:3:3 with pm3d notitle" >> cmd.gp + echo "set title 'bin 5 '" >> cmd.gp + echo "splot \"${dest}/descdist-${file}.txt\" index 5 using 1:2:3:3 with pm3d notitle" >> cmd.gp + echo "set title 'bin 6 '" >> cmd.gp + echo "splot \"${dest}/descdist-${file}.txt\" index 6 using 1:2:3:3 with pm3d notitle" >> cmd.gp + echo "set title 'bin 7 '" >> cmd.gp + echo "splot \"${dest}/descdist-${file}.txt\" index 7 using 1:2:3:3 with pm3d notitle" >> cmd.gp + ${CMD_GP} cmd.gp + + rm -f cmd.gp + done +} + +mkdir -p tmp + +declare -A configurations +parse_commandline $* + +for i in "${!configurations[@]}" +do + echo " Conf $i : ${configurations[$i]}" +done + +prepare_testfiles "${FILES}" + +echo "################################################################################" +echo "All configurations: ${configurations[@]}" +echo "################################################################################" + +for TESTNAME in "${configurations[@]}" ; do + rm -f hash.sift* + rm -f level1.sift* + rm -f coord-* + rm -f sum-* + + for file in ${FILES} ; do + if [ "$TESTNAME" = "vlfeat-default" ]; then + echo "Test is vlfeat" + PAR1=" " + run_vlfeat ${TESTNAME} ${file} ${PAR1} + elif [ "$TESTNAME" = "popsift-default" ]; then + echo "Test is popsift-default" + PAR1=" " + run_popsift ${TESTNAME} ${file} ${PAR1} + elif [ "$TESTNAME" = "popsift-loop" ]; then + echo "Test is popsift-loop: --desc-mode=loop" + PAR1="--desc-mode=loop" + run_popsift ${TESTNAME} ${file} ${PAR1} + elif [ "$TESTNAME" = "popsift-vlfeat" ]; then + echo "Test is popsift-vlfeat: --desc-mode=vlfeat" + PAR1="--desc-mode=vlfeat" + run_popsift ${TESTNAME} ${file} ${PAR1} + else + echo "Test is undefined, $TESTNAME" + exit + fi + done +done + +echo "################################################################################" +echo "Configurations valid for comparison" +echo "################################################################################" + +for left in "${configurations[@]}" ; do + mustCompare=0 + for right in "${configurations[@]}" ; do + if [ ${mustCompare} -eq 1 ] ; then + echo " $left vs $right" + else + if [ "$left" == "$right" ] ; then + mustCompare=1 + fi + fi + done +done + +echo "################################################################################" +echo "Performing brute-force matching" +echo "################################################################################" + +for left in "${configurations[@]}" ; do + mustCompare=0 + for right in "${configurations[@]}" ; do + if [ ${mustCompare} -eq 1 ] ; then + compare_configurations $left $right + else + if [ "$left" == "$right" ] ; then + mustCompare=1 + fi + fi + done +done + +echo "################################################################################" +echo "Creating output" +echo "################################################################################" + +for left in "${configurations[@]}" ; do + mustCompare=0 + for right in "${configurations[@]}" ; do + if [ ${mustCompare} -eq 1 ] ; then + sort_configuration "$left-$right" + make_distance_stats "$left-$right" + make_plots "$left-$right" + else + if [ "$left" == "$right" ] ; then + mustCompare=1 + fi + fi + done +done + +exit 0 +