From 0b7527698e58fd46130aa1c98e7e6edaaf1198fe Mon Sep 17 00:00:00 2001 From: Carsten Griwodz Date: Fri, 16 Oct 2020 11:57:18 +0200 Subject: [PATCH 01/30] [popsift] improve docs and options --- src/application/main.cpp | 47 +++++++++++++++-------- src/popsift/features.cu | 31 +++++++++++---- src/popsift/features.h | 4 +- src/popsift/sift_conf.cu | 83 +++++++++++++++++++++++++++++++--------- src/popsift/sift_conf.h | 58 ++++++++++++++++++++-------- src/popsift/sift_desc.cu | 7 ++-- 6 files changed, 167 insertions(+), 63 deletions(-) diff --git a/src/application/main.cpp b/src/application/main.cpp index 0eec1c22..689c5a01 100755 --- a/src/application/main.cpp +++ b/src/application/main.cpp @@ -42,6 +42,7 @@ using namespace std; static bool print_dev_info = false; static bool print_time_info = false; static bool write_as_uchar = false; +static bool write_with_ori = false; static bool dont_write = false; static bool pgmread_loading = false; static bool float_mode = false; @@ -62,15 +63,30 @@ static void parseargs(int argc, char** argv, popsift::Config& config, string& in options_description parameters("Parameters"); { parameters.add_options() - ("octaves", value(&config.octaves), "Number of octaves") - ("levels", value(&config.levels), "Number of levels per octave") - ("sigma", value()->notifier([&](float f) { config.setSigma(f); }), "Initial sigma value") - - ("threshold", value()->notifier([&](float f) { config.setThreshold(f); }), "Contrast threshold") - ("edge-threshold", value()->notifier([&](float f) { config.setEdgeLimit(f); }), "On-edge threshold") - ("edge-limit", value()->notifier([&](float f) { config.setEdgeLimit(f); }), "On-edge threshold") - ("downsampling", value()->notifier([&](float f) { config.setDownsampling(f); }), "Downscale width and height of input by 2^N") - ("initial-blur", value()->notifier([&](float f) {config.setInitialBlur(f); }), "Assume initial blur, subtract when blurring first time"); + ("octaves", + value(&config.octaves)->default_value(config.getOctaves()), + "Number of octaves") + ("levels", + value(&config.levels)->default_value(config.getLevels()), + "Number of levels per octave") + ("sigma", + value()->notifier([&](float f) { config.setSigma(f); })->default_value(config.getSigma()), + "Initial sigma value") + ("threshold", + value()->notifier([&](float f) { config.setThreshold(f); })->default_value(config.getThreshold()), + "Contrast threshold") + ("edge-threshold", + value()->notifier([&](float f) { config.setEdgeLimit(f); })->default_value(config.getEdgeLimit()), + "On-edge threshold") + ("edge-limit", + value()->notifier([&](float f) { config.setEdgeLimit(f); }), + "On-edge threshold") + ("downsampling", + value()->notifier([&](float f) { config.setDownsampling(f); })->default_value(config.getDownsampling()), + "Downscale width and height of input by 2^N") + ("initial-blur", + value()->notifier([&](float f) {config.setInitialBlur(f); })->default_value(config.getInitialBlur()), + "Assume initial blur, subtract when blurring first time"); } options_description modes("Modes"); { @@ -79,11 +95,8 @@ static void parseargs(int argc, char** argv, popsift::Config& config, string& in popsift::Config::getGaussModeUsage() ) // "Choice of span (1-sided) for Gauss filters. Default is VLFeat-like computation depending on sigma. " // "Options are: vlfeat, relative, relative-all, opencv, fixed9, fixed15" - ("desc-mode", value()->notifier([&](const std::string& s) { config.setDescMode(s); }), - "Choice of descriptor extraction modes:\n" - "loop, iloop, grid, igrid, notile\n" - "Default is loop\n" - "loop is OpenCV-like horizontal scanning, computing only valid points, grid extracts only useful points but rounds them, iloop uses linear texture and rotated gradiant fetching. igrid is grid with linear interpolation. notile is like igrid but avoids redundant gradiant fetching.") + ( "desc-mode", value()->notifier([&](const std::string& s) { config.setDescMode(s); }), + popsift::Config::getDescModeUsage() ) ("popsift-mode", bool_switch()->notifier([&](bool b) { if(b) config.setMode(popsift::Config::PopSift); }), "During the initial upscale, shift pixels by 1. In extrema refinement, steps up to 0.6, do not reject points when reaching max iterations, " "first contrast threshold is .8 * peak thresh. Shift feature coords octave 0 back to original pos.") @@ -104,7 +117,7 @@ static void parseargs(int argc, char** argv, popsift::Config& config, string& in ( "norm-mode", value()->notifier([&](const std::string& s) { config.setNormMode(s); }), popsift::Config::getNormModeUsage() ) ( "root-sift", bool_switch()->notifier([&](bool b) { if(b) config.setNormMode(popsift::Config::RootSift); }), - popsift::Config::getNormModeUsage() ) + "synonym to --norm-mode=RootSift" ) ("filter-max-extrema", value()->notifier([&](int f) {config.setFilterMaxExtrema(f); }), "Approximate max number of extrema.") ("filter-grid", value()->notifier([&](int f) {config.setFilterGridSize(f); }), "Grid edge length for extrema filtering (ie. value 4 leads to a 4x4 grid)") ("filter-sort", value()->notifier([&](const std::string& s) {config.setFilterSorting(s); }), "Sort extrema in each cell by scale, either random (default), up or down"); @@ -118,6 +131,7 @@ static void parseargs(int argc, char** argv, popsift::Config& config, string& in ("print-time-info", bool_switch(&print_time_info)->default_value(false), "A debug output printing image processing time after load()") ("write-as-uchar", bool_switch(&write_as_uchar)->default_value(false), "Output descriptors rounded to int.\n" "Scaling to sensible ranges is not automatic, should be combined with --norm-multi=9 or similar") + ("write-with-ori", bool_switch(&write_with_ori)->default_value(false), "Output points are written with sigma and orientation.\n") ("dont-write", bool_switch(&dont_write)->default_value(false), "Suppress descriptor output") ("pgmread-loading", bool_switch(&pgmread_loading)->default_value(false), "Use the old image loader instead of LibDevIL") ("float-mode", bool_switch(&float_mode)->default_value(false), "Upload image to GPU as float instead of byte") @@ -254,7 +268,7 @@ void read_job( SiftJob* job, bool really_write ) nvtxRangePushA( "Writing features to disk" ); std::ofstream of( "output-features.txt" ); - feature_list->print( of, write_as_uchar ); + feature_list->print( of, write_as_uchar, write_with_ori ); } delete feature_list; @@ -327,4 +341,3 @@ int main(int argc, char **argv) return EXIT_SUCCESS; } - diff --git a/src/popsift/features.cu b/src/popsift/features.cu index 023279ff..d789c441 100755 --- a/src/popsift/features.cu +++ b/src/popsift/features.cu @@ -107,16 +107,16 @@ void FeaturesHost::unpin( ) cudaHostUnregister( _ori ); } -void FeaturesHost::print( std::ostream& ostr, bool write_as_uchar ) const +void FeaturesHost::print( std::ostream& ostr, bool write_as_uchar, bool with_orientation ) const { for( int i=0; ifeatures[i]) << " "; @@ -328,7 +345,7 @@ void Feature::print( std::ostream& ostr, bool write_as_uchar ) const std::ostream& operator<<( std::ostream& ostr, const Feature& feature ) { - feature.print( ostr, false ); + feature.print( ostr, false, false ); return ostr; } diff --git a/src/popsift/features.h b/src/popsift/features.h index 3b16f954..a05a8197 100755 --- a/src/popsift/features.h +++ b/src/popsift/features.h @@ -33,7 +33,7 @@ struct Feature float orientation[ORIENTATION_MAX_COUNT]; Descriptor* desc[ORIENTATION_MAX_COUNT]; - void print( std::ostream& ostr, bool write_as_uchar ) const; + void print( std::ostream& ostr, bool write_as_uchar, bool with_orientation ) const; }; std::ostream& operator<<( std::ostream& ostr, const Feature& feature ); @@ -91,7 +91,7 @@ class FeaturesHost : public FeaturesBase inline Feature* getFeatures() { return _ext; } inline Descriptor* getDescriptors() { return _ori; } - void print( std::ostream& ostr, bool write_as_uchar ) const; + void print( std::ostream& ostr, bool write_as_uchar, bool with_orientation ) const; protected: friend class Pyramid; diff --git a/src/popsift/sift_conf.cu b/src/popsift/sift_conf.cu index 251f58ff..db2bd42a 100644 --- a/src/popsift/sift_conf.cu +++ b/src/popsift/sift_conf.cu @@ -9,9 +9,17 @@ #include "sift_conf.h" #include +#include using namespace std; +static bool stringIsame( string l, string r ) +{ + std::for_each( l.begin(), l.end(), [](char& c) { c = ::tolower(c); }); + std::for_each( r.begin(), r.end(), [](char& c) { c = ::tolower(c); }); + return l == r; +} + namespace popsift { @@ -81,6 +89,25 @@ void Config::setDescMode( Config::DescMode m ) _desc_mode = m; } +const char* Config::getDescModeUsage( ) +{ + return "Choice of descriptor extraction modes:\n" + "loop, iloop, grid, igrid, notile, vlfeat\n" + "Default is loop\n" + "loop is OpenCV-like horizontal scanning, sampling every pixel in a radius around the " + "centers or the 16 tiles arond the keypoint. Each sampled point contributes to two " + "histogram bins." + "iloop is like loop but samples all constant 1-pixel distances from the keypoint, " + "using the CUDA texture engine for interpolation. " + "grid is like loop but works on rotated, normalized tiles, relying on CUDA 2D cache " + "to replace the manual data aligment idea of loop. " + "igrid iloop and grid. " + "notile is like igrid but handles all 16 tiles at once.\n" + "vlfeat is VLFeat-like horizontal scanning, sampling every pixel in a radius around " + "keypoint itself, using the 16 tile centers only for weighting. Every sampled point " + "contributes to up to eight historgram bins."; +} + void Config::setGaussMode( const std::string& m ) { if( m == "vlfeat" ) @@ -196,15 +223,25 @@ void Config::setNormMode( Config::NormMode m ) void Config::setNormMode( const std::string& m ) { - if( m == "RootSift" ) setNormMode( Config::RootSift ); - else if( m == "classic" ) setNormMode( Config::Classic ); + if( stringIsame( m, "RootSift" ) ) + { + setNormMode( Config::RootSift ); + } + else if( stringIsame( m, "L2" ) ) + { + setNormMode( Config::Classic ); + } + else if( stringIsame( m, "Classic" ) ) + { + setNormMode( Config::Classic ); + } else POP_FATAL( string("Bad Normalization mode.\n") + getGaussModeUsage() ); } Config::NormMode Config::getNormModeDefault( ) { - return Config::RootSift; + return Config::NormDefault; } const char* Config::getNormModeUsage( ) @@ -232,12 +269,24 @@ int Config::getNormalizationMultiplier( ) const return _normalization_multiplier; } -void Config::setDownsampling( float v ) { _upscale_factor = -v; } +void Config::setDownsampling( float v ) { _upscale_factor = -v; } +float Config::getDownsampling( ) const { return -_upscale_factor; } + void Config::setOctaves( int v ) { octaves = v; } +int Config::getOctaves( ) const { return octaves; } + void Config::setLevels( int v ) { levels = v; } -void Config::setSigma( float v ) { sigma = v; } -void Config::setEdgeLimit( float v ) { _edge_limit = v; } -void Config::setThreshold( float v ) { _threshold = v; } +int Config::getLevels( ) const { return levels; } + +void Config::setSigma( float v ) { sigma = v; } +float Config::getSigma( ) const { return sigma; } + +void Config::setEdgeLimit( float v ) { _edge_limit = v; } +float Config::getEdgeLimit( ) const { return _edge_limit; } + +void Config::setThreshold( float v ) { _threshold = v; } +float Config::getThreshold( ) const { return _threshold; } + void Config::setPrintGaussTables() { _print_gauss_tables = true; } void Config::setFilterMaxExtrema( int ext ) { _filter_max_extrema = ext; } void Config::setFilterGridSize( int sz ) { _filter_grid_size = sz; } @@ -252,25 +301,24 @@ void Config::setInitialBlur( float blur ) _initial_blur = blur; } } - -Config::GaussMode Config::getGaussMode( ) const +bool Config::hasInitialBlur( ) const { - return _gauss_mode; + return _assume_initial_blur; } - -Config::SiftMode Config::getSiftMode() const +float Config::getInitialBlur( ) const { - return _sift_mode; + return _initial_blur; } -bool Config::hasInitialBlur( ) const + +Config::GaussMode Config::getGaussMode( ) const { - return _assume_initial_blur; + return _gauss_mode; } -float Config::getInitialBlur( ) const +Config::SiftMode Config::getSiftMode() const { - return _initial_blur; + return _sift_mode; } float Config::getPeakThreshold() const @@ -304,4 +352,3 @@ bool Config::equal( const Config& other ) const } }; // namespace popsift - diff --git a/src/popsift/sift_conf.h b/src/popsift/sift_conf.h index 583a958c..0d2ad330 100644 --- a/src/popsift/sift_conf.h +++ b/src/popsift/sift_conf.h @@ -74,6 +74,7 @@ struct Config */ enum ScalingMode { + /// Experimental, non-working mode - scale direct from input ScaleDirect, /// Indirect - only working method ScaleDefault @@ -84,16 +85,16 @@ struct Config */ enum DescMode { - /// scan horizontal, extract valid points + /// scan horizontal, extract valid points - weight goes into 2 histogram bins Loop, - /// scan horizontal, extract valid points, interpolate with tex engine + /// loop-compatible; scan horizontal, extract valid points, interpolate with tex engine ILoop, - /// scan in rotated mode, round pixel address + /// loop-compatible; scan in rotated mode, round pixel address Grid, - /// scan in rotated mode, interpolate with tex engine + /// loop-compatible; scan in rotated mode, interpolate with tex engine IGrid, - /// variant of IGrid, no duplicate gradient fetching - NoTile + /// loop-compatible; variant of IGrid, no duplicate gradient fetching + NoTile, }; /** @@ -104,7 +105,9 @@ struct Config /// The L1-inspired norm, gives better matching results ("RootSift") RootSift, /// The L2-inspired norm, all descriptors on a hypersphere ("classic") - Classic + Classic, + /// The current default value + NormDefault = RootSift }; /** @@ -160,6 +163,12 @@ struct Config * @see LogMode */ void setLogMode( LogMode mode = All ); + + /** + * @brief Set the scaling mode. + * @param mode The scaling mode + * @see ScalingMode + */ void setScalingMode( ScalingMode mode = ScaleDefault ); /** @@ -182,16 +191,36 @@ struct Config */ void setDescMode( DescMode mode = Loop ); + /** + * @brief Helper functions for the main program's usage string. + */ + static const char* getDescModeUsage( ); + // void setGaussGroup( int groupsize ); // int getGaussGroup( ) const; - void setDownsampling( float v ); + void setDownsampling( float v ); + float getDownsampling( ) const; + void setOctaves( int v ); + int getOctaves( ) const; + void setLevels( int v ); - void setSigma( float v ); - void setEdgeLimit( float v ); - void setThreshold( float v ); - void setInitialBlur( float blur ); + int getLevels( ) const; + + void setSigma( float v ); + float getSigma( ) const; + + void setEdgeLimit( float v ); + float getEdgeLimit( ) const; + + void setThreshold( float v ); + float getThreshold( ) const; + + void setInitialBlur( float blur ); + bool hasInitialBlur( ) const; + float getInitialBlur( ) const; + // void setMaxExtreme( int m ); void setPrintGaussTables( ); // void setDPOrientation( bool on ); @@ -200,9 +229,6 @@ struct Config void setFilterSorting( const std::string& direction ); void setFilterSorting( GridFilterMode m ); - bool hasInitialBlur( ) const; - float getInitialBlur( ) const; - /// computes the actual peak threshold depending on the threshold /// parameter and the non-augmented number of levels float getPeakThreshold() const; @@ -321,7 +347,7 @@ struct Config /** * @brief Get the scaling mode. - * @return the descriptor extraction mode. + * @return the extraction mode. * @see ScalingMode */ inline ScalingMode getScalingMode() const { return _scaling_mode; } diff --git a/src/popsift/sift_desc.cu b/src/popsift/sift_desc.cu index b0eb0bd1..2b82b771 100644 --- a/src/popsift/sift_desc.cu +++ b/src/popsift/sift_desc.cu @@ -88,16 +88,17 @@ void Pyramid::descriptors( const Config& conf ) if( hct.ori_total == 0 ) { cerr << "Warning: no descriptors extracted" << endl; - return; + return; } dim3 block; - dim3 grid; - grid.x = grid_divide( hct.ori_total, 32 ); block.x = 32; block.y = 32; block.z = 1; + dim3 grid; + grid.x = grid_divide( hct.ori_total, block.y ); + if( conf.getUseRootSift() ) { normalize_histogram <<>> ( ); POP_SYNC_CHK; From ed60e199f99130a3b949f7f12ebb0c2fab801730 Mon Sep 17 00:00:00 2001 From: Carsten Griwodz Date: Fri, 16 Oct 2020 12:02:42 +0200 Subject: [PATCH 02/30] [bugfix] fix bug in L2 normalization --- src/popsift/s_desc_norm_l2.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/popsift/s_desc_norm_l2.h b/src/popsift/s_desc_norm_l2.h index 3a7ed858..96a161d4 100644 --- a/src/popsift/s_desc_norm_l2.h +++ b/src/popsift/s_desc_norm_l2.h @@ -58,15 +58,15 @@ void NormalizeL2::normalize( const float* src_desc, float* dst_desc, const bool float norm; if( threadIdx.x == 0 ) { - norm = normf( 128, src_desc ); + norm = rnormf( 128, src_desc ); // 1/sqrt(sum of squares) } __syncthreads(); norm = popsift::shuffle( norm, 0 ); - descr.x = min( descr.x, 0.2f*norm ); - descr.y = min( descr.y, 0.2f*norm ); - descr.z = min( descr.z, 0.2f*norm ); - descr.w = min( descr.w, 0.2f*norm ); + descr.x = min( descr.x*norm, 0.2f ); + descr.y = min( descr.y*norm, 0.2f ); + descr.z = min( descr.z*norm, 0.2f ); + descr.w = min( descr.w*norm, 0.2f ); norm = descr.x * descr.x + descr.y * descr.y @@ -96,14 +96,14 @@ void NormalizeL2::normalize( const float* src_desc, float* dst_desc, const bool norm += popsift::shuffle_down( norm, 2 ); norm += popsift::shuffle_down( norm, 1 ); if( threadIdx.x == 0 ) { - norm = __fsqrt_rn( norm ); + norm = __frsqrt_rn( norm ); } norm = popsift::shuffle( norm, 0 ); - descr.x = min( descr.x, 0.2f*norm ); - descr.y = min( descr.y, 0.2f*norm ); - descr.z = min( descr.z, 0.2f*norm ); - descr.w = min( descr.w, 0.2f*norm ); + descr.x = min( descr.x*norm, 0.2f ); + descr.y = min( descr.y*norm, 0.2f ); + descr.z = min( descr.z*norm, 0.2f ); + descr.w = min( descr.w*norm, 0.2f ); norm = descr.x * descr.x + descr.y * descr.y From b13d2596dcee2275b1917600bb4891af41136bff Mon Sep 17 00:00:00 2001 From: Carsten Griwodz Date: Fri, 16 Oct 2020 12:04:15 +0200 Subject: [PATCH 03/30] [cosmetic] improve normalize_histogram --- src/popsift/s_desc_normalize.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/popsift/s_desc_normalize.h b/src/popsift/s_desc_normalize.h index a87d0710..da711026 100644 --- a/src/popsift/s_desc_normalize.h +++ b/src/popsift/s_desc_normalize.h @@ -18,17 +18,17 @@ void normalize_histogram( ) Descriptor* descs = dbuf.desc; const int num_orientations = dct.ori_total; - int offset = blockIdx.x * 32 + threadIdx.y; + int offset = blockIdx.x * blockDim.y + threadIdx.y; // all of these threads are useless - if( blockIdx.x * 32 >= num_orientations ) return; + if( blockIdx.x * blockDim.y >= num_orientations ) return; - offset = ( offset < num_orientations ) ? offset - : num_orientations-1; - Descriptor* desc = &descs[offset]; + offset = min( offset, num_orientations-1 ); + + float* features = descs[offset].features; bool ignoreme = ( offset >= num_orientations ); - T::normalize( desc->features, ignoreme ); + T::normalize( features, ignoreme ); } From 2956630257eaec22e31a49fced3e74892c3ff9c4 Mon Sep 17 00:00:00 2001 From: Carsten Griwodz Date: Fri, 16 Oct 2020 12:13:27 +0200 Subject: [PATCH 04/30] [popsift] change orientation computation to get closer to VLFeat --- src/popsift/s_orientation.cu | 118 ++++++++++++----------------------- 1 file changed, 41 insertions(+), 77 deletions(-) diff --git a/src/popsift/s_orientation.cu b/src/popsift/s_orientation.cu index f6b36fcd..96d43049 100644 --- a/src/popsift/s_orientation.cu +++ b/src/popsift/s_orientation.cu @@ -28,36 +28,14 @@ using namespace popsift; using namespace std; -/* Smoothing like VLFeat is the default mode. - * If you choose to undefine it, you get the smoothing approach taken by OpenCV - */ -#define WITH_VLFEAT_SMOOTHING - namespace popsift { -__device__ -inline float compute_angle( int bin, float hc, float hn, float hp ) -{ - /* interpolate */ - float di = bin + 0.5f * (hn - hp) / (hc+hc-hn-hp); - - /* clamp */ - di = (di < 0) ? - (di + ORI_NBINS) : - ((di >= ORI_NBINS) ? (di - ORI_NBINS) : (di)); - - float th = __fdividef( M_PI2 * di, ORI_NBINS ) - M_PI; - // float th = ((M_PI2 * di) / ORI_NBINS); - return th; -} - /* * Histogram smoothing helper */ -template -__device__ -inline static float smoothe( const float* const src, const int bin ) +__device__ inline static +float smoothe( const float* const src, const int bin ) { const int prev = (bin == 0) ? ORI_NBINS-1 : bin-1; const int next = (bin == ORI_NBINS-1) ? 0 : bin+1; @@ -102,7 +80,7 @@ void ori_par( const int octave, /* orientation histogram radius */ const float sigw = ORI_WINFACTOR * sig; - const int32_t rad = (int)roundf((3.0f * sigw)); + const int32_t rad = max( (int)floorf((3.0f * sigw)), 1 ); const float factor = __fdividef( -0.5f, (sigw * sigw) ); const int sq_thres = rad * rad; @@ -111,10 +89,10 @@ void ori_par( const int octave, // int xmax = min(w - 2, (int)floor(x + rad)); // int ymin = max(1, (int)floor(y - rad)); // int ymax = min(h - 2, (int)floor(y + rad)); - int xmin = max(1, (int)roundf(x) - rad); - int xmax = min(w - 2, (int)roundf(x) + rad); - int ymin = max(1, (int)roundf(y) - rad); - int ymax = min(h - 2, (int)roundf(y) + rad); + int xmin = max(0, (int)roundf(x) - rad); + int xmax = min(w - 1, (int)roundf(x) + rad); + int ymin = max(0, (int)roundf(y) - rad); + int ymax = min(h - 1, (int)roundf(y) + rad); int wx = xmax - xmin + 1; int hy = ymax - ymin + 1; @@ -136,25 +114,21 @@ void ori_par( const int octave, layer, level ); + grad /= 2; // our grad is twice that of VLFeat - weird + if( theta < 0 ) theta += M_PI2; + float dx = xx - x; float dy = yy - y; - int sq_dist = dx * dx + dy * dy; - if (sq_dist <= sq_thres) + float sq_dist = dx * dx + dy * dy; + if (sq_dist <= sq_thres + 0.6f) { float weight = grad * expf(sq_dist * factor); - // int bidx = (int)rintf( __fdividef( ORI_NBINS * (theta + M_PI), M_PI2 ) ); int bidx = (int)roundf( __fdividef( float(ORI_NBINS) * (theta + M_PI), M_PI2 ) ); - if( bidx > ORI_NBINS ) { - printf("Crashing: bin %d theta %f :-)\n", bidx, theta); - } - if( bidx < 0 ) { - printf("Crashing: bin %d theta %f :-)\n", bidx, theta); - } - - bidx = (bidx == ORI_NBINS) ? 0 : bidx; + while( bidx < 0 ) bidx += ORI_NBINS; + while( bidx >= ORI_NBINS ) bidx -= ORI_NBINS; atomicAdd( &hist[bidx], weight ); } @@ -162,62 +136,52 @@ void ori_par( const int octave, } __syncthreads(); -#ifdef WITH_VLFEAT_SMOOTHING for( int i=0; i<3 ; i++ ) { - sm_hist[threadIdx.x+ 0] = smoothe<0>( hist, threadIdx.x+ 0 ); - sm_hist[threadIdx.x+32] = smoothe<1>( hist, threadIdx.x+32 ); + sm_hist[threadIdx.x+ 0] = smoothe( hist, threadIdx.x+ 0 ); + sm_hist[threadIdx.x+32] = smoothe( hist, threadIdx.x+32 ); __syncthreads(); - hist[threadIdx.x+ 0] = smoothe<2>( sm_hist, threadIdx.x+ 0 ); - hist[threadIdx.x+32] = smoothe<3>( sm_hist, threadIdx.x+32 ); + hist[threadIdx.x+ 0] = smoothe( sm_hist, threadIdx.x+ 0 ); + hist[threadIdx.x+32] = smoothe( sm_hist, threadIdx.x+32 ); __syncthreads(); } sm_hist[threadIdx.x+ 0] = hist[threadIdx.x+ 0]; sm_hist[threadIdx.x+32] = hist[threadIdx.x+32]; __syncthreads(); -#else // not WITH_VLFEAT_SMOOTHING - for( int bin = threadIdx.x; bin < ORI_NBINS; bin += blockDim.x ) { - int prev2 = bin - 2; - int prev1 = bin - 1; - int next1 = bin + 1; - int next2 = bin + 2; - if( prev2 < 0 ) prev2 += ORI_NBINS; - if( prev1 < 0 ) prev1 += ORI_NBINS; - if( next1 >= ORI_NBINS ) next1 -= ORI_NBINS; - if( next2 >= ORI_NBINS ) next2 -= ORI_NBINS; - sm_hist[bin] = ( hist[prev2] + hist[next2] - + ( hist[prev1] + hist[next1] ) * 4.0f - + hist[bin] * 6.0f ) / 16.0f; - } - __syncthreads(); -#endif // not WITH_VLFEAT_SMOOTHING + + if( threadIdx.x+32 >= ORI_NBINS ) sm_hist[threadIdx.x+32] = -INFINITY; + float maxval = fmaxf( sm_hist[threadIdx.x+ 0], sm_hist[threadIdx.x+32] ); + float neigh; + neigh = popsift::shuffle_down( maxval, 16 ); maxval = fmaxf( maxval, neigh ); + neigh = popsift::shuffle_down( maxval, 8 ); maxval = fmaxf( maxval, neigh ); + neigh = popsift::shuffle_down( maxval, 4 ); maxval = fmaxf( maxval, neigh ); + neigh = popsift::shuffle_down( maxval, 2 ); maxval = fmaxf( maxval, neigh ); + neigh = popsift::shuffle_down( maxval, 1 ); maxval = fmaxf( maxval, neigh ); + maxval = popsift::shuffle( maxval, 0 ); // sub-cell refinement of the histogram cell index, yielding the angle // not necessary to initialize, every cell is computed - for( int bin = threadIdx.x; popsift::any( bin < ORI_NBINS ); bin += blockDim.x ) { - const int prev = bin == 0 ? ORI_NBINS-1 : bin-1; - const int next = bin == ORI_NBINS-1 ? 0 : bin+1; + for( int bin = threadIdx.x; popsift::any( bin < ORI_NBINS ); bin += blockDim.x ) + { + const int prev = ( bin - 1 + ORI_NBINS ) % ORI_NBINS; + const int next = ( bin + 1 ) % ORI_NBINS; - bool predicate = ( bin < ORI_NBINS ) && ( sm_hist[bin] > max( sm_hist[prev], sm_hist[next] ) ); + bool predicate = ( bin < ORI_NBINS ) && + ( sm_hist[bin] > max( sm_hist[prev], sm_hist[next] ) ) && + ( sm_hist[bin] > 0.8f * maxval ); const float num = predicate ? 3.0f * sm_hist[prev] - 4.0f * sm_hist[bin] + 1.0f * sm_hist[next] : 0.0f; - // const float num = predicate ? 2.0f * sm_hist[prev] - // - 4.0f * sm_hist[bin] - // + 2.0f * sm_hist[next] - // : 0.0f; const float denB = predicate ? 2.0f * ( sm_hist[prev] - 2.0f * sm_hist[bin] + sm_hist[next] ) : 1.0f; const float newbin = __fdividef( num, denB ); // verified: accuracy OK - predicate = ( predicate && newbin >= 0.0f && newbin <= 2.0f ); - refined_angle[bin] = predicate ? prev + newbin : -1; - yval[bin] = predicate ? -(num*num) / (4.0f * denB) + sm_hist[prev] : -INFINITY; + yval[bin] = predicate ? -(num*num) / (4.0f * denB) + hist[prev] : -INFINITY; } __syncthreads(); @@ -229,9 +193,7 @@ void ori_par( const int octave, // All threads retrieve the yval of thread 0, the largest // of all yvals. - const float best_val = yval[best_index.x]; - const float yval_ref = 0.8f * popsift::shuffle( best_val, 0 ); - const bool valid = ( best_val >= yval_ref ); + const bool valid = ( yval[best_index.x] > 0 ); bool written = false; Extremum* ext = &dobuf.extrema[ext_ct_prefix_sum + extremum_index]; @@ -240,8 +202,10 @@ void ori_par( const int octave, if( valid ) { float chosen_bin = refined_angle[best_index.x]; if( chosen_bin >= ORI_NBINS ) chosen_bin -= ORI_NBINS; - // float th = __fdividef(M_PI2 * chosen_bin , ORI_NBINS) - M_PI; - float th = ::fmaf( M_PI2 * chosen_bin, 1.0f/ORI_NBINS, - M_PI ); + if( chosen_bin < 0 ) chosen_bin += ORI_NBINS; + float th = __fdividef(M_PI2 * chosen_bin , ORI_NBINS); // - M_PI; + th -= M_PI; + if( th < 0.0f ) th += M_PI2; ext->orientation[threadIdx.x] = th; written = true; } From 7be0e8970c02ee2e6cc5505aba50f8a2418318c3 Mon Sep 17 00:00:00 2001 From: Carsten Griwodz Date: Fri, 16 Oct 2020 12:08:08 +0200 Subject: [PATCH 05/30] [popsift] add VLFeat-like descriptor extraction --- src/CMakeLists.txt | 1 + src/popsift/s_desc_vlfeat.cu | 205 +++++++++++++++++++++++++++++++++++ src/popsift/s_desc_vlfeat.h | 18 +++ src/popsift/sift_conf.cu | 2 + src/popsift/sift_conf.h | 4 + src/popsift/sift_desc.cu | 3 + 6 files changed, 233 insertions(+) create mode 100644 src/popsift/s_desc_vlfeat.cu create mode 100644 src/popsift/s_desc_vlfeat.h diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 0380dd41..db7aeadc 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -23,6 +23,7 @@ CUDA_ADD_LIBRARY(popsift popsift/s_desc_grid.cu popsift/s_desc_grid.h popsift/s_desc_igrid.cu popsift/s_desc_igrid.h popsift/s_desc_notile.cu popsift/s_desc_notile.h + popsift/s_desc_vlfeat.cu popsift/s_desc_vlfeat.h popsift/s_desc_norm_rs.h popsift/s_desc_norm_l2.h popsift/s_desc_normalize.h diff --git a/src/popsift/s_desc_vlfeat.cu b/src/popsift/s_desc_vlfeat.cu new file mode 100644 index 00000000..9d4ab995 --- /dev/null +++ b/src/popsift/s_desc_vlfeat.cu @@ -0,0 +1,205 @@ +/* + * Copyright 2016-2017, Simula Research Laboratory + * 2018-2020, University of Oslo + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ +#include "common/assist.h" +#include "common/debug_macros.h" +#include "common/vec_macros.h" +#include "s_desc_vlfeat.h" +#include "s_gradiant.h" +#include "sift_constants.h" +#include "sift_pyramid.h" + +#include + +using namespace popsift; + +__device__ static inline +void ext_desc_vlfeat_sub( const float ang, + const Extremum* ext, + float* __restrict__ features, + cudaTextureObject_t layer_tex, + const int width, + const int height ) +{ + const float x = ext->xpos; + const float y = ext->ypos; + const int level = ext->lpos; // old_level; + const float sig = ext->sigma; + const float SBP = fabsf(DESC_MAGNIFY * sig); + + if( SBP == 0 ) { + return; + } + + float cos_t; + float sin_t; + __sincosf( ang, &sin_t, &cos_t ); + + const float csbp = cos_t * SBP; + const float ssbp = sin_t * SBP; + const float crsbp = cos_t / SBP; + const float srsbp = sin_t / SBP; + + // We have 4x4*16 bins. + // There centers have the offsets -1.5, -0.5, 0.5, 1.5 from the + // keypoint. The points that support them stretch from -2 to 2 + const float2 maxdist = make_float2( -2.0f, -2.0f ); + + // We rotate the corner of the maximum range by the keypoint orientation. + const float ptx = fabsf( ::fmaf( csbp, maxdist.x, ::fmaf( -ssbp, maxdist.y, x )) ); + const float pty = fabsf( ::fmaf( csbp, maxdist.y, ::fmaf( ssbp, maxdist.x, y ) ) ); + + const float bsz = fabsf(csbp) + fabsf(ssbp); + const int xmin = max(1, (int)floorf(x - ptx - bsz)); + const int ymin = max(1, (int)floorf(y - pty - bsz)); + const int xmax = min(width - 2, (int)floorf(x + ptx + bsz)); + const int ymax = min(height - 2, (int)floorf(y + pty + bsz)); + + __shared__ float dpt[4][128]; + for( int i=threadIdx.x; i<128; i+=blockDim.x ) + { + dpt[0][i] = 0.0f; + dpt[1][i] = 0.0f; + dpt[2][i] = 0.0f; + dpt[3][i] = 0.0f; + } + __syncthreads(); + + // we have 32 threads in a warp, how to use them? + const int tx = ( ( threadIdx.x >> 0 ) & 0x1 ); // 0 - 1 + const int ty = ( ( threadIdx.x >> 1 ) & 0x1 ); // 0 - 1 + const int tt = ( ( threadIdx.x >> 2 ) & 0x1 ); // 0 - 1 + const int loop = threadIdx.x >> 3; // 0 - 3 + + for( int pix_y = ymin+loop; popsift::any(pix_y <= ymax); pix_y += 4 ) + { + for( int pix_x = xmin; pix_x <= xmax; pix_x++ ) + { + if( pix_y <= ymax ) + { + // d : distance from keypoint + const float2 d = make_float2( pix_x - x, pix_y - y ); + + // n : normalized distance from keypoint + const float2 n = make_float2( ::fmaf( crsbp, d.x, srsbp * d.y ), + ::fmaf( crsbp, d.y, -srsbp * d.x ) ); + + float mod; + float th; + + get_gradiant( mod, th, pix_x, pix_y, layer_tex, level ); + + mod /= 2; // Our mod is double that of vlfeat. Huh. + + const float ww = __expf( -scalbnf(n.x*n.x + n.y*n.y, -3)); + + th -= ang; + while( th > M_PI2 ) th -= M_PI2; + while( th < 0.0f ) th += M_PI2; + + const float nt = 8.0f * th / M_PI2; + + // neighbouring tile on the lower side: -2, -1, 0 or 1 + // (must use floorf because casting rounds towards zero + const int3 t0 = make_int3( (int)floorf(n.x - 0.5f), + (int)floorf(n.y - 0.5f), + (int)nt ); + float wgt_x = - ( n.x - ( t0.x + 0.5f ) ); + float wgt_y = - ( n.y - ( t0.y + 0.5f ) ); + float wgt_t = - ( nt - t0.z ); + + if( ( t0.y + ty >= -2 ) && + ( t0.y + ty < 2 ) && + ( t0.x + tx >= -2 ) && + ( t0.x + tx < 2 ) ) + { + wgt_x = ( tx == 0 ) ? 1.0f + wgt_x : wgt_x; + wgt_y = ( ty == 0 ) ? 1.0f + wgt_y : wgt_y; + wgt_t = ( tt == 0 ) ? 1.0f + wgt_t : wgt_t; + + wgt_x = fabsf( wgt_x ); + wgt_y = fabsf( wgt_y ); + wgt_t = fabsf( wgt_t ); + + const float val = ww + * mod + * wgt_x + * wgt_y + * wgt_t; + + const int offset = 80 + + ( t0.y + ty ) * 32 + + ( t0.x + tx ) * 8 + + ( t0.z + tt ) % 8; + + dpt[loop][offset] += val; + } + } + __syncthreads(); + } + } + + for( int i=threadIdx.x; i<128; i+=blockDim.x ) + { + float f = dpt[0][i] + dpt[1][i] + dpt[2][i] + dpt[3][i]; + features[i] = f; + } +} + +__global__ void ext_desc_vlfeat(int octave, cudaTextureObject_t layer_tex, int w, int h) +{ + const int o_offset = dct.ori_ps[octave] + blockIdx.x; + Descriptor* desc = &dbuf.desc [o_offset]; + const int ext_idx = dobuf.feat_to_ext_map[o_offset]; + Extremum* ext = dobuf.extrema + ext_idx; + + const int ext_base = ext->idx_ori; + const int ori_num = o_offset - ext_base; + const float ang = ext->orientation[ori_num]; + + ext_desc_vlfeat_sub( ang, + ext, + desc->features, + layer_tex, + w, + h ); +} + +namespace popsift +{ + +bool start_ext_desc_vlfeat( const int octave, Octave& oct_obj ) +{ + dim3 block; + dim3 grid; + grid.x = hct.ori_ct[octave]; + grid.y = 1; + grid.z = 1; + + if( grid.x == 0 ) return false; + + block.x = 32; + block.y = 1; + block.z = 1; + + size_t shared_size = 4 * 128 * sizeof(float); + + ext_desc_vlfeat + <<>> + ( octave, + oct_obj.getDataTexPoint( ), + oct_obj.getWidth(), + oct_obj.getHeight() ); + + POP_SYNC_CHK; + + return true; +} + +}; // namespace popsift + diff --git a/src/popsift/s_desc_vlfeat.h b/src/popsift/s_desc_vlfeat.h new file mode 100644 index 00000000..4370bc7b --- /dev/null +++ b/src/popsift/s_desc_vlfeat.h @@ -0,0 +1,18 @@ +/* + * Copyright 2016-2017, Simula Research Laboratory + * 2018-2020, University of Oslo + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ +#pragma once +#include "sift_octave.h" + +namespace popsift +{ + +bool start_ext_desc_vlfeat( const int octave, Octave& oct_obj ); + +}; // namespace popsift + diff --git a/src/popsift/sift_conf.cu b/src/popsift/sift_conf.cu index db2bd42a..aedbd458 100644 --- a/src/popsift/sift_conf.cu +++ b/src/popsift/sift_conf.cu @@ -80,6 +80,8 @@ void Config::setDescMode( const std::string& text ) setDescMode( Config::IGrid ); else if( text == "notile" ) setDescMode( Config::NoTile ); + else if( text == "vlfeat" ) + setDescMode( Config::VLFeat_Desc ); else POP_FATAL( "specified descriptor extraction mode must be one of loop, grid or igrid" ); } diff --git a/src/popsift/sift_conf.h b/src/popsift/sift_conf.h index 0d2ad330..9f78d12b 100644 --- a/src/popsift/sift_conf.h +++ b/src/popsift/sift_conf.h @@ -95,6 +95,10 @@ struct Config IGrid, /// loop-compatible; variant of IGrid, no duplicate gradient fetching NoTile, + /** extraction code according to VLFeat, similar to loop, weight goes into + * up to 8 histogram bins + */ + VLFeat_Desc }; /** diff --git a/src/popsift/sift_desc.cu b/src/popsift/sift_desc.cu index 2b82b771..f12f85fc 100644 --- a/src/popsift/sift_desc.cu +++ b/src/popsift/sift_desc.cu @@ -13,6 +13,7 @@ #include "s_desc_loop.h" #include "s_desc_normalize.h" #include "s_desc_notile.h" +#include "s_desc_vlfeat.h" #include "s_gradiant.h" #include "sift_config.h" #include "sift_constants.h" @@ -77,6 +78,8 @@ void Pyramid::descriptors( const Config& conf ) start_ext_desc_igrid( octave, oct_obj ); } else if( conf.getDescMode() == Config::NoTile ) { start_ext_desc_notile( octave, oct_obj ); + } else if( conf.getDescMode() == Config::VLFeat_Desc ) { + start_ext_desc_vlfeat( octave, oct_obj ); } else { POP_FATAL( "not yet" ); } From 5a295bbc37f63661a340b3704d81e8840ec56214 Mon Sep 17 00:00:00 2001 From: Carsten Griwodz Date: Sat, 6 Mar 2021 23:21:05 +0100 Subject: [PATCH 06/30] [popsift] vlfeat descriptor area corrected --- src/popsift/s_desc_vlfeat.cu | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/popsift/s_desc_vlfeat.cu b/src/popsift/s_desc_vlfeat.cu index 9d4ab995..654cd141 100644 --- a/src/popsift/s_desc_vlfeat.cu +++ b/src/popsift/s_desc_vlfeat.cu @@ -51,15 +51,17 @@ void ext_desc_vlfeat_sub( const float ang, const float2 maxdist = make_float2( -2.0f, -2.0f ); // We rotate the corner of the maximum range by the keypoint orientation. - const float ptx = fabsf( ::fmaf( csbp, maxdist.x, ::fmaf( -ssbp, maxdist.y, x )) ); - const float pty = fabsf( ::fmaf( csbp, maxdist.y, ::fmaf( ssbp, maxdist.x, y ) ) ); + // const float ptx = csbp * maxdist - ssbp * maxdist; + // const float pty = csbp * maxdist + ssbp * maxdist; + const float ptx = fabsf( ::fmaf( csbp, maxdist.x, -ssbp * maxdist.y ) ); + const float pty = fabsf( ::fmaf( csbp, maxdist.y, ssbp * maxdist.x ) ); + + const float bsz = 2.0f * ( fabsf(csbp) + fabsf(ssbp) ); - const float bsz = fabsf(csbp) + fabsf(ssbp); const int xmin = max(1, (int)floorf(x - ptx - bsz)); const int ymin = max(1, (int)floorf(y - pty - bsz)); const int xmax = min(width - 2, (int)floorf(x + ptx + bsz)); const int ymax = min(height - 2, (int)floorf(y + pty + bsz)); - __shared__ float dpt[4][128]; for( int i=threadIdx.x; i<128; i+=blockDim.x ) { From f815fbb544444ca36950d1db878f4f1c66846226 Mon Sep 17 00:00:00 2001 From: Carsten Griwodz Date: Sun, 7 Mar 2021 22:59:09 +0100 Subject: [PATCH 07/30] [popsift] speedup vlfeat descriptor --- src/popsift/s_desc_vlfeat.cu | 74 ++++++++++++++++++++---------------- 1 file changed, 41 insertions(+), 33 deletions(-) diff --git a/src/popsift/s_desc_vlfeat.cu b/src/popsift/s_desc_vlfeat.cu index 654cd141..375fb7eb 100644 --- a/src/popsift/s_desc_vlfeat.cu +++ b/src/popsift/s_desc_vlfeat.cu @@ -73,16 +73,15 @@ void ext_desc_vlfeat_sub( const float ang, __syncthreads(); // we have 32 threads in a warp, how to use them? - const int tx = ( ( threadIdx.x >> 0 ) & 0x1 ); // 0 - 1 - const int ty = ( ( threadIdx.x >> 1 ) & 0x1 ); // 0 - 1 - const int tt = ( ( threadIdx.x >> 2 ) & 0x1 ); // 0 - 1 const int loop = threadIdx.x >> 3; // 0 - 3 + const int xstart = ( threadIdx.x & 0x7 ); + const int xstep = 8; for( int pix_y = ymin+loop; popsift::any(pix_y <= ymax); pix_y += 4 ) { - for( int pix_x = xmin; pix_x <= xmax; pix_x++ ) + for( int pix_x = xmin+xstart; popsift::any(pix_x <= xmax); pix_x+=xstep ) { - if( pix_y <= ymax ) + if( ( pix_y <= ymax ) && ( pix_x <= xmax ) ) { // d : distance from keypoint const float2 d = make_float2( pix_x - x, pix_y - y ); @@ -111,35 +110,44 @@ void ext_desc_vlfeat_sub( const float ang, const int3 t0 = make_int3( (int)floorf(n.x - 0.5f), (int)floorf(n.y - 0.5f), (int)nt ); - float wgt_x = - ( n.x - ( t0.x + 0.5f ) ); - float wgt_y = - ( n.y - ( t0.y + 0.5f ) ); - float wgt_t = - ( nt - t0.z ); - - if( ( t0.y + ty >= -2 ) && - ( t0.y + ty < 2 ) && - ( t0.x + tx >= -2 ) && - ( t0.x + tx < 2 ) ) + const float wgt_x = - ( n.x - ( t0.x + 0.5f ) ); + const float wgt_y = - ( n.y - ( t0.y + 0.5f ) ); + const float wgt_t = - ( nt - t0.z ); + + for( int tx=0; tx<2; tx++ ) { - wgt_x = ( tx == 0 ) ? 1.0f + wgt_x : wgt_x; - wgt_y = ( ty == 0 ) ? 1.0f + wgt_y : wgt_y; - wgt_t = ( tt == 0 ) ? 1.0f + wgt_t : wgt_t; - - wgt_x = fabsf( wgt_x ); - wgt_y = fabsf( wgt_y ); - wgt_t = fabsf( wgt_t ); - - const float val = ww - * mod - * wgt_x - * wgt_y - * wgt_t; - - const int offset = 80 - + ( t0.y + ty ) * 32 - + ( t0.x + tx ) * 8 - + ( t0.z + tt ) % 8; - - dpt[loop][offset] += val; + for( int ty=0; ty<2; ty++ ) + { + for( int tt=0; tt<2; tt++ ) + { + if( ( t0.y + ty >= -2 ) && + ( t0.y + ty < 2 ) && + ( t0.x + tx >= -2 ) && + ( t0.x + tx < 2 ) ) + { + float i_wgt_x = ( tx == 0 ) ? 1.0f + wgt_x : wgt_x; + float i_wgt_y = ( ty == 0 ) ? 1.0f + wgt_y : wgt_y; + float i_wgt_t = ( tt == 0 ) ? 1.0f + wgt_t : wgt_t; + + i_wgt_x = fabsf( i_wgt_x ); + i_wgt_y = fabsf( i_wgt_y ); + i_wgt_t = fabsf( i_wgt_t ); + + const float val = ww + * mod + * i_wgt_x + * i_wgt_y + * i_wgt_t; + + const int offset = 80 + + ( t0.y + ty ) * 32 + + ( t0.x + tx ) * 8 + + ( t0.z + tt ) % 8; + + atomicAdd( &dpt[loop][offset], val ); + } + } + } } } __syncthreads(); From bb901d7091b43b36db090dc52300291df5c5d83e Mon Sep 17 00:00:00 2001 From: Carsten Griwodz Date: Sun, 14 Feb 2021 11:39:30 +0100 Subject: [PATCH 08/30] [popsift] configuration to ignore edge and peak thresholds --- src/application/main.cpp | 6 +++--- src/popsift/s_extrema.cu | 17 ++++++++++++----- src/popsift/sift_conf.cu | 39 ++++++++++++++++++++++++++++++++------- src/popsift/sift_conf.h | 22 ++++++++++++++++------ 4 files changed, 63 insertions(+), 21 deletions(-) diff --git a/src/application/main.cpp b/src/application/main.cpp index 689c5a01..e20c376f 100755 --- a/src/application/main.cpp +++ b/src/application/main.cpp @@ -74,13 +74,13 @@ static void parseargs(int argc, char** argv, popsift::Config& config, string& in "Initial sigma value") ("threshold", value()->notifier([&](float f) { config.setThreshold(f); })->default_value(config.getThreshold()), - "Contrast threshold") + popsift::Config::getPeakThreshUsage().c_str() ) ("edge-threshold", value()->notifier([&](float f) { config.setEdgeLimit(f); })->default_value(config.getEdgeLimit()), - "On-edge threshold") + popsift::Config::getEdgeThreshUsage().c_str() ) ("edge-limit", value()->notifier([&](float f) { config.setEdgeLimit(f); }), - "On-edge threshold") + "synonym to --edge-threshold" ) ("downsampling", value()->notifier([&](float f) { config.setDownsampling(f); })->default_value(config.getDownsampling()), "Downscale width and height of input by 2^N") diff --git a/src/popsift/s_extrema.cu b/src/popsift/s_extrema.cu index 5c1acc44..27d8c15b 100644 --- a/src/popsift/s_extrema.cu +++ b/src/popsift/s_extrema.cu @@ -482,14 +482,21 @@ __device__ inline bool find_extrema_in_dog_sub(cudaTextureObject_t dog, /* accept-reject extremum */ // if( fabsf(contr) < (d_consts.threshold*2.0f) ) - if( fabsf(contr) < scalbnf( d_consts.threshold, 1 ) ) + if( d_consts.threshold > 0.0f ) { - return false; + if( fabsf(contr) < scalbnf( d_consts.threshold, 1 ) ) + { + return false; + } } - /* reject condition: tr(H)^2/det(H) < (r+1)^2/r */ - if( edgeval >= (d_consts.edge_limit+1.0f)*(d_consts.edge_limit+1.0f)/d_consts.edge_limit ) { - return false; + if( d_consts.edge_limit > 0.0f ) + { + /* reject condition: tr(H)^2/det(H) < (r+1)^2/r */ + if( edgeval >= (d_consts.edge_limit+1.0f)*(d_consts.edge_limit+1.0f)/d_consts.edge_limit ) + { + return false; + } } ec.xpos = xn; diff --git a/src/popsift/sift_conf.cu b/src/popsift/sift_conf.cu index aedbd458..b96901f7 100644 --- a/src/popsift/sift_conf.cu +++ b/src/popsift/sift_conf.cu @@ -9,6 +9,7 @@ #include "sift_conf.h" #include +#include #include using namespace std; @@ -28,8 +29,8 @@ Config::Config( ) , octaves( -1 ) , levels( 3 ) , sigma( 1.6f ) - , _edge_limit( 10.0f ) - , _threshold( 0.04 ) // ( 10.0f / 256.0f ) + , _edge_limit( getEdgeThreshDefault( ) ) + , _threshold( getPeakThreshDefault() ) , _gauss_mode( getGaussModeDefault() ) , _sift_mode( Config::PopSift ) , _log_mode( Config::None ) @@ -285,9 +286,38 @@ float Config::getSigma( ) const { return sigma; } void Config::setEdgeLimit( float v ) { _edge_limit = v; } float Config::getEdgeLimit( ) const { return _edge_limit; } +float Config::getEdgeThreshDefault( ) +{ + return 10.0f; +} +std::string Config::getEdgeThreshUsage( ) +{ + std::ostringstream ostr; + ostr << "Edge Threshold: eliminates peaks of the DoG scale space whose curvature is too small." << endl + << "Default: " << getEdgeThreshDefault() << endl + << "Set to a value <= 0 to disable." << endl; + return ostr.str(); +} void Config::setThreshold( float v ) { _threshold = v; } float Config::getThreshold( ) const { return _threshold; } +float Config::getPeakThreshold() const +{ + return ( _threshold * 0.5f * 255.0f / levels ); +} +float Config::getPeakThreshDefault( ) +{ + return 0.04f; // ( 10.0f / 256.0f ) +} +std::string Config::getPeakThreshUsage( ) +{ + std::ostringstream ostr; + ostr << "Peak Threshold: eliminates peaks of the DoG scale space that are too small" < Date: Fri, 19 Feb 2021 10:18:28 +0100 Subject: [PATCH 09/30] [popsift] use ManagedMemory for ExtremaCounters and buffers --- src/popsift/common/debug_macros.cu | 17 ++++ src/popsift/common/debug_macros.h | 10 ++ src/popsift/s_desc_grid.cu | 45 +++++++- src/popsift/s_desc_grid.h | 31 +----- src/popsift/s_desc_igrid.cu | 50 +++++++-- src/popsift/s_desc_igrid.h | 33 +----- src/popsift/s_desc_iloop.cu | 42 +++++++- src/popsift/s_desc_iloop.h | 28 +---- src/popsift/s_desc_loop.cu | 50 ++++++++- src/popsift/s_desc_loop.h | 36 +------ src/popsift/s_desc_normalize.h | 5 +- src/popsift/s_desc_notile.cu | 24 +++-- src/popsift/s_desc_notile.h | 2 +- src/popsift/s_desc_vlfeat.cu | 17 ++-- src/popsift/s_desc_vlfeat.h | 3 +- src/popsift/s_extrema.cu | 25 +++-- src/popsift/s_filtergrid.cu | 43 ++++++-- src/popsift/s_orientation.cu | 65 ++++++------ src/popsift/sift_desc.cu | 28 +++-- src/popsift/sift_pyramid.cu | 158 +++++++++++++---------------- src/popsift/sift_pyramid.h | 20 ++-- 21 files changed, 404 insertions(+), 328 deletions(-) diff --git a/src/popsift/common/debug_macros.cu b/src/popsift/common/debug_macros.cu index cf4cd735..69acaa5a 100755 --- a/src/popsift/common/debug_macros.cu +++ b/src/popsift/common/debug_macros.cu @@ -62,6 +62,23 @@ void malloc_hst( void** ptr, int sz, } } } +namespace popsift { namespace cuda { +void malloc_mgd( void** ptr, int sz, + const char* file, int line ) +{ + cudaError_t err; + err = cudaMallocManaged( ptr, sz ); + if( err != cudaSuccess ) { + std::cerr << file << ":" << line << std::endl + << " cudaMallocManaged failed: " << cudaGetErrorString(err) << std::endl; + exit( -__LINE__ ); + } +#ifdef DEBUG_INIT_DEVICE_ALLOCATIONS + memset( *ptr, 0, sz ); +#endif // NDEBUG +} +} } + namespace popsift { namespace cuda { void memcpy_async( void* dst, const void* src, size_t sz, cudaMemcpyKind type, cudaStream_t stream, diff --git a/src/popsift/common/debug_macros.h b/src/popsift/common/debug_macros.h index dada7d3b..72eda129 100755 --- a/src/popsift/common/debug_macros.h +++ b/src/popsift/common/debug_macros.h @@ -37,6 +37,9 @@ void malloc_dev( void** ptr, int sz, void malloc_hst( void** ptr, int sz, const char* file, int line ); +void malloc_mgd( void** ptr, int sz, + const char* file, int line ); + template T* malloc_devT( int num, const char* file, int line ) { void* ptr; @@ -51,6 +54,13 @@ T* malloc_hstT( int num, const char* file, int line ) { return (T*)ptr; } +template +T* malloc_mgdT( int num, const char* file, int line ) { + void* ptr; + malloc_mgd( &ptr, num*sizeof(T), file, line ); + return (T*)ptr; +} + void memcpy_sync( void* dst, const void* src, size_t sz, cudaMemcpyKind type, const char* file, size_t line ); diff --git a/src/popsift/s_desc_grid.cu b/src/popsift/s_desc_grid.cu index 099c4709..8ecea81a 100644 --- a/src/popsift/s_desc_grid.cu +++ b/src/popsift/s_desc_grid.cu @@ -121,15 +121,20 @@ void ext_desc_grid_sub( const int ix, } } -__global__ void ext_desc_grid(int octave, cudaTextureObject_t layer_tex) +/* + * We assume that this is started with + * block = 16,4,4 + * grid = nunmber of orientations + */ +__global__ void ext_desc_grid( ExtremaBuffers* buf, int octave, int ori_base_index, cudaTextureObject_t layer_tex) { - const int o_offset = dct.ori_ps[octave] + blockIdx.x; + const int o_offset = ori_base_index + blockIdx.x; const int ix = threadIdx.y; const int iy = threadIdx.z; - Descriptor* desc = &dbuf.desc [o_offset]; - const int ext_idx = dobuf.feat_to_ext_map[o_offset]; - Extremum* ext = dobuf.extrema + ext_idx; + Descriptor* desc = &buf->desc[o_offset]; + const int ext_idx = buf->feat_to_ext_map[o_offset]; + Extremum* ext = buf->extrema + ext_idx; const int ext_base = ext->idx_ori; const int ori_num = o_offset - ext_base; @@ -143,3 +148,33 @@ __global__ void ext_desc_grid(int octave, cudaTextureObject_t layer_tex) layer_tex ); } +namespace popsift +{ + +bool start_ext_desc_grid( const ExtremaCounters* ct, ExtremaBuffers* buf, const int octave, Octave& oct_obj ) +{ + dim3 block; + dim3 grid; + grid.x = ct->ori_ct[octave]; + grid.y = 1; + grid.z = 1; + + if( grid.x == 0 ) return false; + + block.x = 16; + block.y = 4; + block.z = 4; + + ext_desc_grid + <<>> + ( buf, + octave, + ct->ori_ps[octave], + oct_obj.getDataTexPoint( ) ); + + POP_SYNC_CHK; + + return true; +} + +}; // namespace popsift diff --git a/src/popsift/s_desc_grid.h b/src/popsift/s_desc_grid.h index c0919806..704ee2ad 100644 --- a/src/popsift/s_desc_grid.h +++ b/src/popsift/s_desc_grid.h @@ -12,39 +12,10 @@ #include "sift_octave.h" #include "sift_pyramid.h" -/* - * We assume that this is started with - * block = 16,4,4 - * grid = nunmber of orientations - */ -__global__ void ext_desc_grid(int octave, cudaTextureObject_t layer_tex); - namespace popsift { -inline static bool start_ext_desc_grid( const int octave, Octave& oct_obj ) -{ - dim3 block; - dim3 grid; - grid.x = hct.ori_ct[octave]; - grid.y = 1; - grid.z = 1; - - if( grid.x == 0 ) return false; - - block.x = 16; - block.y = 4; - block.z = 4; - - ext_desc_grid - <<>> - ( octave, - oct_obj.getDataTexPoint( ) ); - - POP_SYNC_CHK; - - return true; -} +bool start_ext_desc_grid( const ExtremaCounters* ct, ExtremaBuffers* buf, const int octave, Octave& oct_obj ); }; // namespace popsift diff --git a/src/popsift/s_desc_igrid.cu b/src/popsift/s_desc_igrid.cu index 9f77f12f..2d8119fb 100644 --- a/src/popsift/s_desc_igrid.cu +++ b/src/popsift/s_desc_igrid.cu @@ -13,6 +13,8 @@ #include +#define IGRID_NUMLINES 1 + using namespace popsift; __device__ static inline @@ -74,17 +76,23 @@ void ext_desc_igrid_sub( const float x, const float y, const int level, } } -__global__ void ext_desc_igrid(int octave, cudaTextureObject_t texLinear) +/* + * We assume that this is started with + * block = 16,4,4 or with 32,4,4, depending on macros + * grid = nunmber of orientations + */ +__global__ void ext_desc_igrid( ExtremaBuffers* buf, const int ori_base_index, + cudaTextureObject_t texLinear) { - const int num = dct.ori_ct[octave]; + const int num = gridDim.x; const int offset = blockIdx.x * blockDim.z + threadIdx.z; - const int o_offset = dct.ori_ps[octave] + offset; + const int o_offset = ori_base_index + offset; if( offset >= num ) return; - Descriptor* desc = &dbuf.desc [o_offset]; - const int ext_idx = dobuf.feat_to_ext_map[o_offset]; - Extremum* ext = dobuf.extrema + ext_idx; + Descriptor* desc = &buf->desc[o_offset]; + const int ext_idx = buf->feat_to_ext_map[o_offset]; + Extremum* ext = buf->extrema + ext_idx; if( ext->sigma == 0 ) return; const float SBP = fabsf( DESC_MAGNIFY * ext->sigma ); @@ -104,3 +112,33 @@ __global__ void ext_desc_igrid(int octave, cudaTextureObject_t texLinear) texLinear ); } +namespace popsift +{ + +bool start_ext_desc_igrid( const ExtremaCounters* ct, ExtremaBuffers* buf, const int octave, Octave& oct_obj ) +{ + dim3 block; + dim3 grid; + grid.x = ct->ori_ct[octave]; + grid.y = 1; + grid.z = 1; + + if( grid.x == 0 ) return false; + + block.x = 16; + block.y = 16; + block.z = IGRID_NUMLINES; + + ext_desc_igrid + <<>> + ( buf, + ct->ori_ps[octave], + oct_obj.getDataTexLinear( ).tex ); + + POP_SYNC_CHK; + + return true; +} + +}; // namespace popsift + diff --git a/src/popsift/s_desc_igrid.h b/src/popsift/s_desc_igrid.h index 8980a4bc..963c8daa 100644 --- a/src/popsift/s_desc_igrid.h +++ b/src/popsift/s_desc_igrid.h @@ -11,40 +11,9 @@ #include "sift_octave.h" #include "sift_pyramid.h" -/* - * We assume that this is started with - * block = 16,4,4 or with 32,4,4, depending on macros - * grid = nunmber of orientations - */ -__global__ void ext_desc_igrid(int octave, cudaTextureObject_t texLinear); - namespace popsift { -#define IGRID_NUMLINES 1 - -inline static bool start_ext_desc_igrid( const int octave, Octave& oct_obj ) -{ - dim3 block; - dim3 grid; - grid.x = hct.ori_ct[octave]; - grid.y = 1; - grid.z = 1; - - if( grid.x == 0 ) return false; - - block.x = 16; - block.y = 16; - block.z = IGRID_NUMLINES; - - ext_desc_igrid - <<>> - ( octave, - oct_obj.getDataTexLinear( ).tex ); - - POP_SYNC_CHK; - - return true; -} +bool start_ext_desc_igrid( const ExtremaCounters* ct, ExtremaBuffers* buf, const int octave, Octave& oct_obj ); }; // namespace popsift diff --git a/src/popsift/s_desc_iloop.cu b/src/popsift/s_desc_iloop.cu index 84673a20..a7e67252 100644 --- a/src/popsift/s_desc_iloop.cu +++ b/src/popsift/s_desc_iloop.cu @@ -128,12 +128,12 @@ void ext_desc_iloop_sub( const float ang, } } -__global__ void ext_desc_iloop(int octave, cudaTextureObject_t layer_tex, int w, int h) +__global__ void ext_desc_iloop( ExtremaBuffers* buf, int octave, const int ori_base_index, cudaTextureObject_t layer_tex, int w, int h) { - const int o_offset = dct.ori_ps[octave] + blockIdx.x; - Descriptor* desc = &dbuf.desc [o_offset]; - const int ext_idx = dobuf.feat_to_ext_map[o_offset]; - Extremum* ext = dobuf.extrema + ext_idx; + const int o_offset = ori_base_index + blockIdx.x; + Descriptor* desc = &buf->desc[o_offset]; + const int ext_idx = buf->feat_to_ext_map[o_offset]; + Extremum* ext = buf->extrema + ext_idx; const int ext_base = ext->idx_ori; const int ori_num = o_offset - ext_base; @@ -147,3 +147,35 @@ __global__ void ext_desc_iloop(int octave, cudaTextureObject_t layer_tex, int w, h ); } +namespace popsift +{ + +bool start_ext_desc_iloop( const ExtremaCounters* ct, ExtremaBuffers* buf, const int octave, Octave& oct_obj ) +{ + dim3 block; + dim3 grid; + grid.x = ct->ori_ct[octave]; + grid.y = 1; + grid.z = 1; + + if( grid.x == 0 ) return false; + + block.x = 32; + block.y = 1; + block.z = 16; + + ext_desc_iloop + <<>> + ( buf, + octave, + ct->ori_ps[octave], + oct_obj.getDataTexLinear( ).tex, + oct_obj.getWidth(), + oct_obj.getHeight() ); + + POP_SYNC_CHK; + + return true; +} + +}; // namespace popsift diff --git a/src/popsift/s_desc_iloop.h b/src/popsift/s_desc_iloop.h index e69409b6..ac86b9e1 100644 --- a/src/popsift/s_desc_iloop.h +++ b/src/popsift/s_desc_iloop.h @@ -12,36 +12,10 @@ #include "sift_octave.h" #include "sift_pyramid.h" -__global__ void ext_desc_iloop(int octave, cudaTextureObject_t layer_tex, int width, int height); - namespace popsift { -inline static bool start_ext_desc_iloop( const int octave, Octave& oct_obj ) -{ - dim3 block; - dim3 grid; - grid.x = hct.ori_ct[octave]; - grid.y = 1; - grid.z = 1; - - if( grid.x == 0 ) return false; - - block.x = 32; - block.y = 1; - block.z = 16; - - ext_desc_iloop - <<>> - ( octave, - oct_obj.getDataTexLinear( ).tex, - oct_obj.getWidth(), - oct_obj.getHeight() ); - - POP_SYNC_CHK; - - return true; -} +bool start_ext_desc_iloop( const ExtremaCounters* ct, ExtremaBuffers* buf, const int octave, Octave& oct_obj ); }; // namespace popsift diff --git a/src/popsift/s_desc_loop.cu b/src/popsift/s_desc_loop.cu index 4c5f46c2..f57cdabe 100644 --- a/src/popsift/s_desc_loop.cu +++ b/src/popsift/s_desc_loop.cu @@ -13,6 +13,8 @@ #include +#undef BLOCK_3_DIMS + using namespace popsift; __device__ static inline @@ -138,12 +140,12 @@ void ext_desc_loop_sub( const float ang, } } -__global__ void ext_desc_loop(int octave, cudaTextureObject_t layer_tex, int w, int h) +__global__ void ext_desc_loop( ExtremaBuffers* buf, int octave, int ori_base_index, cudaTextureObject_t layer_tex, int w, int h) { - const int o_offset = dct.ori_ps[octave] + blockIdx.x; - Descriptor* desc = &dbuf.desc [o_offset]; - const int ext_idx = dobuf.feat_to_ext_map[o_offset]; - Extremum* ext = dobuf.extrema + ext_idx; + const int o_offset = ori_base_index + blockIdx.x; + Descriptor* desc = &buf->desc[o_offset]; + const int ext_idx = buf->feat_to_ext_map[o_offset]; + Extremum* ext = buf->extrema + ext_idx; const int ext_base = ext->idx_ori; const int ori_num = o_offset - ext_base; @@ -157,3 +159,41 @@ __global__ void ext_desc_loop(int octave, cudaTextureObject_t layer_tex, int w, h ); } +namespace popsift +{ + +bool start_ext_desc_loop( const ExtremaCounters* ct, ExtremaBuffers* buf, const int octave, Octave& oct_obj ) +{ + dim3 block; + dim3 grid; + grid.x = ct->ori_ct[octave]; + grid.y = 1; + grid.z = 1; + + if( grid.x == 0 ) return false; + +#ifndef BLOCK_3_DIMS + block.x = 32; + block.y = 4; + block.z = 4; +#else + block.x = 32; + block.y = 1; + block.z = 16; +#endif + + ext_desc_loop + <<>> + ( buf, + octave, + ct->ori_ps[octave], + oct_obj.getDataTexPoint( ), + oct_obj.getWidth(), + oct_obj.getHeight() ); + + POP_SYNC_CHK; + + return true; +} + +}; // namespace popsift diff --git a/src/popsift/s_desc_loop.h b/src/popsift/s_desc_loop.h index 600db498..3c562b45 100644 --- a/src/popsift/s_desc_loop.h +++ b/src/popsift/s_desc_loop.h @@ -12,44 +12,10 @@ #include "sift_octave.h" #include "sift_pyramid.h" -#undef BLOCK_3_DIMS - -__global__ void ext_desc_loop(int octave, cudaTextureObject_t layer_tex, int width, int height); - namespace popsift { -inline static bool start_ext_desc_loop( const int octave, Octave& oct_obj ) -{ - dim3 block; - dim3 grid; - grid.x = hct.ori_ct[octave]; - grid.y = 1; - grid.z = 1; - - if( grid.x == 0 ) return false; - -#ifndef BLOCK_3_DIMS - block.x = 32; - block.y = 4; - block.z = 4; -#else - block.x = 32; - block.y = 1; - block.z = 16; -#endif - - ext_desc_loop - <<>> - ( octave, - oct_obj.getDataTexPoint( ), - oct_obj.getWidth(), - oct_obj.getHeight() ); - - POP_SYNC_CHK; - - return true; -} +bool start_ext_desc_loop( const ExtremaCounters* ct, ExtremaBuffers* buf, const int octave, Octave& oct_obj ); }; // namespace popsift diff --git a/src/popsift/s_desc_normalize.h b/src/popsift/s_desc_normalize.h index da711026..5e18b7a4 100644 --- a/src/popsift/s_desc_normalize.h +++ b/src/popsift/s_desc_normalize.h @@ -13,11 +13,8 @@ template __global__ -void normalize_histogram( ) +void normalize_histogram( Descriptor* descs, const int num_orientations ) { - Descriptor* descs = dbuf.desc; - const int num_orientations = dct.ori_total; - int offset = blockIdx.x * blockDim.y + threadIdx.y; // all of these threads are useless diff --git a/src/popsift/s_desc_notile.cu b/src/popsift/s_desc_notile.cu index a336898b..861d11aa 100644 --- a/src/popsift/s_desc_notile.cu +++ b/src/popsift/s_desc_notile.cu @@ -95,19 +95,19 @@ __global__ // __launch_bounds__(192) // 56/threads // no -- __launch_bounds__(128) // 63/thread // no -- no launch bound // 64/thread/thread -void ext_desc_notile( const int octave, +void ext_desc_notile( ExtremaBuffers* buf, + const int ori_count, + const int ori_base_index, cudaTextureObject_t texLinear ) { - const int num = dct.ori_ct[octave]; - const int offset = blockIdx.x * BLOCK_Z_NOTILE + threadIdx.z; - const int o_offset = dct.ori_ps[octave] + offset; - if( offset >= num ) return; + const int o_offset = ori_base_index + offset; + if( offset >= ori_count ) return; - Descriptor* desc = &dbuf.desc [o_offset]; - const int ext_idx = dobuf.feat_to_ext_map[o_offset]; - Extremum* ext = dobuf.extrema + ext_idx; + Descriptor* desc = &buf->desc[o_offset]; + const int ext_idx = buf->feat_to_ext_map[o_offset]; + Extremum* ext = buf->extrema + ext_idx; if( ext->sigma == 0 ) return; const float SBP = fabsf( DESC_MAGNIFY * ext->sigma ); @@ -130,7 +130,7 @@ void ext_desc_notile( const int octave, namespace popsift { -bool start_ext_desc_notile( int octave, Octave& oct_obj ) +bool start_ext_desc_notile( const ExtremaCounters* ct, ExtremaBuffers* buf, int octave, Octave& oct_obj ) { dim3 block; dim3 grid; @@ -139,7 +139,7 @@ bool start_ext_desc_notile( int octave, Octave& oct_obj ) block.y = 4; block.z = BLOCK_Z_NOTILE; - grid.x = grid_divide( hct.ori_ct[octave], block.z ); + grid.x = grid_divide( ct->ori_ct[octave], block.z ); grid.y = 1; grid.z = 1; @@ -147,7 +147,9 @@ bool start_ext_desc_notile( int octave, Octave& oct_obj ) ext_desc_notile <<>> - ( octave, + ( buf, + ct->ori_ct[octave], + ct->ori_ps[octave], oct_obj.getDataTexLinear( ).tex ); cudaDeviceSynchronize(); cudaError_t err = cudaGetLastError( ); diff --git a/src/popsift/s_desc_notile.h b/src/popsift/s_desc_notile.h index 93a91cc3..2474f63c 100644 --- a/src/popsift/s_desc_notile.h +++ b/src/popsift/s_desc_notile.h @@ -14,6 +14,6 @@ namespace popsift { -bool start_ext_desc_notile( int octave, Octave& oct_obj ); +bool start_ext_desc_notile( const ExtremaCounters* ct, ExtremaBuffers* buf, int octave, Octave& oct_obj ); }; // namespace popsift diff --git a/src/popsift/s_desc_vlfeat.cu b/src/popsift/s_desc_vlfeat.cu index 375fb7eb..6aff7e48 100644 --- a/src/popsift/s_desc_vlfeat.cu +++ b/src/popsift/s_desc_vlfeat.cu @@ -161,12 +161,12 @@ void ext_desc_vlfeat_sub( const float ang, } } -__global__ void ext_desc_vlfeat(int octave, cudaTextureObject_t layer_tex, int w, int h) +__global__ void ext_desc_vlfeat( ExtremaBuffers* buf, const int ori_base_index, cudaTextureObject_t layer_tex, int w, int h) { - const int o_offset = dct.ori_ps[octave] + blockIdx.x; - Descriptor* desc = &dbuf.desc [o_offset]; - const int ext_idx = dobuf.feat_to_ext_map[o_offset]; - Extremum* ext = dobuf.extrema + ext_idx; + const int o_offset = ori_base_index + blockIdx.x; + Descriptor* desc = &buf->desc[o_offset]; + const int ext_idx = buf->feat_to_ext_map[o_offset]; + Extremum* ext = buf->extrema + ext_idx; const int ext_base = ext->idx_ori; const int ori_num = o_offset - ext_base; @@ -183,11 +183,11 @@ __global__ void ext_desc_vlfeat(int octave, cudaTextureObject_t layer_tex, int w namespace popsift { -bool start_ext_desc_vlfeat( const int octave, Octave& oct_obj ) +bool start_ext_desc_vlfeat( const ExtremaCounters* ct, ExtremaBuffers* buf, const int octave, Octave& oct_obj ) { dim3 block; dim3 grid; - grid.x = hct.ori_ct[octave]; + grid.x = ct->ori_ct[octave]; grid.y = 1; grid.z = 1; @@ -201,7 +201,8 @@ bool start_ext_desc_vlfeat( const int octave, Octave& oct_obj ) ext_desc_vlfeat <<>> - ( octave, + ( buf, + ct->ori_ps[octave], oct_obj.getDataTexPoint( ), oct_obj.getWidth(), oct_obj.getHeight() ); diff --git a/src/popsift/s_desc_vlfeat.h b/src/popsift/s_desc_vlfeat.h index 4370bc7b..2dca9c04 100644 --- a/src/popsift/s_desc_vlfeat.h +++ b/src/popsift/s_desc_vlfeat.h @@ -8,11 +8,12 @@ */ #pragma once #include "sift_octave.h" +#include "sift_pyramid.h" namespace popsift { -bool start_ext_desc_vlfeat( const int octave, Octave& oct_obj ); +bool start_ext_desc_vlfeat( const ExtremaCounters* ct, ExtremaBuffers* buf, const int octave, Octave& oct_obj ); }; // namespace popsift diff --git a/src/popsift/s_extrema.cu b/src/popsift/s_extrema.cu index 27d8c15b..b8329cf8 100644 --- a/src/popsift/s_extrema.cu +++ b/src/popsift/s_extrema.cu @@ -517,6 +517,8 @@ void find_extrema_in_dog( cudaTextureObject_t dog, int width, int height, const uint32_t maxlevel, + ExtremaCounters* ct, + ExtremaBuffers* buf, int* d_number_of_blocks, int number_of_blocks, const float w_grid_divider, @@ -536,10 +538,10 @@ void find_extrema_in_dog( cudaTextureObject_t dog, grid_width, ec ); - uint32_t write_index = extrema_count( indicator, &dct.ext_ct[octave] ); + uint32_t write_index = extrema_count( indicator, &ct->ext_ct[octave] ); - InitialExtremum* d_extrema = dobuf.i_ext_dat[octave]; - int* d_ext_off = dobuf.i_ext_off[octave]; + InitialExtremum* d_extrema = buf->i_ext_dat[octave]; + int* d_ext_off = buf->i_ext_off[octave]; if( indicator && write_index < d_consts.max_extrema ) { ec.write_index = write_index; @@ -556,10 +558,13 @@ void find_extrema_in_dog( cudaTextureObject_t dog, __syncthreads(); if( threadIdx.x == 0 && threadIdx.y == 0 ) { - int ct = atomicAdd( d_number_of_blocks, 1 ); - if( ct >= number_of_blocks-1 ) { - int num_ext = atomicMin( &dct.ext_ct[octave], d_consts.max_extrema ); - // printf( "Block %d,%d,%d num ext %d\n", blockIdx.x, blockIdx.y, blockIdx.z, dct.ext_ct[octave] ); + // This works as a global barrier. Only thread 0 of the last thread block + // that finishes will call the atomicMin function. + // The atomicMin enforces an upper limit on the octave's number of extrema. + int count = atomicAdd( d_number_of_blocks, 1 ); + if( count >= number_of_blocks-1 ) { + int num_ext = atomicMin( &ct->ext_ct[octave], d_consts.max_extrema ); + // printf( "Block %d,%d,%d num ext %d\n", blockIdx.x, blockIdx.y, blockIdx.z, ct->ext_ct[octave] ); } } } @@ -602,6 +607,8 @@ void Pyramid::find_extrema( const Config& conf ) cols, rows, _levels-1, + _ct, + _buf, num_blocks, grid.x * grid.y, oct_obj.getWGridDivider(), @@ -617,6 +624,8 @@ void Pyramid::find_extrema( const Config& conf ) cols, rows, _levels-1, + _ct, + _buf, num_blocks, grid.x * grid.y, oct_obj.getWGridDivider(), @@ -632,6 +641,8 @@ void Pyramid::find_extrema( const Config& conf ) cols, rows, _levels-1, + _ct, + _buf, num_blocks, grid.x * grid.y, oct_obj.getWGridDivider(), diff --git a/src/popsift/s_filtergrid.cu b/src/popsift/s_filtergrid.cu index 078eb114..08b60431 100644 --- a/src/popsift/s_filtergrid.cu +++ b/src/popsift/s_filtergrid.cu @@ -52,6 +52,13 @@ struct FunctionSort_IncCell_IncScale struct FunctionExtractCell { + ExtremaBuffers* _buf; + + FunctionExtractCell( ExtremaBuffers* buf ) + { + _buf = buf; + } + __device__ inline thrust::tuple operator()( const thrust::tuple& val) const { @@ -61,7 +68,7 @@ struct FunctionExtractCell */ const int octave = thrust::get<0>(val); const int idx = thrust::get<1>(val); - InitialExtremum& e = dobuf.i_ext_dat[octave][idx]; + InitialExtremum& e = _buf->i_ext_dat[octave][idx]; return thrust::make_tuple( e.cell, e.sigma * powf( 2.0f, octave ) ); } @@ -81,22 +88,36 @@ struct FunctionIsAbove struct FunctionDisableExtremum { + ExtremaBuffers* _buf; + + FunctionDisableExtremum( ExtremaBuffers* buf ) + { + _buf = buf; + } + __device__ inline void operator()( const thrust::tuple& val) const { const int octave = thrust::get<0>(val); const int idx = thrust::get<1>(val); - InitialExtremum& e = dobuf.i_ext_dat[octave][idx]; + InitialExtremum& e = _buf->i_ext_dat[octave][idx]; e.ignore = true; } }; struct FunctionExtractIgnored { + ExtremaBuffers* _buf; + + FunctionExtractIgnored( ExtremaBuffers* buf ) + { + _buf = buf; + } + __device__ inline int operator()( int idx, int octave ) const { - InitialExtremum& e = dobuf.i_ext_dat[octave][idx]; + InitialExtremum& e = _buf->i_ext_dat[octave][idx]; if( e.ignore ) return 0; else @@ -127,7 +148,7 @@ int Pyramid::extrema_filter_grid( const Config& conf, int ext_total ) int sum = 0; for( int o=0; oext_ct[o]; if( ocount > 0 ) { cudaStream_t oct_str = _octaves[o].getStream(); @@ -147,7 +168,7 @@ int Pyramid::extrema_filter_grid( const Config& conf, int ext_total ) cudaDeviceSynchronize(); // extract cell and scale value for all initial extrema - FunctionExtractCell fun_extract_cell; + FunctionExtractCell fun_extract_cell( _buf ); thrust::transform( thrust::make_zip_iterator( thrust::make_tuple( octave_index.begin(), iext_index.begin() ) ), @@ -267,7 +288,7 @@ int Pyramid::extrema_filter_grid( const Config& conf, int ext_total ) for( int i=0; iext_ct[o]; if( ocount > 0 ) { - FunctionExtractIgnored fun_extract_ignore; + FunctionExtractIgnored fun_extract_ignore( _buf ); thrust::identity fun_id; grid.resize( ocount ); @@ -300,7 +321,7 @@ int Pyramid::extrema_filter_grid( const Config& conf, int ext_total ) grid.begin(), fun_extract_ignore ); - thrust::device_ptr off_ptr = thrust::device_pointer_cast( dobuf_shadow.i_ext_off[o] ); + thrust::device_ptr off_ptr = thrust::device_pointer_cast( _buf->i_ext_off[o] ); thrust::copy_if( thrust::make_counting_iterator(0), thrust::make_counting_iterator(ocount), @@ -308,9 +329,9 @@ int Pyramid::extrema_filter_grid( const Config& conf, int ext_total ) off_ptr, fun_id ); - hct.ext_ct[o] = thrust::reduce( grid.begin(), grid.end() ); + _ct->ext_ct[o] = thrust::reduce( grid.begin(), grid.end() ); - ret_ext_total += hct.ext_ct[o]; + ret_ext_total += _ct->ext_ct[o]; } } diff --git a/src/popsift/s_orientation.cu b/src/popsift/s_orientation.cu index 96d43049..cfbef679 100644 --- a/src/popsift/s_orientation.cu +++ b/src/popsift/s_orientation.cu @@ -52,17 +52,19 @@ float smoothe( const float* const src, const int bin ) */ __global__ void ori_par( const int octave, - const int ext_ct_prefix_sum, + ExtremaCounters* ct, + ExtremaBuffers* buf, cudaTextureObject_t layer, const int w, const int h ) { - const int extremum_index = blockIdx.x * blockDim.y; + const int& ext_ct_prefix_sum = ct->ext_ps[octave]; + const int extremum_index = blockIdx.x * blockDim.y; - if( popsift::all( extremum_index >= dct.ext_ct[octave] ) ) return; // a few trailing warps + if( popsift::all( extremum_index >= ct->ext_ct[octave] ) ) return; // a few trailing warps - const int iext_off = dobuf.i_ext_off[octave][extremum_index]; - const InitialExtremum* iext = &dobuf.i_ext_dat[octave][iext_off]; + const int iext_off = buf->i_ext_off[octave][extremum_index]; + const InitialExtremum* iext = &buf->i_ext_dat[octave][iext_off]; __shared__ float hist [64]; __shared__ float sm_hist [64]; @@ -196,7 +198,7 @@ void ori_par( const int octave, const bool valid = ( yval[best_index.x] > 0 ); bool written = false; - Extremum* ext = &dobuf.extrema[ext_ct_prefix_sum + extremum_index]; + Extremum* ext = &buf->extrema[ext_ct_prefix_sum + extremum_index]; if( threadIdx.x < ORIENTATION_MAX_COUNT ) { if( valid ) { @@ -282,46 +284,46 @@ public: }; __global__ -void ori_prefix_sum( const int total_ext_ct, const int num_octaves ) +void ori_prefix_sum( ExtremaCounters* ct, ExtremaBuffers* buf, const int total_ext_ct, const int num_octaves ) { int total_ori = 0; - Extremum* extremum = dobuf.extrema; - int* feat_to_ext_map = dobuf.feat_to_ext_map; + Extremum* extremum = buf->extrema; + int* feat_to_ext_map = buf->feat_to_ext_map; ExtremaRead r( extremum ); ExtremaWrt w( extremum ); ExtremaTot t( total_ori ); - ExtremaWrtMap wrtm( feat_to_ext_map, max( d_consts.max_orientations, dbuf.ori_allocated ) ); + ExtremaWrtMap wrtm( feat_to_ext_map, max( d_consts.max_orientations, buf->ori_allocated ) ); ExclusivePrefixSum::Block( total_ext_ct, r, w, t, wrtm ); __syncthreads(); if( threadIdx.x == 0 && threadIdx.y == 0 ) { - dct.ext_ps[0] = 0; + ct->ext_ps[0] = 0; for( int o=1; oext_ps[o] = ct->ext_ps[o-1] + ct->ext_ct[o-1]; } for( int o=0; oext_ct[o] == 0 ) { + ct->ori_ct[o] = 0; } else { - int fe = dct.ext_ps[o ]; /* first extremum for this octave */ - int le = dct.ext_ps[o+1]-1; /* last extremum for this octave */ - int lo_ori_index = dobuf.extrema[fe].idx_ori; - int num_ori = dobuf.extrema[le].num_ori; - int hi_ori_index = dobuf.extrema[le].idx_ori + num_ori; - dct.ori_ct[o] = hi_ori_index - lo_ori_index; + int fe = ct->ext_ps[o ]; /* first extremum for this octave */ + int le = ct->ext_ps[o+1]-1; /* last extremum for this octave */ + int lo_ori_index = buf->extrema[fe].idx_ori; + int num_ori = buf->extrema[le].num_ori; + int hi_ori_index = buf->extrema[le].idx_ori + num_ori; + ct->ori_ct[o] = hi_ori_index - lo_ori_index; } } - dct.ori_ps[0] = 0; + ct->ori_ps[0] = 0; for( int o=1; oori_ps[o] = ct->ori_ps[o-1] + ct->ori_ct[o-1]; } - dct.ori_total = dct.ori_ps[MAX_OCTAVES-1] + dct.ori_ct[MAX_OCTAVES-1]; - dct.ext_total = dct.ext_ps[MAX_OCTAVES-1] + dct.ext_ct[MAX_OCTAVES-1]; + ct->ori_total = ct->ori_ps[MAX_OCTAVES-1] + ct->ori_ct[MAX_OCTAVES-1]; + ct->ext_total = ct->ext_ps[MAX_OCTAVES-1] + ct->ext_ct[MAX_OCTAVES-1]; } } @@ -331,7 +333,7 @@ void Pyramid::orientation( const Config& conf ) readDescCountersFromDevice( ); int ext_total = 0; - for(int o : hct.ext_ct) + for(int o : _ct->ext_ct) { if( o > 0 ) { @@ -350,10 +352,10 @@ void Pyramid::orientation( const Config& conf ) int ext_ct_prefix_sum = 0; for( int octave=0; octave<_num_octaves; octave++ ) { - hct.ext_ps[octave] = ext_ct_prefix_sum; - ext_ct_prefix_sum += hct.ext_ct[octave]; + _ct->ext_ps[octave] = ext_ct_prefix_sum; + ext_ct_prefix_sum += _ct->ext_ct[octave]; } - hct.ext_total = ext_ct_prefix_sum; + _ct->ext_total = ext_ct_prefix_sum; cudaStream_t oct_0_str = _octaves[0].getStream(); @@ -364,7 +366,7 @@ void Pyramid::orientation( const Config& conf ) cudaStream_t oct_str = oct_obj.getStream(); - int num = hct.ext_ct[octave]; + int num = _ct->ext_ct[octave]; if( num > 0 ) { dim3 block; @@ -377,7 +379,8 @@ void Pyramid::orientation( const Config& conf ) ori_par <<>> ( octave, - hct.ext_ps[octave], + _ct, + _buf, oct_obj.getDataTexPoint( ), oct_obj.getWidth( ), oct_obj.getHeight( ) ); @@ -398,7 +401,7 @@ void Pyramid::orientation( const Config& conf ) grid.x = 1; ori_prefix_sum <<>> - ( ext_ct_prefix_sum, _num_octaves ); + ( _ct, _buf, ext_ct_prefix_sum, _num_octaves ); POP_SYNC_CHK; cudaDeviceSynchronize(); diff --git a/src/popsift/sift_desc.cu b/src/popsift/sift_desc.cu index f12f85fc..5fb81c89 100644 --- a/src/popsift/sift_desc.cu +++ b/src/popsift/sift_desc.cu @@ -65,21 +65,21 @@ void Pyramid::descriptors( const Config& conf ) for( int octave=_num_octaves-1; octave>=0; octave-- ) // for( int octave=0; octave<_num_octaves; octave++ ) { - if( hct.ori_ct[octave] != 0 ) { + if( _ct->ori_ct[octave] != 0 ) { Octave& oct_obj = _octaves[octave]; if( conf.getDescMode() == Config::Loop ) { - start_ext_desc_loop( octave, oct_obj ); + start_ext_desc_loop( _ct, _buf, octave, oct_obj ); } else if( conf.getDescMode() == Config::ILoop ) { - start_ext_desc_iloop( octave, oct_obj ); + start_ext_desc_iloop( _ct, _buf, octave, oct_obj ); } else if( conf.getDescMode() == Config::Grid ) { - start_ext_desc_grid( octave, oct_obj ); + start_ext_desc_grid( _ct, _buf, octave, oct_obj ); } else if( conf.getDescMode() == Config::IGrid ) { - start_ext_desc_igrid( octave, oct_obj ); + start_ext_desc_igrid( _ct, _buf, octave, oct_obj ); } else if( conf.getDescMode() == Config::NoTile ) { - start_ext_desc_notile( octave, oct_obj ); + start_ext_desc_notile( _ct, _buf, octave, oct_obj ); } else if( conf.getDescMode() == Config::VLFeat_Desc ) { - start_ext_desc_vlfeat( octave, oct_obj ); + start_ext_desc_vlfeat( _ct, _buf, octave, oct_obj ); } else { POP_FATAL( "not yet" ); } @@ -88,7 +88,7 @@ void Pyramid::descriptors( const Config& conf ) } } - if( hct.ori_total == 0 ) + if( _ct->ori_total == 0 ) { cerr << "Warning: no descriptors extracted" << endl; return; @@ -100,13 +100,19 @@ void Pyramid::descriptors( const Config& conf ) block.z = 1; dim3 grid; - grid.x = grid_divide( hct.ori_total, block.y ); + grid.x = grid_divide( _ct->ori_total, block.y ); if( conf.getUseRootSift() ) { - normalize_histogram <<>> ( ); + normalize_histogram + <<>> + ( _buf->desc, + _ct->ori_total ); POP_SYNC_CHK; } else { - normalize_histogram <<>> ( ); + normalize_histogram + <<>> + ( _buf->desc, + _ct->ori_total ); POP_SYNC_CHK; } diff --git a/src/popsift/sift_pyramid.cu b/src/popsift/sift_pyramid.cu index 06060052..05d7d1c8 100644 --- a/src/popsift/sift_pyramid.cu +++ b/src/popsift/sift_pyramid.cu @@ -38,16 +38,6 @@ using namespace std; namespace popsift { -__device__ ExtremaCounters dct; -thread_local ExtremaCounters hct; - -__device__ ExtremaBuffers dbuf; -thread_local ExtremaBuffers dbuf_shadow; // just for managing memories -thread_local ExtremaBuffers hbuf; - -__device__ DevBuffers dobuf; -thread_local DevBuffers dobuf_shadow; // just for managing memories - __global__ void py_print_corner_float(float* img, uint32_t pitch, uint32_t height, uint32_t level) { @@ -118,11 +108,14 @@ Pyramid::Pyramid( const Config& config, int w = width; int h = height; - memset( &hct, 0, sizeof(ExtremaCounters) ); - cudaMemcpyToSymbol( dct, &hct, sizeof(ExtremaCounters), 0, cudaMemcpyHostToDevice ); + // memset( &hct, 0, sizeof(ExtremaCounters) ); + // cudaMemcpyToSymbol( dct, &hct, sizeof(ExtremaCounters), 0, cudaMemcpyHostToDevice ); + + cudaMallocManaged( &_ct, sizeof(ExtremaCounters) ); + memset( _ct, 0, sizeof(ExtremaCounters) ); - memset( &hbuf, 0, sizeof(ExtremaBuffers) ); - memset( &dbuf_shadow, 0, sizeof(ExtremaBuffers) ); + cudaMallocManaged( &_buf, sizeof(ExtremaBuffers) ); + memset( _buf, 0, sizeof(ExtremaBuffers) ); _d_extrema_num_blocks = popsift::cuda::malloc_devT( _num_octaves, __FILE__, __LINE__ ); @@ -134,32 +127,27 @@ Pyramid::Pyramid( const Config& config, } int sz = _num_octaves * h_consts.max_extrema; - dobuf_shadow.i_ext_dat[0] = popsift::cuda::malloc_devT( sz, __FILE__, __LINE__); - dobuf_shadow.i_ext_off[0] = popsift::cuda::malloc_devT( sz, __FILE__, __LINE__); + _buf->i_ext_dat[0] = popsift::cuda::malloc_mgdT( sz, __FILE__, __LINE__); + _buf->i_ext_off[0] = popsift::cuda::malloc_mgdT( sz, __FILE__, __LINE__); for (int o = 1; o<_num_octaves; o++) { - dobuf_shadow.i_ext_dat[o] = dobuf_shadow.i_ext_dat[0] + (o*h_consts.max_extrema); - dobuf_shadow.i_ext_off[o] = dobuf_shadow.i_ext_off[0] + (o*h_consts.max_extrema); + _buf->i_ext_dat[o] = _buf->i_ext_dat[0] + (o*h_consts.max_extrema); + _buf->i_ext_off[o] = _buf->i_ext_off[0] + (o*h_consts.max_extrema); } for (int o = _num_octaves; oi_ext_dat[o] = nullptr; + _buf->i_ext_off[o] = nullptr; } sz = h_consts.max_extrema; - dobuf_shadow.extrema = popsift::cuda::malloc_devT( sz, __FILE__, __LINE__); - dobuf_shadow.features = popsift::cuda::malloc_devT( sz, __FILE__, __LINE__); - hbuf .ext_allocated = sz; - dbuf_shadow.ext_allocated = sz; + _buf->extrema = popsift::cuda::malloc_mgdT( sz, __FILE__, __LINE__); + _buf->features = popsift::cuda::malloc_mgdT( sz, __FILE__, __LINE__); - sz = max( 2 * h_consts.max_extrema, h_consts.max_orientations ); - hbuf .desc = popsift::cuda::malloc_hstT( sz, __FILE__, __LINE__); - dbuf_shadow.desc = popsift::cuda::malloc_devT( sz, __FILE__, __LINE__); - dobuf_shadow.feat_to_ext_map = popsift::cuda::malloc_devT( sz, __FILE__, __LINE__); - hbuf .ori_allocated = sz; - dbuf_shadow.ori_allocated = sz; + _buf->ext_allocated = sz; - cudaMemcpyToSymbol( dbuf, &dbuf_shadow, sizeof(ExtremaBuffers), 0, cudaMemcpyHostToDevice ); - cudaMemcpyToSymbol( dobuf, &dobuf_shadow, sizeof(DevBuffers), 0, cudaMemcpyHostToDevice ); + sz = max( 2 * h_consts.max_extrema, h_consts.max_orientations ); + _buf->desc = popsift::cuda::malloc_mgdT( sz, __FILE__, __LINE__); + _buf->feat_to_ext_map = popsift::cuda::malloc_mgdT( sz, __FILE__, __LINE__); + _buf->ori_allocated = sz; cudaStreamCreate( &_download_stream ); } @@ -178,33 +166,26 @@ void Pyramid::resetDimensions( const Config& conf, int width, int height ) void Pyramid::reallocExtrema( int numExtrema ) { - if( numExtrema > hbuf.ext_allocated ) { + if( numExtrema > _buf->ext_allocated ) { numExtrema = ( ( numExtrema + 1024 ) & ( ~(1024-1) ) ); - cudaFree( dobuf_shadow.extrema ); - cudaFree( dobuf_shadow.features ); + cudaFree( _buf->extrema ); + cudaFree( _buf->features ); int sz = numExtrema; - dobuf_shadow.extrema = popsift::cuda::malloc_devT( sz, __FILE__, __LINE__); - dobuf_shadow.features = popsift::cuda::malloc_devT( sz, __FILE__, __LINE__); - hbuf .ext_allocated = sz; - dbuf_shadow.ext_allocated = sz; + _buf->extrema = popsift::cuda::malloc_mgdT( sz, __FILE__, __LINE__); + _buf->features = popsift::cuda::malloc_mgdT( sz, __FILE__, __LINE__); + _buf->ext_allocated = sz; numExtrema *= 2; - if( numExtrema > hbuf.ori_allocated ) { - cudaFreeHost( hbuf .desc ); - cudaFree( dbuf_shadow.desc ); - cudaFree( dobuf_shadow.feat_to_ext_map ); + if( numExtrema > _buf->ori_allocated ) { + cudaFree( _buf->desc ); + cudaFree( _buf->feat_to_ext_map ); sz = numExtrema; - hbuf .desc = popsift::cuda::malloc_hstT( sz, __FILE__, __LINE__); - dbuf_shadow.desc = popsift::cuda::malloc_devT( sz, __FILE__, __LINE__); - dobuf_shadow.feat_to_ext_map = popsift::cuda::malloc_devT( sz, __FILE__, __LINE__); - hbuf .ori_allocated = sz; - dbuf_shadow.ori_allocated = sz; + _buf->desc = popsift::cuda::malloc_mgdT( sz, __FILE__, __LINE__); + _buf->feat_to_ext_map = popsift::cuda::malloc_mgdT( sz, __FILE__, __LINE__); + _buf->ori_allocated = sz; } - - cudaMemcpyToSymbol( dbuf, &dbuf_shadow, sizeof(ExtremaBuffers), 0, cudaMemcpyHostToDevice ); - cudaMemcpyToSymbol( dobuf, &dobuf_shadow, sizeof(DevBuffers), 0, cudaMemcpyHostToDevice ); } } @@ -213,13 +194,15 @@ Pyramid::~Pyramid() cudaStreamDestroy( _download_stream ); cudaFree( _d_extrema_num_blocks ); - cudaFree( dobuf_shadow.i_ext_dat[0] ); - cudaFree( dobuf_shadow.i_ext_off[0] ); - cudaFree( dobuf_shadow.features ); - cudaFree( dobuf_shadow.extrema ); - cudaFreeHost( hbuf .desc ); - cudaFree( dbuf_shadow .desc ); - cudaFree( dobuf_shadow.feat_to_ext_map ); + cudaFree( _buf->i_ext_dat[0] ); + cudaFree( _buf->i_ext_off[0] ); + cudaFree( _buf->features ); + cudaFree( _buf->extrema ); + cudaFree( _buf->desc ); + cudaFree( _buf->feat_to_ext_map ); + + cudaFree( _ct ); + cudaFree( _buf ); delete[] _octaves; } @@ -248,12 +231,12 @@ void Pyramid::step2( const Config& conf ) * GPUs are compatible. */ __global__ -void prep_features( Descriptor* descriptor_base, int up_fac ) +void prep_features( Descriptor* descriptor_base, ExtremaCounters* ct, ExtremaBuffers* buf, int up_fac ) { int offset = blockIdx.x * 32 + threadIdx.x; - if( offset >= dct.ext_total ) return; - const Extremum& ext = dobuf.extrema [offset]; - Feature& fet = dobuf.features[offset]; + if( offset >= ct->ext_total ) return; + const Extremum& ext = buf->extrema [offset]; + Feature& fet = buf->features[offset]; const int octave = ext.octave; const float xpos = ext.xpos * powf(2.0f, float(octave - up_fac)); @@ -286,30 +269,30 @@ FeaturesHost* Pyramid::get_descriptors( const Config& conf ) readDescCountersFromDevice(); nvtxRangePushA( "download descriptors" ); - FeaturesHost* features = new FeaturesHost( hct.ext_total, hct.ori_total ); + FeaturesHost* features = new FeaturesHost( _ct->ext_total, _ct->ori_total ); - if( hct.ext_total == 0 || hct.ori_total == 0 ) + if( _ct->ext_total == 0 || _ct->ori_total == 0 ) { nvtxRangePop(); return features; } - dim3 grid( grid_divide( hct.ext_total, 32 ) ); - prep_features<<>>( features->getDescriptors(), up_fac ); + dim3 grid( grid_divide( _ct->ext_total, 32 ) ); + prep_features<<>>( features->getDescriptors(), _ct, _buf, up_fac ); POP_SYNC_CHK; nvtxRangePushA( "register host memory" ); features->pin( ); nvtxRangePop(); popcuda_memcpy_async( features->getFeatures(), - dobuf_shadow.features, - hct.ext_total * sizeof(Feature), + _buf->features, + _ct->ext_total * sizeof(Feature), cudaMemcpyDeviceToHost, _download_stream ); popcuda_memcpy_async( features->getDescriptors(), - dbuf_shadow.desc, - hct.ori_total * sizeof(Descriptor), + _buf->desc, + _ct->ori_total * sizeof(Descriptor), cudaMemcpyDeviceToHost, _download_stream ); cudaStreamSynchronize( _download_stream ); @@ -325,25 +308,25 @@ void Pyramid::clone_device_descriptors_sub( const Config& conf, FeaturesDev* fea { const float up_fac = conf.getUpscaleFactor(); - dim3 grid( grid_divide( hct.ext_total, 32 ) ); - prep_features<<>>( features->getDescriptors(), up_fac ); + dim3 grid( grid_divide( _ct->ext_total, 32 ) ); + prep_features<<>>( features->getDescriptors(), _ct, _buf, up_fac ); POP_SYNC_CHK; popcuda_memcpy_async( features->getFeatures(), - dobuf_shadow.features, - hct.ext_total * sizeof(Feature), + _buf->features, + _ct->ext_total * sizeof(Feature), cudaMemcpyDeviceToDevice, _download_stream ); popcuda_memcpy_async( features->getDescriptors(), - dbuf_shadow.desc, - hct.ori_total * sizeof(Descriptor), + _buf->desc, + _ct->ori_total * sizeof(Descriptor), cudaMemcpyDeviceToDevice, _download_stream ); popcuda_memcpy_async( features->getReverseMap(), - dobuf_shadow.feat_to_ext_map, - hct.ori_total * sizeof(int), + _buf->feat_to_ext_map, + _ct->ori_total * sizeof(int), cudaMemcpyDeviceToDevice, _download_stream ); } @@ -352,7 +335,7 @@ FeaturesDev* Pyramid::clone_device_descriptors( const Config& conf ) { readDescCountersFromDevice(); - FeaturesDev* features = new FeaturesDev( hct.ext_total, hct.ori_total ); + FeaturesDev* features = new FeaturesDev( _ct->ext_total, _ct->ori_total ); clone_device_descriptors_sub( conf, features ); @@ -363,8 +346,7 @@ FeaturesDev* Pyramid::clone_device_descriptors( const Config& conf ) void Pyramid::reset_extrema_mgmt() { - memset( &hct, 0, sizeof(ExtremaCounters) ); - cudaMemcpyToSymbol( dct, &hct, sizeof(ExtremaCounters), 0, cudaMemcpyHostToDevice ); + memset( _ct, 0, sizeof(ExtremaCounters) ); popcuda_memset_sync( _d_extrema_num_blocks, 0, _num_octaves * sizeof(int) ); @@ -372,22 +354,24 @@ void Pyramid::reset_extrema_mgmt() void Pyramid::readDescCountersFromDevice( ) { - cudaMemcpyFromSymbol( &hct, dct, sizeof(ExtremaCounters), 0, cudaMemcpyDeviceToHost ); + // cudaMemcpyFromSymbol( &hct, dct, sizeof(ExtremaCounters), 0, cudaMemcpyDeviceToHost ); + // we must not copy mgmt memory explicitly, just wait for the device driver + cudaDeviceSynchronize(); } void Pyramid::readDescCountersFromDevice( cudaStream_t s ) { - cudaMemcpyFromSymbolAsync( &hct, dct, sizeof(ExtremaCounters), 0, cudaMemcpyDeviceToHost, s ); + cudaStreamSynchronize( s ); } void Pyramid::writeDescCountersToDevice( ) { - cudaMemcpyToSymbol( dct, &hct, sizeof(ExtremaCounters), 0, cudaMemcpyHostToDevice ); + cudaDeviceSynchronize(); } void Pyramid::writeDescCountersToDevice( cudaStream_t s ) { - cudaMemcpyToSymbolAsync( dct, &hct, sizeof(ExtremaCounters), 0, cudaMemcpyHostToDevice, s ); + cudaStreamSynchronize( s ); } int* Pyramid::getNumberOfBlocks( int octave ) @@ -404,7 +388,7 @@ void Pyramid::writeDescriptor( const Config& conf, ostream& ostr, FeaturesHost* const float up_fac = conf.getUpscaleFactor(); - for( int ext_idx = 0; ext_idxext_total; ext_idx++ ) { const Feature& ext = features->getFeatures()[ext_idx]; const int octave = ext.debug_octave; const float xpos = ext.xpos * pow(2.0f, octave - up_fac); @@ -417,7 +401,7 @@ void Pyramid::writeDescriptor( const Config& conf, ostream& ostr, FeaturesHost* dom_ori = dom_ori / M_PI2 * 360; if (dom_ori < 0) dom_ori += 360; - const Descriptor& desc = *ext.desc[ori]; // hbuf.desc[ori_idx]; + const Descriptor& desc = *ext.desc[ori]; if( with_orientation ) ostr << setprecision(5) diff --git a/src/popsift/sift_pyramid.h b/src/popsift/sift_pyramid.h index 837fc3b1..169c7a76 100755 --- a/src/popsift/sift_pyramid.h +++ b/src/popsift/sift_pyramid.h @@ -36,13 +36,16 @@ struct ExtremaCounters struct ExtremaBuffers { + /* This part of the struct deals with the descriptors that are + * finally detected by the algorithm. + */ Descriptor* desc; int ext_allocated; int ori_allocated; -}; -struct DevBuffers -{ + /* This part of the struct deals with intermediate buffers to find + * extrema. + */ InitialExtremum* i_ext_dat[MAX_OCTAVES]; int* i_ext_off[MAX_OCTAVES]; int* feat_to_ext_map; @@ -50,14 +53,6 @@ struct DevBuffers Feature* features; }; -extern thread_local ExtremaCounters hct; -extern __device__ ExtremaCounters dct; -extern thread_local ExtremaBuffers hbuf; -extern __device__ ExtremaBuffers dbuf; -extern thread_local ExtremaBuffers dbuf_shadow; // just for managing memories -extern __device__ DevBuffers dobuf; -extern thread_local DevBuffers dobuf_shadow; // just for managing memories - class Pyramid { int _num_octaves; @@ -76,6 +71,9 @@ class Pyramid /* the download of converted descriptors should be asynchronous */ cudaStream_t _download_stream; + ExtremaCounters* _ct; + ExtremaBuffers* _buf; + public: enum GaussTableChoice { Interpolated_FromPrevious, From 9a21d430cbd229c6c81da10adb178f2bce37c53a Mon Sep 17 00:00:00 2001 From: Carsten Griwodz Date: Sat, 20 Feb 2021 12:27:55 +0100 Subject: [PATCH 10/30] [popsift] simplify grid filtering with managed mem --- src/CMakeLists.txt | 2 +- src/popsift/filtergrid.cu | 353 ++++++++++++++++++++++++++++++++++ src/popsift/filtergrid.h | 104 ++++++++++ src/popsift/s_extrema.cu | 5 +- src/popsift/s_filtergrid.cu | 359 ----------------------------------- src/popsift/s_orientation.cu | 5 +- src/popsift/sift_conf.cu | 31 ++- src/popsift/sift_conf.h | 79 ++++---- src/popsift/sift_extremum.h | 10 +- src/popsift/sift_pyramid.cu | 122 ++++++------ src/popsift/sift_pyramid.h | 45 ++++- 11 files changed, 647 insertions(+), 468 deletions(-) create mode 100755 src/popsift/filtergrid.cu create mode 100755 src/popsift/filtergrid.h delete mode 100644 src/popsift/s_filtergrid.cu diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index db7aeadc..fd4979ec 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -5,6 +5,7 @@ CUDA_ADD_LIBRARY(popsift popsift/sift_constants.cu popsift/sift_constants.h popsift/sift_conf.cu popsift/sift_conf.h popsift/gauss_filter.cu popsift/gauss_filter.h + popsift/filtergrid.cu popsift/filtergrid.h popsift/s_image.cu popsift/s_image.h popsift/sift_pyramid.cu popsift/sift_pyramid.h popsift/sift_octave.cu popsift/sift_octave.h @@ -16,7 +17,6 @@ CUDA_ADD_LIBRARY(popsift popsift/sift_extremum.h popsift/sift_extremum.cu popsift/s_extrema.cu popsift/s_orientation.cu - popsift/s_filtergrid.cu popsift/sift_desc.cu popsift/s_desc_loop.cu popsift/s_desc_loop.h popsift/s_desc_iloop.cu popsift/s_desc_iloop.h diff --git a/src/popsift/filtergrid.cu b/src/popsift/filtergrid.cu new file mode 100755 index 00000000..6b4435a8 --- /dev/null +++ b/src/popsift/filtergrid.cu @@ -0,0 +1,353 @@ +/* + * Copyright 2016-2017, Simula Research Laboratory + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ +#include "filtergrid.h" + +#include "sift_config.h" +#include "sift_extremum.h" +#include "sift_pyramid.h" +#include "common/debug_macros.h" +#include "common/assist.h" + +#if ! POPSIFT_IS_DEFINED(POPSIFT_DISABLE_GRID_FILTER) + +#include +#include + +namespace popsift +{ + +FilterGrid::FilterGrid( const Config& conf ) + : _slots( conf.getFilterGridSize() * conf.getFilterGridSize() ) + , _alloc_size( 0 ) + , _sorting_index( 0 ) + , _cells( 0 ) + , _scales( 0 ) + , _initial_extrema_pointers( 0 ) +{ + if( _slots > 1 ) + { + /* refresh the exclsuive prefix sum over the per-octave extrema counts */ + _histogram = popsift::cuda::malloc_mgdT( MAX_OCTAVES*_slots, __FILE__, __LINE__); + _histogram_full = popsift::cuda::malloc_mgdT( _slots, __FILE__, __LINE__); + _histogram_eps = popsift::cuda::malloc_mgdT( _slots+1, __FILE__, __LINE__); + + // initialize histogram to zero, implicitly move to device + popcuda_memset_sync( _histogram, 0, MAX_OCTAVES * _slots * sizeof(int) ); + popcuda_memset_sync( _histogram_full, 0, _slots * sizeof(int) ); + } +} + +FilterGrid::~FilterGrid( ) +{ + if( _slots > 1 ) + { + cudaFree( _histogram_eps ); + cudaFree( _histogram_full ); + cudaFree( _histogram ); + cudaFree( _sorting_index ); + cudaFree( _cells ); + cudaFree( _scales ); + cudaFree( _initial_extrema_pointers ); + } +} + +__global__ void +fg_init( const int octave, + ExtremaCounters* ct, + ExtremaBuffers* buf, + int* indices, + int* cells, + float* scales, + InitialExtremum** ext_compact ) +{ + const int my_idx_in_octave = blockIdx.x * blockDim.x + threadIdx.x; + const int extrema_in_octave = ct->ext_ct[octave]; + + if( my_idx_in_octave > extrema_in_octave ) return; + + const int base_index = ct->ext_ps[octave]; + const int my_idx = base_index + my_idx_in_octave; + + /* This is the array we are going to use for sorting and filtering and so on. */ + indices[my_idx] = my_idx; + + InitialExtremum* ie = &buf->i_ext_dat[octave][my_idx_in_octave]; + + cells[my_idx] = ie->cell; + scales[my_idx] = ie->scale; + + /* Keep pointers in this pointer array. The indirection is not elegant for CUDA, + * we can improve that later by making additional arrays for .cell and .scale + * values. + */ + ext_compact[my_idx] = ie; +} + +void FilterGrid::init( ExtremaBuffers* buf, ExtremaCounters* ct ) +{ + _buf = buf; + _ct = ct; + _ct->make_extrema_prefix_sums(); + + + if( _alloc_size < _ct->ext_total ) + { + if( _alloc_size > 0 ) + { + cudaFree( _sorting_index ); + cudaFree( _cells ); + cudaFree( _scales ); + cudaFree( _initial_extrema_pointers ); + } + _alloc_size = _ct->ext_total; + _sorting_index = popsift::cuda::malloc_mgdT( _alloc_size, __FILE__, __LINE__); + _cells = popsift::cuda::malloc_mgdT( _alloc_size, __FILE__, __LINE__); + _scales = popsift::cuda::malloc_mgdT( _alloc_size, __FILE__, __LINE__); + _initial_extrema_pointers = popsift::cuda::malloc_mgdT( _alloc_size, __FILE__, __LINE__); + } + + popcuda_memset_sync( _histogram_full, 0, _slots * sizeof(int) ); + + for( int o=0; oext_ct[o] == 0 ) continue; + + dim3 block( 32 ); + dim3 grid( grid_divide( _ct->ext_ct[o], block.x ) ); + fg_init + <<>> + ( o, + _ct, + _buf, + _sorting_index, + _cells, + _scales, + _initial_extrema_pointers ); + } +} + +__global__ void +fg_countcells( const int ext_total, + const int* cells, + const int cell_histogram_size, + int* cell_histogram ) +{ + // The size of this chared memory region is computed from _slots in the + // calling host code. + extern __shared__ int cellcounts[]; + + const int tid = threadIdx.x; + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + + if( idx >= ext_total ) return; + + const int cell = cells[idx]; + + atomicAdd( &cellcounts[cell], 1 ); + __syncthreads(); + + for( int c=tid; cext_total, block.x ) ); + + fg_countcells + <<>> + ( _ct->ext_total, + _cells, + _slots, + _histogram_full ); +} + +void FilterGrid::make_histogram_prefix_sum( ) +{ + // We could to this on the device ! + cudaDeviceSynchronize(); + + _histogram_eps[0] = 0; + + for( int i=1; i<=_slots; i++ ) + { + _histogram_eps[i] = _histogram_eps[i-1] + _histogram_full[i-1]; + } +} + + +class CellScaleCompare +{ + int* _cells; + float* _scales; + +public: + CellScaleCompare( int* cell, float* scales ) + : _cells( cell ) + , _scales( scales ) + { } + + __host__ __device__ + inline bool operator()( int left, int right ) const + { + if( _cells[left] < _cells[right] ) return true; + if( ( _cells[left] == _cells[right] ) && + ( _scales[left] < _scales[right] ) ) return true; + return false; + } +}; + +void FilterGrid::sort_by_cell_and_scale( ) +{ + /* Thanks to managed memory, we can sort our indeces either on the + * host or on the device. Currently, we must use thrust for compiling + * for the device. With the HPC SDK, we will be able to use OpenACC + * and never know the difference. + */ + CellScaleCompare tc( _cells, _scales ); +#if 0 + cudaDeviceSynchronize(); + int* ptr = _sorting_index; + std::sort( ptr, ptr + _ct->ext_total, tc ); +#else + thrust::device_ptr ptr = thrust::device_pointer_cast( _sorting_index ); + thrust::sort( ptr, ptr + _ct->ext_total, tc ); +#endif +} + +void FilterGrid::level_histogram( const int max_extrema ) +{ + int avg_in_cell = max_extrema / _slots; + + bool stable; + do + { + stable = true; + + int limited_slots = 0; + int sum = 0; + + for( int i=0; i<_slots; i++ ) + { + if( _histogram_full[i] > avg_in_cell ) + { + sum += avg_in_cell; + limited_slots += 1; + } + else + { + sum += _histogram_full[i]; + } + } + + const int diff = max_extrema - sum; + + if( limited_slots > 0 && diff > 0 ) + { + stable = false; + const std::div_t res = std::div( diff, limited_slots ); + const int widen = res.quot + ( res.rem > 0 ? 1 : 0 ); + + + avg_in_cell += widen; + } + } + while( !stable ); + + + for( int i=0; i<_slots; i++ ) + { + _histogram_full[i] = min( _histogram_full[i], avg_in_cell ); + } +} + +void FilterGrid::prune_extrema( GridFilterConfig::Mode mode ) +{ + for( int cell=0; cell<_slots; cell++ ) + { + const int base = _histogram_eps[cell]; + const int have = _histogram_eps[cell+1] - base; + const int want = _histogram_full[cell]; + const int diff = have - want; + if( diff <= 0 ) + { + continue; + } + + if( mode == GridFilterConfig::LargestScaleFirst ) + { + const int base = _histogram_eps[cell]; + for( int d=0; dignore = true; + } + } + else // mode == GridFilterConfig::SmallestScaleFirst + { + const int base = _histogram_eps[cell+1]; + for( int d=0; dignore = true; + } + } + } +} + +/* Discard some extrema that exceed a conf.getFilterMaxExtrema(). + * The caller should ensure that filtering is actually needed. */ +__host__ +int FilterGrid::filter( const Config& conf, ExtremaCounters* ct, ExtremaBuffers* buf, int ext_total ) +{ + /* At this time, we have host-side information about ext_ct[o], the number + * of extrema we have found in octave o, and we have summed it up on the + * host size. However, other values in the hct and dct data structures + * have not been computed yet. + * The extrema are only known in the InitialExtrema structure. We want to + * perform grid filtering before their orientation is computed and they + * are copied into the larger Extrema data structure. + */ + const int max_extrema = conf.getFilterMaxExtrema( ); + + init( buf, ct ); + count_cells( ); + make_histogram_prefix_sum( ); + sort_by_cell_and_scale( ); + level_histogram( max_extrema ); + prune_extrema( conf.getFilterSorting() ); + + int filtered_ext_total = 0; + for( int o=0; oext_ct[o]; + int counter = 0; + for( int i=0; ii_ext_dat[o][i]; + if( iext->ignore == false ) + { + buf->i_ext_off[o][counter] = i; + counter += 1; + } + } + ct->ext_ct[o] = counter; + filtered_ext_total += counter; + } + + return filtered_ext_total; +} +}; // namespace popsift + +#endif // not defined(DISABLE_GRID_FILTER) + diff --git a/src/popsift/filtergrid.h b/src/popsift/filtergrid.h new file mode 100755 index 00000000..b4f987e0 --- /dev/null +++ b/src/popsift/filtergrid.h @@ -0,0 +1,104 @@ +/* + * Copyright 2021, University of Oslo + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ +#pragma once + +#include "sift_config.h" +#include "sift_conf.h" + +#if ! POPSIFT_IS_DEFINED(POPSIFT_DISABLE_GRID_FILTER) + +namespace popsift +{ + +class ExtremaBuffers; +class ExtremaCounters; +class InitialExtremum; + +class FilterGrid +{ + int* _histogram; + int* _histogram_full; + int* _histogram_eps; // exclusive prefix sum; + + const int _slots; + ExtremaBuffers* _buf; + ExtremaCounters* _ct; + + int _alloc_size; + int* _sorting_index; + int* _cells; + float* _scales; + InitialExtremum** _initial_extrema_pointers; + +public: + FilterGrid( const Config& conf ); + + ~FilterGrid( ); + + int filter( const Config& conf, ExtremaCounters* ct, ExtremaBuffers* buf, int ext_total ); + +private: + /** Initialize the FilterGrid structure with the current counters + * and buffers. + * Compute the exclusive prefix sum of extrema per octave. + */ + void init( ExtremaBuffers* buf, ExtremaCounters* ct ); + + /** We are filling the _histogram_full by counting how frequent + * extrema appear in every cell. + * Since order is irrelevant for this operation, we ignore + * the indirection through the _sorting_index while we collect + * the histogram. + */ + void count_cells( ); + + /** To get the offsets for each cell into the sorted array + * _sorting_index, we can simple compute the exclusive prefix + * sum over _histogram_full. + * For convenience, it has an extra slot for the final sum. + */ + void make_histogram_prefix_sum( ); + + /** We sort the index of extrema (not the extrema themselves). + * The end result is in _sorting_index, which gives us indirect + * access (sorted by priority) to the arrays _cells, _scales + * and _initial_extrema_pointers. + */ + void sort_by_cell_and_scale( ); + + /** Compute a maximum nunmber of entries in every cell and + * update _histogram_full. + * Each cell has an equal share in the maximum number of + * extrema, but when a cell does not use its share fully, + * other cells can have it. The total number of extrema + * assigned can slightly exceed the given max value. + */ + void level_histogram( const int max_extrema ); + + void prune_extrema( GridFilterConfig::Mode mode ); +}; + +}; // namespace popsift + +#else // not defined(DISABLE_GRID_FILTER) + +namespace popsift +{ +class FilterGrid +{ +public: + FilterGrid( const Config& ) { } + + int filter( const Config&, ExtremaCounters*, ExtremaBuffers*, int ext_total ) { + return ext_total; + } +}; +}; // namespace popsift + +#endif // not defined(DISABLE_GRID_FILTER) + diff --git a/src/popsift/s_extrema.cu b/src/popsift/s_extrema.cu index b8329cf8..e7541e7a 100644 --- a/src/popsift/s_extrema.cu +++ b/src/popsift/s_extrema.cu @@ -299,7 +299,7 @@ public: template __device__ inline bool find_extrema_in_dog_sub(cudaTextureObject_t dog, - int debug_octave, + int octave, int width, int height, uint32_t maxlevel, @@ -344,7 +344,7 @@ __device__ inline bool find_extrema_in_dog_sub(cudaTextureObject_t dog, if( ! f.first_contrast_ok( val ) ) return false; if( ! is_extremum( dog, x-1, y-1, level-1 ) ) { - // if( debug_octave==0 && level==2 && x==14 && y==73 ) printf("But I fail\n"); + // if( octave==0 && level==2 && x==14 && y==73 ) printf("But I fail\n"); return false; } @@ -503,6 +503,7 @@ __device__ inline bool find_extrema_in_dog_sub(cudaTextureObject_t dog, ec.ypos = yn; ec.lpos = (int)roundf(sn); ec.sigma = d_consts.sigma0 * pow(d_consts.sigma_k, sn); // * 2; + ec.scale = ec.sigma * powf( 2.0f, octave ); ec.cell = floorf( yn / h_grid_divider ) * grid_width + floorf( xn / w_grid_divider ); // const float sigma_k = powf(2.0f, 1.0f / levels ); diff --git a/src/popsift/s_filtergrid.cu b/src/popsift/s_filtergrid.cu deleted file mode 100644 index 08b60431..00000000 --- a/src/popsift/s_filtergrid.cu +++ /dev/null @@ -1,359 +0,0 @@ -/* - * Copyright 2016-2017, Simula Research Laboratory - * - * This Source Code Form is subject to the terms of the Mozilla Public - * License, v. 2.0. If a copy of the MPL was not distributed with this - * file, You can obtain one at http://mozilla.org/MPL/2.0/. - */ -#include "sift_config.h" -#include "sift_extremum.h" -#include "sift_pyramid.h" - -#if POPSIFT_IS_DEFINED(POPSIFT_USE_NVTX) -#include -#else -#define nvtxRangePushA(a) -#define nvtxRangePop() -#endif - -#if ! POPSIFT_IS_DEFINED(POPSIFT_DISABLE_GRID_FILTER) - -#include -#include -#include -#include -#include -#include -#include -#include - -namespace popsift -{ - -struct FunctionSort_IncCell_DecScale -{ - __device__ - inline bool operator()( const thrust::tuple& l, const thrust::tuple& r ) const - { - return ( ( thrust::get<0>(l) < thrust::get<0>(r) ) || - ( thrust::get<0>(l) == thrust::get<0>(r) && thrust::get<1>(l) > thrust::get<1>(r) ) ); - } -}; - -struct FunctionSort_IncCell_IncScale -{ - __device__ - inline bool operator()( const thrust::tuple& l, const thrust::tuple& r ) const - { - return ( ( thrust::get<0>(l) < thrust::get<0>(r) ) || - ( thrust::get<0>(l) == thrust::get<0>(r) && thrust::get<1>(l) < thrust::get<1>(r) ) ); - } -}; - -struct FunctionExtractCell -{ - ExtremaBuffers* _buf; - - FunctionExtractCell( ExtremaBuffers* buf ) - { - _buf = buf; - } - - __device__ - inline thrust::tuple operator()( const thrust::tuple& val) const - { - /* During the filter stage, the i_ext_dat array is still compact (all intial - * extrema do still have ignore==false), so that we can access every entry - * directly). - */ - const int octave = thrust::get<0>(val); - const int idx = thrust::get<1>(val); - InitialExtremum& e = _buf->i_ext_dat[octave][idx]; - - return thrust::make_tuple( e.cell, e.sigma * powf( 2.0f, octave ) ); - } -}; - -struct FunctionIsAbove -{ - int _limit; - explicit FunctionIsAbove( int limit ) : _limit(limit) { } - - __host__ __device__ - inline bool operator()( int val ) const - { - return val > _limit; - } -}; - -struct FunctionDisableExtremum -{ - ExtremaBuffers* _buf; - - FunctionDisableExtremum( ExtremaBuffers* buf ) - { - _buf = buf; - } - - __device__ - inline void operator()( const thrust::tuple& val) const - { - const int octave = thrust::get<0>(val); - const int idx = thrust::get<1>(val); - InitialExtremum& e = _buf->i_ext_dat[octave][idx]; - e.ignore = true; - } -}; - -struct FunctionExtractIgnored -{ - ExtremaBuffers* _buf; - - FunctionExtractIgnored( ExtremaBuffers* buf ) - { - _buf = buf; - } - - __device__ - inline int operator()( int idx, int octave ) const - { - InitialExtremum& e = _buf->i_ext_dat[octave][idx]; - if( e.ignore ) - return 0; - else - return 1; - } -}; - -/* discard extrema that exceed a conf.getFilterMaxExtrema() */ -__host__ -int Pyramid::extrema_filter_grid( const Config& conf, int ext_total ) -{ - /* At this time, we have host-side information about ext_ct[o], the number - * of extrema we have found in octave o, and we have summed it up on the - * host size. However, other values in the hct and dct data structures - * have not been computed yet. - * The extrema are only known in the InitialExtrema structure. We want to - * perform grid filtering before their orientation is computed and they - * are copied into the larger Extrema data structure. - */ - const int slots = conf.getFilterGridSize(); - - thrust::device_vector octave_index( ext_total ); - thrust::device_vector iext_index ( ext_total ); - thrust::device_vector cell_values ( ext_total ); - thrust::device_vector scale_values( ext_total ); - thrust::device_vector cell_counts ( slots * slots ); - thrust::device_vector cell_offsets( slots * slots ); - - int sum = 0; - for( int o=0; oext_ct[o]; - if( ocount > 0 ) { - cudaStream_t oct_str = _octaves[o].getStream(); - - // fill a continuous device array with octave of all initial extrema - thrust::fill( thrust::cuda::par.on(oct_str), - octave_index.begin() + sum, - octave_index.begin() + sum + ocount, - o ); - // fill a continuous device array with index within octave of all initial extrema - thrust::sequence( thrust::cuda::par.on(oct_str), - iext_index.begin() + sum, - iext_index.begin() + sum + ocount ); - sum += ocount; - } - } - - cudaDeviceSynchronize(); - - // extract cell and scale value for all initial extrema - FunctionExtractCell fun_extract_cell( _buf ); - - thrust::transform( thrust::make_zip_iterator( thrust::make_tuple( octave_index.begin(), - iext_index.begin() ) ), - thrust::make_zip_iterator( thrust::make_tuple( octave_index.end(), - iext_index.end() ) ), - thrust::make_zip_iterator( thrust::make_tuple( cell_values.begin(), - scale_values.begin() ) ), - fun_extract_cell ); - if( conf.getFilterSorting() == Config::LargestScaleFirst ) - { - FunctionSort_IncCell_DecScale fun_sort; - thrust::sort_by_key( - thrust::make_zip_iterator( thrust::make_tuple( cell_values.begin(), - scale_values.begin() ) ), - thrust::make_zip_iterator( thrust::make_tuple( cell_values.end(), - scale_values.end() ) ), - thrust::make_zip_iterator( thrust::make_tuple( octave_index.begin(), - iext_index. begin() ) ), - fun_sort ); - } - else if( conf.getFilterSorting() == Config::SmallestScaleFirst ) - { - FunctionSort_IncCell_IncScale fun_sort; - thrust::sort_by_key( - thrust::make_zip_iterator( thrust::make_tuple( cell_values.begin(), - scale_values.begin() ) ), - thrust::make_zip_iterator( thrust::make_tuple( cell_values.end(), - scale_values.end() ) ), - thrust::make_zip_iterator( thrust::make_tuple( octave_index.begin(), - iext_index. begin() ) ), - fun_sort ); - } - else - { - // sort (octave,index,scale) tuples by their cell values (in which cell are they located) - thrust::sort_by_key( - cell_values.begin(), - cell_values.end(), - thrust::make_zip_iterator( thrust::make_tuple( octave_index.begin(), - iext_index. begin(), - scale_values.begin() ) ) ); - } - - // count the number of entries in all cells (in one operation instead of several reduce_if) - thrust::reduce_by_key( cell_values.begin(), cell_values.end(), - thrust::make_constant_iterator(1), - thrust::make_discard_iterator(), - cell_counts.begin() ); - - // compute the offsets from cell_values start for each of the (pre-sorted) cell values - thrust::exclusive_scan( cell_counts.begin(), cell_counts.end(), cell_offsets.begin() ); - - const int n = slots * slots; - thrust::host_vector h_cell_counts ( n ); - thrust::host_vector h_cell_permute ( n ); - thrust::host_vector h_cell_offsets ( n ); - thrust::host_vector h_cell_limits ( n ); - thrust::host_vector cell_count_prefix_sums( n ); - thrust::host_vector cell_count_sumup ( n ); - - // move to host code - computing the limits on the GPU is too wasteful - h_cell_counts = cell_counts; - - // offset to beginning of each cell value - recompute faster than copy - thrust::exclusive_scan( h_cell_counts.begin(), h_cell_counts.end(), h_cell_offsets.begin() ); - - // offset to end indeces of each cell value - could shift h_cell_offsets and sum up counts - thrust::inclusive_scan( h_cell_counts.begin(), h_cell_counts.end(), h_cell_limits .begin() ); - - // the cell filter algorithm requires the cell counts in increasing order, cell_permute - // maps new position to original index - thrust::sequence( h_cell_permute.begin(), h_cell_permute.end() ); - thrust::sort_by_key( h_cell_counts.begin(), h_cell_counts.end(), h_cell_permute.begin() ); - - // several steps to find the cells that must loose extrema - - // inclusive prefix sum - thrust::inclusive_scan( h_cell_counts.begin(), h_cell_counts.end(), cell_count_prefix_sums.begin() ); - - thrust::host_vector h_reverse_index(n); - thrust::sequence( h_reverse_index.begin(), h_reverse_index.end(), - n-1, - -1 ); - - // sumup[i] = prefix sum[i] + sum( cell[i] copied into remaining cells ) - thrust::transform( h_cell_counts.begin(), h_cell_counts.end(), - h_reverse_index.begin(), - cell_count_sumup.begin(), - thrust::multiplies() ); - thrust::transform( cell_count_sumup.begin(), cell_count_sumup.end(), - cell_count_prefix_sums.begin(), - cell_count_sumup.begin(), - thrust::plus() ); - - FunctionIsAbove function_is_above( conf.getFilterMaxExtrema() ); - - // count cells that are above the extrema limit after the summing. Those must share the - // reduction of extrema - int ct = thrust::count_if( cell_count_sumup.begin(), cell_count_sumup.end(), - function_is_above ); - - float tailaverage = float( thrust::reduce( &h_cell_counts[n-ct], &h_cell_counts[n] ) ) / ct; - - int newlimit = ::ceilf( tailaverage - ( ext_total - conf.getFilterMaxExtrema() ) / ct ); - - // clamp all cells to the computed limit - the total is now less than n extrema off - thrust::transform( h_cell_counts.begin(), h_cell_counts.end(), - thrust::make_constant_iterator(newlimit), - h_cell_counts.begin(), - thrust::minimum() ); - - // back to original order - thrust::sort_by_key( h_cell_permute.begin(), h_cell_permute.end(), h_cell_counts.begin() ); - - // transfer counts back to device - cell_counts = h_cell_counts; - - for( int i=0; i grid( ext_total ); - - int ret_ext_total = 0; - - for( int o=0; oext_ct[o]; - - if( ocount > 0 ) { - FunctionExtractIgnored fun_extract_ignore( _buf ); - thrust::identity fun_id; - - grid.resize( ocount ); - - thrust::transform( - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(ocount), - thrust::make_constant_iterator(o), - grid.begin(), - fun_extract_ignore ); - - thrust::device_ptr off_ptr = thrust::device_pointer_cast( _buf->i_ext_off[o] ); - - thrust::copy_if( thrust::make_counting_iterator(0), - thrust::make_counting_iterator(ocount), - grid.begin(), - off_ptr, - fun_id ); - - _ct->ext_ct[o] = thrust::reduce( grid.begin(), grid.end() ); - - ret_ext_total += _ct->ext_ct[o]; - } - } - - nvtxRangePushA( "writing back count" ); - writeDescCountersToDevice( ); - nvtxRangePop( ); - - return ret_ext_total; -} -}; // namespace popsift - -#else // not defined(DISABLE_GRID_FILTER) - -namespace popsift -{ -/* do nothing unless we have CUDA v 8 or newer */ -__host__ -int Pyramid::extrema_filter_grid( const Config& conf, int ext_total ) -{ - return ext_total; -} -}; // namespace popsift - -#endif // not defined(DISABLE_GRID_FILTER) - diff --git a/src/popsift/s_orientation.cu b/src/popsift/s_orientation.cu index cfbef679..0c7a6dfe 100644 --- a/src/popsift/s_orientation.cu +++ b/src/popsift/s_orientation.cu @@ -299,10 +299,13 @@ void ori_prefix_sum( ExtremaCounters* ct, ExtremaBuffers* buf, const int total_e __syncthreads(); if( threadIdx.x == 0 && threadIdx.y == 0 ) { + ct->make_extrema_prefix_sums( ); + /* replaced by the __device__ method above: ct->ext_ps[0] = 0; for( int o=1; oext_ps[o] = ct->ext_ps[o-1] + ct->ext_ct[o-1]; } + */ for( int o=0; oext_ct[o] == 0 ) { @@ -345,7 +348,7 @@ void Pyramid::orientation( const Config& conf ) // therefore add 10% slack. if( conf.getFilterMaxExtrema() > 0 && int(conf.getFilterMaxExtrema()*1.1) < ext_total ) { - ext_total = extrema_filter_grid( conf, ext_total ); + ext_total = _fg.filter( conf, _ct, _buf, ext_total ); } reallocExtrema( ext_total ); diff --git a/src/popsift/sift_conf.cu b/src/popsift/sift_conf.cu index b96901f7..0f1e1409 100644 --- a/src/popsift/sift_conf.cu +++ b/src/popsift/sift_conf.cu @@ -36,12 +36,9 @@ Config::Config( ) , _log_mode( Config::None ) , _scaling_mode( Config::ScaleDefault ) , _desc_mode( Config::Loop ) - , _grid_filter_mode( Config::RandomScale ) , verbose( false ) // , _max_extrema( 20000 ) , _max_extrema( 100000 ) - , _filter_max_extrema( -1 ) - , _filter_grid_size( 2 ) , _assume_initial_blur( true ) , _initial_blur( 0.5f ) , _normalization_mode( getNormModeDefault() ) @@ -150,6 +147,16 @@ const char* Config::getGaussModeUsage( ) "relative (synonym for vlfeat-hw-interpolated)"; } +void Config::setFilterMaxExtrema( int ext ) +{ + _grid_filter._max_extrema = ext; +} + +void Config::setFilterGridSize( int sz ) +{ + _grid_filter._size = sz; +} + bool Config::getCanFilterExtrema() const { #if __CUDACC_VER_MAJOR__ >= 8 @@ -162,18 +169,18 @@ bool Config::getCanFilterExtrema() const void Config::setFilterSorting( const std::string& text ) { if( text == "up" ) - _grid_filter_mode = Config::SmallestScaleFirst; + _grid_filter._mode = GridFilterConfig::SmallestScaleFirst; else if( text == "down" ) - _grid_filter_mode = Config::LargestScaleFirst; + _grid_filter._mode = GridFilterConfig::LargestScaleFirst; else if( text == "random" ) - _grid_filter_mode = Config::RandomScale; + _grid_filter._mode = GridFilterConfig::RandomScale; else POP_FATAL( "filter sorting mode must be one of up, down or random" ); } -void Config::setFilterSorting( Config::GridFilterMode m ) +void Config::setFilterSorting( GridFilterConfig::Mode m ) { - _grid_filter_mode = m; + _grid_filter._mode = m; } void Config::setVerbose( bool on ) @@ -320,8 +327,6 @@ std::string Config::getPeakThreshUsage( ) } void Config::setPrintGaussTables() { _print_gauss_tables = true; } -void Config::setFilterMaxExtrema( int ext ) { _filter_max_extrema = ext; } -void Config::setFilterGridSize( int sz ) { _filter_grid_size = sz; } void Config::setInitialBlur( float blur ) { @@ -378,4 +383,10 @@ bool Config::equal( const Config& other ) const return true; } +GridFilterConfig::GridFilterConfig( ) + : _max_extrema( -1 ) + , _size( 2 ) + , _mode( RandomScale ) +{ } + }; // namespace popsift diff --git a/src/popsift/sift_conf.h b/src/popsift/sift_conf.h index 4d615472..47151f85 100644 --- a/src/popsift/sift_conf.h +++ b/src/popsift/sift_conf.h @@ -23,6 +23,42 @@ namespace popsift { +struct GridFilterConfig +{ + /** + * @brief Filtering strategy. + * + * To reduce time used in descriptor extraction, some extrema can be filtered + * immediately after finding them. It is possible to keep those with the largest + * scale (LargestScaleFirst), smallest scale (SmallestScaleFirst), or a random + * selection. Note that largest and smallest give a stable result, random does not. + */ + enum Mode + { + /// keep a random selection + RandomScale, + /// keep those with the largest scale + LargestScaleFirst, + /// keep those with the smallest scale + SmallestScaleFirst + }; + + /// The maximum number of extrema that are returned. There may be + /// several descriptors for each extremum. + int _max_extrema; + + /// Used to achieve an approximation of _max_entrema + /// Subdivide the image in this number of vertical and horizontal tiles, + /// i.e. the grid is actually _grid_size X _grid_size tiles. + /// default: 1 + int _size; + + /// default: RandomScale + Mode _mode; + + GridFilterConfig( ); +}; + /** * @brief Struct containing the parameters that control the extraction algorithm */ @@ -114,23 +150,6 @@ struct Config NormDefault = RootSift }; - /** - * @brief Filtering strategy. - * - * To reduce time used in descriptor extraction, some extrema can be filtered - * immediately after finding them. It is possible to keep those with the largest - * scale (LargestScaleFirst), smallest scale (SmallestScaleFirst), or a random - * selection. Note that largest and smallest give a stable result, random does not. - */ - enum GridFilterMode { - /// keep a random selection - RandomScale, - /// keep those with the largest scale - LargestScaleFirst, - /// keep those with the smallest scale - SmallestScaleFirst - }; - /** * @brief Processing mode. * @@ -245,7 +264,7 @@ struct Config void setFilterMaxExtrema( int extrema ); void setFilterGridSize( int sz ); void setFilterSorting( const std::string& direction ); - void setFilterSorting( GridFilterMode m ); + void setFilterSorting( GridFilterConfig::Mode m ); /// print Gauss spans and tables? bool ifPrintGaussTables() const; @@ -340,24 +359,25 @@ struct Config * should be computed. Default is -1, which sets the hard limit defined * by "number of octaves * getMaxExtrema()". */ - int getFilterMaxExtrema() const { return _filter_max_extrema; } + int getFilterMaxExtrema() const { return _grid_filter._max_extrema; } /** * @brief Get the grid size for filtering. * * To avoid that grid filtering happens only in a tiny piece of an image, * the image is split into getFilterGridSize() X getFilterGridSize() tiles - * and we allow getFilterMaxExtrema() / getFilterGridSize() extrema in - * each tile. + * and we allow approximately getFilterMaxExtrema() / getFilterGridSize() + * extrema in each tile (grid cells can have more if one cell does not use + * its share. */ - int getFilterGridSize() const { return _filter_grid_size; } + int getFilterGridSize() const { return _grid_filter._size; } /** * @brief Get the filtering mode. * @return the filtering mode. * @see GridFilterMode */ - GridFilterMode getFilterSorting() const { return _grid_filter_mode; } + GridFilterConfig::Mode getFilterSorting() const { return _grid_filter._mode; } /** * @brief Get the scaling mode. @@ -398,9 +418,6 @@ struct Config /// default: DescMode::Loop DescMode _desc_mode; - /// default: RandomScale - GridFilterMode _grid_filter_mode; - public: bool verbose; @@ -409,15 +426,7 @@ struct Config /// This parameter changes memory requirements. int _max_extrema; - /// The maximum number of extrema that are returned. There may be - /// several descriptors for each extremum. - int _filter_max_extrema; - - /// Used to achieve an approximation of _max_entrema - /// Subdivide the image in this number of vertical and horizontal tiles, - /// i.e. the grid is actually _grid_size X _grid_size tiles. - /// default: 1 - int _filter_grid_size; + GridFilterConfig _grid_filter; /// Modes are computation according to VLFeat or OpenCV, /// or fixed size. Default is VLFeat mode. diff --git a/src/popsift/sift_extremum.h b/src/popsift/sift_extremum.h index 0363a02e..0af257cc 100755 --- a/src/popsift/sift_extremum.h +++ b/src/popsift/sift_extremum.h @@ -28,10 +28,18 @@ struct InitialExtremum float ypos; /// extremum refined into this level int lpos; - /// scale + /// scale - this is the accurate floating-point level within this octave float sigma; + /// index into the grid for grid-based extrema filtering int cell; + /// scale is sigma * powf( 2.0f, octave ) - sigma in the entire pyramid + float scale; + + /** during histogram creation, assign a unique number to this + * specific extremum in the cell (using the result of atomicAdd) + */ + int rank_in_cell; /// true if this extremum has been filtered bool ignore; /// if any initial extrema are ignored, new index for Extremum diff --git a/src/popsift/sift_pyramid.cu b/src/popsift/sift_pyramid.cu index 05d7d1c8..5dbf5f13 100644 --- a/src/popsift/sift_pyramid.cu +++ b/src/popsift/sift_pyramid.cu @@ -102,21 +102,16 @@ Pyramid::Pyramid( const Config& config, , _levels( config.levels + 3 ) , _assume_initial_blur( config.hasInitialBlur() ) , _initial_blur( config.getInitialBlur() ) + , _fg( config ) { _octaves = new Octave[_num_octaves]; int w = width; int h = height; - // memset( &hct, 0, sizeof(ExtremaCounters) ); - // cudaMemcpyToSymbol( dct, &hct, sizeof(ExtremaCounters), 0, cudaMemcpyHostToDevice ); - - cudaMallocManaged( &_ct, sizeof(ExtremaCounters) ); + _ct = popsift::cuda::malloc_mgdT( 1, __FILE__, __LINE__ ); memset( _ct, 0, sizeof(ExtremaCounters) ); - cudaMallocManaged( &_buf, sizeof(ExtremaBuffers) ); - memset( _buf, 0, sizeof(ExtremaBuffers) ); - _d_extrema_num_blocks = popsift::cuda::malloc_devT( _num_octaves, __FILE__, __LINE__ ); for (int o = 0; o<_num_octaves; o++) { @@ -126,28 +121,8 @@ Pyramid::Pyramid( const Config& config, h = ceilf(h / 2.0f); } - int sz = _num_octaves * h_consts.max_extrema; - _buf->i_ext_dat[0] = popsift::cuda::malloc_mgdT( sz, __FILE__, __LINE__); - _buf->i_ext_off[0] = popsift::cuda::malloc_mgdT( sz, __FILE__, __LINE__); - for (int o = 1; o<_num_octaves; o++) { - _buf->i_ext_dat[o] = _buf->i_ext_dat[0] + (o*h_consts.max_extrema); - _buf->i_ext_off[o] = _buf->i_ext_off[0] + (o*h_consts.max_extrema); - } - for (int o = _num_octaves; oi_ext_dat[o] = nullptr; - _buf->i_ext_off[o] = nullptr; - } - - sz = h_consts.max_extrema; - _buf->extrema = popsift::cuda::malloc_mgdT( sz, __FILE__, __LINE__); - _buf->features = popsift::cuda::malloc_mgdT( sz, __FILE__, __LINE__); - - _buf->ext_allocated = sz; - - sz = max( 2 * h_consts.max_extrema, h_consts.max_orientations ); - _buf->desc = popsift::cuda::malloc_mgdT( sz, __FILE__, __LINE__); - _buf->feat_to_ext_map = popsift::cuda::malloc_mgdT( sz, __FILE__, __LINE__); - _buf->ori_allocated = sz; + _buf = popsift::cuda::malloc_mgdT( 1, __FILE__, __LINE__ ); + _buf->init( _num_octaves, h_consts.max_extrema, h_consts.max_orientations ); cudaStreamCreate( &_download_stream ); } @@ -166,43 +141,19 @@ void Pyramid::resetDimensions( const Config& conf, int width, int height ) void Pyramid::reallocExtrema( int numExtrema ) { - if( numExtrema > _buf->ext_allocated ) { - numExtrema = ( ( numExtrema + 1024 ) & ( ~(1024-1) ) ); - cudaFree( _buf->extrema ); - cudaFree( _buf->features ); - - int sz = numExtrema; - _buf->extrema = popsift::cuda::malloc_mgdT( sz, __FILE__, __LINE__); - _buf->features = popsift::cuda::malloc_mgdT( sz, __FILE__, __LINE__); - _buf->ext_allocated = sz; - - numExtrema *= 2; - if( numExtrema > _buf->ori_allocated ) { - cudaFree( _buf->desc ); - cudaFree( _buf->feat_to_ext_map ); - - sz = numExtrema; - _buf->desc = popsift::cuda::malloc_mgdT( sz, __FILE__, __LINE__); - _buf->feat_to_ext_map = popsift::cuda::malloc_mgdT( sz, __FILE__, __LINE__); - _buf->ori_allocated = sz; - } - } + _buf->growBuffers( numExtrema ); } Pyramid::~Pyramid() { cudaStreamDestroy( _download_stream ); - cudaFree( _d_extrema_num_blocks ); - cudaFree( _buf->i_ext_dat[0] ); - cudaFree( _buf->i_ext_off[0] ); - cudaFree( _buf->features ); - cudaFree( _buf->extrema ); - cudaFree( _buf->desc ); - cudaFree( _buf->feat_to_ext_map ); + _buf->uninit( ); + cudaFree( _buf ); + + cudaFree( _d_extrema_num_blocks ); cudaFree( _ct ); - cudaFree( _buf ); delete[] _octaves; } @@ -427,5 +378,60 @@ void Pyramid::writeDescriptor( const Config& conf, ostream& ostr, FeaturesHost* } } +void ExtremaBuffers::init( int num_octaves, int max_extrema, int max_orientations ) +{ + memset( this, 0, sizeof(ExtremaBuffers) ); + + const int sz = num_octaves * max_extrema; + i_ext_dat[0] = popsift::cuda::malloc_mgdT( sz, __FILE__, __LINE__); + i_ext_off[0] = popsift::cuda::malloc_mgdT( sz, __FILE__, __LINE__); + for (int o = 1; o < num_octaves; o++) { + i_ext_dat[o] = i_ext_dat[0] + (o*max_extrema); + i_ext_off[o] = i_ext_off[0] + (o*max_extrema); + } + + extrema = popsift::cuda::malloc_mgdT( max_extrema, __FILE__, __LINE__); + features = popsift::cuda::malloc_mgdT ( max_extrema, __FILE__, __LINE__); + ext_allocated = max_extrema; + + const int osz = max( 2 * max_extrema, max_orientations ); + desc = popsift::cuda::malloc_mgdT( osz, __FILE__, __LINE__); + feat_to_ext_map = popsift::cuda::malloc_mgdT ( osz, __FILE__, __LINE__); + ori_allocated = osz; +} + +void ExtremaBuffers::uninit( ) +{ + cudaFree( feat_to_ext_map ); + cudaFree( desc ); + cudaFree( features ); + cudaFree( extrema ); + cudaFree( i_ext_off[0] ); + cudaFree( i_ext_dat[0] ); +} + +void ExtremaBuffers::growBuffers( int numExtrema ) +{ + numExtrema = ( ( numExtrema + 1024 ) & ( ~(1024-1) ) ); + + if( numExtrema <= ext_allocated ) return; + + cudaFree( extrema ); + cudaFree( features ); + + extrema = popsift::cuda::malloc_mgdT( numExtrema, __FILE__, __LINE__); + features = popsift::cuda::malloc_mgdT ( numExtrema, __FILE__, __LINE__); + ext_allocated = numExtrema; + + numExtrema *= 2; + if( numExtrema > ori_allocated ) { + cudaFree( desc ); + cudaFree( feat_to_ext_map ); + + desc = popsift::cuda::malloc_mgdT ( numExtrema, __FILE__, __LINE__); + feat_to_ext_map = popsift::cuda::malloc_mgdT( numExtrema, __FILE__, __LINE__); + ori_allocated = numExtrema; + } +} } // namespace popsift diff --git a/src/popsift/sift_pyramid.h b/src/popsift/sift_pyramid.h index 169c7a76..3bb7f7a9 100755 --- a/src/popsift/sift_pyramid.h +++ b/src/popsift/sift_pyramid.h @@ -8,6 +8,7 @@ #pragma once #include "features.h" +#include "filtergrid.h" #include "s_image.h" #include "sift_conf.h" #include "sift_constants.h" @@ -18,6 +19,9 @@ namespace popsift { +/** Datastructure in managed memory that holds counters for + * initially collected extrema and orientations. + */ struct ExtremaCounters { /* The number of extrema found per octave */ @@ -32,8 +36,31 @@ struct ExtremaCounters int ext_total; int ori_total; + + /** host and device function helper function that updates the exclusive + * prefix sum for extrema counts, updates ext_total and returns the total + * number of extrema. + * Note that exclusive prefix sum on the device could use several threads + * but we are talking about a constant size of 20. + */ + __device__ __host__ inline + int make_extrema_prefix_sums( ) + { + ext_ps[0] = 0; + for( int o=1; o Date: Fri, 26 Feb 2021 17:19:04 +0100 Subject: [PATCH 11/30] [popsift] update demo prog usage for new grid filter --- src/application/CMakeLists.txt | 4 +- src/application/common_opt.cpp | 111 +++++++++++++++++++++++++++++++++ src/application/common_opt.h | 18 ++++++ src/application/main.cpp | 65 ++----------------- src/application/match.cpp | 54 +++------------- src/popsift/sift_conf.cu | 27 ++++++-- src/popsift/sift_conf.h | 5 +- 7 files changed, 167 insertions(+), 117 deletions(-) create mode 100755 src/application/common_opt.cpp create mode 100755 src/application/common_opt.h diff --git a/src/application/CMakeLists.txt b/src/application/CMakeLists.txt index 3b28cec8..737d472c 100755 --- a/src/application/CMakeLists.txt +++ b/src/application/CMakeLists.txt @@ -68,7 +68,7 @@ endif(PopSift_USE_NVTX_PROFILING) # popsift-demo ############################################################# -add_executable(popsift-demo main.cpp pgmread.cpp pgmread.h) +add_executable(popsift-demo main.cpp pgmread.cpp pgmread.h common_opt.cpp) set_property(TARGET popsift-demo PROPERTY CXX_STANDARD 11) @@ -82,7 +82,7 @@ target_link_libraries(popsift-demo PUBLIC PopSift::popsift ${PD_LINK_LIBS}) # popsift-match ############################################################# -add_executable(popsift-match match.cpp pgmread.cpp pgmread.h) +add_executable(popsift-match match.cpp pgmread.cpp pgmread.h common_opt.cpp) set_property(TARGET popsift-match PROPERTY CXX_STANDARD 11) diff --git a/src/application/common_opt.cpp b/src/application/common_opt.cpp new file mode 100755 index 00000000..229d35f2 --- /dev/null +++ b/src/application/common_opt.cpp @@ -0,0 +1,111 @@ +/* + * Copyright 2021, University of Oslo + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ +#include "common_opt.h" + +// #include +// #include +// #include +// #include +// #include + +// #include + +// #include +// #include +// #include +// #include +// #include +// #include +// #include +// #include +#include + +using namespace std; +using namespace boost::program_options; + +void option_init_parameters( popsift::Config& config, options_description& parameters ) +{ + parameters.add_options() + ("octaves", + value(&config.octaves)->default_value(config.getOctaves()), + "Number of octaves") + ("levels", + value(&config.levels)->default_value(config.getLevels()), + "Number of levels per octave") + ("sigma", + value()->notifier([&](float f) { config.setSigma(f); })->default_value(config.getSigma()), + "Initial sigma value") + ("threshold", + value()->notifier([&](float f) { config.setThreshold(f); })->default_value(config.getThreshold()), + popsift::Config::getPeakThreshUsage().c_str() ) + ("edge-threshold", + value()->notifier([&](float f) { config.setEdgeLimit(f); })->default_value(config.getEdgeLimit()), + popsift::Config::getEdgeThreshUsage().c_str() ) + ("edge-limit", + value()->notifier([&](float f) { config.setEdgeLimit(f); }), + "synonym to --edge-threshold" ) + ("downsampling", + value()->notifier([&](float f) { config.setDownsampling(f); })->default_value(config.getDownsampling()), + "Downscale width and height of input by 2^N") + ("initial-blur", + value()->notifier([&](float f) {config.setInitialBlur(f); })->default_value(config.getInitialBlur()), + "Assume initial blur, subtract when blurring first time"); +} + +void option_init_modes( popsift::Config& config, options_description& modes ) +{ + modes.add_options() + ( "gauss-mode", + value()->notifier([&](const std::string& s) { config.setGaussMode(s); }), + popsift::Config::getGaussModeUsage() ) + // "Choice of span (1-sided) for Gauss filters. Default is VLFeat-like computation depending on sigma. " + // "Options are: vlfeat, relative, relative-all, opencv, fixed9, fixed15" + ( "desc-mode", + value()->notifier([&](const std::string& s) { config.setDescMode(s); }), + popsift::Config::getDescModeUsage() ) + ( "popsift-mode", + bool_switch()->notifier([&](bool b) { if(b) config.setMode(popsift::Config::PopSift); }), + "During the initial upscale, shift pixels by 1. In extrema refinement, steps up to 0.6, do not reject points when reaching max iterations, " + "first contrast threshold is .8 * peak thresh. Shift feature coords octave 0 back to original pos.") + ( "vlfeat-mode", + bool_switch()->notifier([&](bool b) { if(b) config.setMode(popsift::Config::VLFeat); }), + "During the initial upscale, shift pixels by 1. That creates a sharper upscaled image. " + "In extrema refinement, steps up to 0.6, levels remain unchanged, " + "do not reject points when reaching max iterations, " + "first contrast threshold is .8 * peak thresh.") + ( "opencv-mode", + bool_switch()->notifier([&](bool b) { if(b) config.setMode(popsift::Config::OpenCV); }), + "During the initial upscale, shift pixels by 0.5. " + "In extrema refinement, steps up to 0.5, " + "reject points when reaching max iterations, " + "first contrast threshold is floor(.5 * peak thresh). " + "Computed filter width are lower than VLFeat/PopSift") + ( "direct-scaling", + bool_switch()->notifier([&](bool b) { if(b) config.setScalingMode(popsift::Config::ScaleDirect); }), + "Direct each octave from upscaled orig instead of blurred level.") + ( "norm-multi", + value()->notifier([&](int i) {config.setNormalizationMultiplier(i); }), + "Multiply the descriptor by pow(2,).") + ( "norm-mode", + value()->notifier([&](const std::string& s) { config.setNormMode(s); }), + popsift::Config::getNormModeUsage() ) + ( "root-sift", + bool_switch()->notifier([&](bool b) { if(b) config.setNormMode(popsift::Config::RootSift); }), + "synonym to --norm-mode=RootSift" ) + ( "filter-max-extrema", + value()->notifier([&](int f) {config.setFilterMaxExtrema(f); }), + "Approximate max number of extrema.") + ( "filter-grid", + value()->notifier([&](int f) {config.setFilterGridSize(f); }), + "Grid edge length for extrema filtering (ie. value 4 leads to a 4x4 grid)") + ( "filter-sort", + value()->notifier([&](const std::string& s) {config.setFilterSorting(s); }), + popsift::Config::getFilterGridModeUsage() ); + +} + diff --git a/src/application/common_opt.h b/src/application/common_opt.h new file mode 100755 index 00000000..1da06100 --- /dev/null +++ b/src/application/common_opt.h @@ -0,0 +1,18 @@ +/* + * Copyright 2021, University of Oslo + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ +#pragma once + +#include + +#include + +void option_init_parameters( popsift::Config& config, + boost::program_options::options_description& parameters ); +void option_init_modes( popsift::Config& config, + boost::program_options::options_description& modes ); + diff --git a/src/application/main.cpp b/src/application/main.cpp index e20c376f..46c99b20 100755 --- a/src/application/main.cpp +++ b/src/application/main.cpp @@ -29,6 +29,7 @@ #include #endif #include "pgmread.h" +#include "common_opt.h" #if POPSIFT_IS_DEFINED(POPSIFT_USE_NVTX) #include @@ -60,69 +61,13 @@ static void parseargs(int argc, char** argv, popsift::Config& config, string& in ("input-file,i", value(&inputFile)->required(), "Input file"); } + options_description parameters("Parameters"); - { - parameters.add_options() - ("octaves", - value(&config.octaves)->default_value(config.getOctaves()), - "Number of octaves") - ("levels", - value(&config.levels)->default_value(config.getLevels()), - "Number of levels per octave") - ("sigma", - value()->notifier([&](float f) { config.setSigma(f); })->default_value(config.getSigma()), - "Initial sigma value") - ("threshold", - value()->notifier([&](float f) { config.setThreshold(f); })->default_value(config.getThreshold()), - popsift::Config::getPeakThreshUsage().c_str() ) - ("edge-threshold", - value()->notifier([&](float f) { config.setEdgeLimit(f); })->default_value(config.getEdgeLimit()), - popsift::Config::getEdgeThreshUsage().c_str() ) - ("edge-limit", - value()->notifier([&](float f) { config.setEdgeLimit(f); }), - "synonym to --edge-threshold" ) - ("downsampling", - value()->notifier([&](float f) { config.setDownsampling(f); })->default_value(config.getDownsampling()), - "Downscale width and height of input by 2^N") - ("initial-blur", - value()->notifier([&](float f) {config.setInitialBlur(f); })->default_value(config.getInitialBlur()), - "Assume initial blur, subtract when blurring first time"); - } options_description modes("Modes"); - { - modes.add_options() - ( "gauss-mode", value()->notifier([&](const std::string& s) { config.setGaussMode(s); }), - popsift::Config::getGaussModeUsage() ) - // "Choice of span (1-sided) for Gauss filters. Default is VLFeat-like computation depending on sigma. " - // "Options are: vlfeat, relative, relative-all, opencv, fixed9, fixed15" - ( "desc-mode", value()->notifier([&](const std::string& s) { config.setDescMode(s); }), - popsift::Config::getDescModeUsage() ) - ("popsift-mode", bool_switch()->notifier([&](bool b) { if(b) config.setMode(popsift::Config::PopSift); }), - "During the initial upscale, shift pixels by 1. In extrema refinement, steps up to 0.6, do not reject points when reaching max iterations, " - "first contrast threshold is .8 * peak thresh. Shift feature coords octave 0 back to original pos.") - ("vlfeat-mode", bool_switch()->notifier([&](bool b) { if(b) config.setMode(popsift::Config::VLFeat); }), - "During the initial upscale, shift pixels by 1. That creates a sharper upscaled image. " - "In extrema refinement, steps up to 0.6, levels remain unchanged, " - "do not reject points when reaching max iterations, " - "first contrast threshold is .8 * peak thresh.") - ("opencv-mode", bool_switch()->notifier([&](bool b) { if(b) config.setMode(popsift::Config::OpenCV); }), - "During the initial upscale, shift pixels by 0.5. " - "In extrema refinement, steps up to 0.5, " - "reject points when reaching max iterations, " - "first contrast threshold is floor(.5 * peak thresh). " - "Computed filter width are lower than VLFeat/PopSift") - ("direct-scaling", bool_switch()->notifier([&](bool b) { if(b) config.setScalingMode(popsift::Config::ScaleDirect); }), - "Direct each octave from upscaled orig instead of blurred level.") - ("norm-multi", value()->notifier([&](int i) {config.setNormalizationMultiplier(i); }), "Multiply the descriptor by pow(2,).") - ( "norm-mode", value()->notifier([&](const std::string& s) { config.setNormMode(s); }), - popsift::Config::getNormModeUsage() ) - ( "root-sift", bool_switch()->notifier([&](bool b) { if(b) config.setNormMode(popsift::Config::RootSift); }), - "synonym to --norm-mode=RootSift" ) - ("filter-max-extrema", value()->notifier([&](int f) {config.setFilterMaxExtrema(f); }), "Approximate max number of extrema.") - ("filter-grid", value()->notifier([&](int f) {config.setFilterGridSize(f); }), "Grid edge length for extrema filtering (ie. value 4 leads to a 4x4 grid)") - ("filter-sort", value()->notifier([&](const std::string& s) {config.setFilterSorting(s); }), "Sort extrema in each cell by scale, either random (default), up or down"); - } + option_init_parameters( config, parameters ); + option_init_modes( config, modes ); + options_description informational("Informational"); { informational.add_options() diff --git a/src/application/match.cpp b/src/application/match.cpp index 852d9b62..0898774f 100755 --- a/src/application/match.cpp +++ b/src/application/match.cpp @@ -29,6 +29,7 @@ #include #endif #include "pgmread.h" +#include "common_opt.h" #if POPSIFT_IS_DEFINED(POPSIFT_USE_NVTX) #include @@ -45,7 +46,8 @@ static bool write_as_uchar {false}; static bool dont_write {false}; static bool pgmread_loading {false}; -static void parseargs(int argc, char** argv, popsift::Config& config, string& lFile, string& rFile) { +static void parseargs(int argc, char** argv, popsift::Config& config, string& lFile, string& rFile) +{ using namespace boost::program_options; options_description options("Options"); @@ -59,55 +61,13 @@ static void parseargs(int argc, char** argv, popsift::Config& config, string& lF ("right,r", value(&rFile)->required(), "\"Right\" input file"); } + options_description parameters("Parameters"); - { - parameters.add_options() - ("octaves", value(&config.octaves), "Number of octaves") - ("levels", value(&config.levels), "Number of levels per octave") - ("sigma", value()->notifier([&](float f) { config.setSigma(f); }), "Initial sigma value") - - ("threshold", value()->notifier([&](float f) { config.setThreshold(f); }), "Contrast threshold") - ("edge-threshold", value()->notifier([&](float f) { config.setEdgeLimit(f); }), "On-edge threshold") - ("edge-limit", value()->notifier([&](float f) { config.setEdgeLimit(f); }), "On-edge threshold") - ("downsampling", value()->notifier([&](float f) { config.setDownsampling(f); }), "Downscale width and height of input by 2^N") - ("initial-blur", value()->notifier([&](float f) {config.setInitialBlur(f); }), "Assume initial blur, subtract when blurring first time"); - } options_description modes("Modes"); - { - modes.add_options() - ( "gauss-mode", value()->notifier([&](const std::string& s) { config.setGaussMode(s); }), - popsift::Config::getGaussModeUsage() ) - ("desc-mode", value()->notifier([&](const std::string& s) { config.setDescMode(s); }), - "Choice of descriptor extraction modes:\n" - "loop, iloop, grid, igrid, notile\n" - "Default is loop\n" - "loop is OpenCV-like horizontal scanning, computing only valid points, grid extracts only useful points but rounds them, iloop uses linear texture and rotated gradiant fetching. igrid is grid with linear interpolation. notile is like igrid but avoids redundant gradiant fetching.") - ("popsift-mode", bool_switch()->notifier([&](bool b) { if(b) config.setMode(popsift::Config::PopSift); }), - "During the initial upscale, shift pixels by 1. In extrema refinement, steps up to 0.6, do not reject points when reaching max iterations, " - "first contrast threshold is .8 * peak thresh. Shift feature coords octave 0 back to original pos.") - ("vlfeat-mode", bool_switch()->notifier([&](bool b) { if(b) config.setMode(popsift::Config::VLFeat); }), - "During the initial upscale, shift pixels by 1. That creates a sharper upscaled image. " - "In extrema refinement, steps up to 0.6, levels remain unchanged, " - "do not reject points when reaching max iterations, " - "first contrast threshold is .8 * peak thresh.") - ("opencv-mode", bool_switch()->notifier([&](bool b) { if(b) config.setMode(popsift::Config::OpenCV); }), - "During the initial upscale, shift pixels by 0.5. " - "In extrema refinement, steps up to 0.5, " - "reject points when reaching max iterations, " - "first contrast threshold is floor(.5 * peak thresh). " - "Computed filter width are lower than VLFeat/PopSift") - ("direct-scaling", bool_switch()->notifier([&](bool b) { if(b) config.setScalingMode(popsift::Config::ScaleDirect); }), - "Direct each octave from upscaled orig instead of blurred level.") - ("norm-multi", value()->notifier([&](int i) {config.setNormalizationMultiplier(i); }), "Multiply the descriptor by pow(2,).") - ( "norm-mode", value()->notifier([&](const std::string& s) { config.setNormMode(s); }), - popsift::Config::getNormModeUsage() ) - ( "root-sift", bool_switch()->notifier([&](bool b) { if(b) config.setNormMode(popsift::Config::RootSift); }), - popsift::Config::getNormModeUsage() ) - ("filter-max-extrema", value()->notifier([&](int f) {config.setFilterMaxExtrema(f); }), "Approximate max number of extrema.") - ("filter-grid", value()->notifier([&](int f) {config.setFilterGridSize(f); }), "Grid edge length for extrema filtering (ie. value 4 leads to a 4x4 grid)") - ("filter-sort", value()->notifier([&](const std::string& s) {config.setFilterSorting(s); }), "Sort extrema in each cell by scale, either random (default), up or down"); - } + option_init_parameters( config, parameters ); + option_init_modes( config, modes ); + options_description informational("Informational"); { informational.add_options() diff --git a/src/popsift/sift_conf.cu b/src/popsift/sift_conf.cu index 0f1e1409..8b944156 100644 --- a/src/popsift/sift_conf.cu +++ b/src/popsift/sift_conf.cu @@ -166,16 +166,31 @@ bool Config::getCanFilterExtrema() const #endif } +const char* Config::getFilterGridModeUsage( ) +{ + return "l/largest (default) or s/smallest. " + "largest: keep the extrema with the largest scales from each grid-filtered cell. " + "smallest: keep the extrema with the smallest scales. " + "The first octave has the smallest scales."; +} + +GridFilterConfig::Mode Config::getFilterGridModeDefault( ) +{ + return GridFilterConfig::LargestScaleFirst; +} + void Config::setFilterSorting( const std::string& text ) { - if( text == "up" ) + if( stringIsame( text, "smallest" ) ) _grid_filter._mode = GridFilterConfig::SmallestScaleFirst; - else if( text == "down" ) + if( stringIsame( text, "s" ) ) + _grid_filter._mode = GridFilterConfig::SmallestScaleFirst; + else if( stringIsame( text, "largest" ) ) + _grid_filter._mode = GridFilterConfig::LargestScaleFirst; + else if( stringIsame( text, "l" ) ) _grid_filter._mode = GridFilterConfig::LargestScaleFirst; - else if( text == "random" ) - _grid_filter._mode = GridFilterConfig::RandomScale; else - POP_FATAL( "filter sorting mode must be one of up, down or random" ); + POP_FATAL( string("bad grid filter mode, ") + getFilterGridModeUsage() ); } void Config::setFilterSorting( GridFilterConfig::Mode m ) @@ -386,7 +401,7 @@ bool Config::equal( const Config& other ) const GridFilterConfig::GridFilterConfig( ) : _max_extrema( -1 ) , _size( 2 ) - , _mode( RandomScale ) + , _mode( LargestScaleFirst ) { } }; // namespace popsift diff --git a/src/popsift/sift_conf.h b/src/popsift/sift_conf.h index 47151f85..2932442c 100644 --- a/src/popsift/sift_conf.h +++ b/src/popsift/sift_conf.h @@ -35,8 +35,6 @@ struct GridFilterConfig */ enum Mode { - /// keep a random selection - RandomScale, /// keep those with the largest scale LargestScaleFirst, /// keep those with the smallest scale @@ -263,6 +261,9 @@ struct Config // void setDPOrientation( bool on ); void setFilterMaxExtrema( int extrema ); void setFilterGridSize( int sz ); + + static GridFilterConfig::Mode getFilterGridModeDefault( ); + static const char* getFilterGridModeUsage( ); void setFilterSorting( const std::string& direction ); void setFilterSorting( GridFilterConfig::Mode m ); From c21d399fd1cdaaaca86742378823dafe3f7bb2b8 Mon Sep 17 00:00:00 2001 From: Carsten Griwodz Date: Sat, 27 Feb 2021 11:59:30 +0100 Subject: [PATCH 12/30] [popsift] move InitialExtremum to own file, clean up --- src/CMakeLists.txt | 1 + src/popsift/filtergrid.cu | 10 ++-- src/popsift/initial_extremum.h | 94 ++++++++++++++++++++++++++++++++++ src/popsift/s_extrema.cu | 17 ++---- src/popsift/s_orientation.cu | 18 +++---- src/popsift/sift_constants.h | 2 + src/popsift/sift_extremum.h | 36 +------------ src/popsift/sift_pyramid.h | 1 + 8 files changed, 118 insertions(+), 61 deletions(-) create mode 100755 src/popsift/initial_extremum.h diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index fd4979ec..ea98eaee 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -6,6 +6,7 @@ CUDA_ADD_LIBRARY(popsift popsift/sift_conf.cu popsift/sift_conf.h popsift/gauss_filter.cu popsift/gauss_filter.h popsift/filtergrid.cu popsift/filtergrid.h + popsift/initial_extremum.h popsift/s_image.cu popsift/s_image.h popsift/sift_pyramid.cu popsift/sift_pyramid.h popsift/sift_octave.cu popsift/sift_octave.h diff --git a/src/popsift/filtergrid.cu b/src/popsift/filtergrid.cu index 6b4435a8..315b4419 100755 --- a/src/popsift/filtergrid.cu +++ b/src/popsift/filtergrid.cu @@ -78,8 +78,8 @@ fg_init( const int octave, InitialExtremum* ie = &buf->i_ext_dat[octave][my_idx_in_octave]; - cells[my_idx] = ie->cell; - scales[my_idx] = ie->scale; + cells[my_idx] = ie->getCell(); + scales[my_idx] = ie->getScale(); /* Keep pointers in this pointer array. The indirection is not elegant for CUDA, * we can improve that later by making additional arrays for .cell and .scale @@ -289,7 +289,7 @@ void FilterGrid::prune_extrema( GridFilterConfig::Mode mode ) { const int sorted_index = _sorting_index[base+d]; InitialExtremum* ie = _initial_extrema_pointers[sorted_index]; - ie->ignore = true; + ie->setIgnore(); } } else // mode == GridFilterConfig::SmallestScaleFirst @@ -299,7 +299,7 @@ void FilterGrid::prune_extrema( GridFilterConfig::Mode mode ) { const int sorted_index = _sorting_index[base-d-1]; InitialExtremum* ie = _initial_extrema_pointers[sorted_index]; - ie->ignore = true; + ie->setIgnore(); } } } @@ -335,7 +335,7 @@ int FilterGrid::filter( const Config& conf, ExtremaCounters* ct, ExtremaBuffers* for( int i=0; ii_ext_dat[o][i]; - if( iext->ignore == false ) + if( iext->isIgnored() == false ) { buf->i_ext_off[o][counter] = i; counter += 1; diff --git a/src/popsift/initial_extremum.h b/src/popsift/initial_extremum.h new file mode 100755 index 00000000..84e26cfa --- /dev/null +++ b/src/popsift/initial_extremum.h @@ -0,0 +1,94 @@ +/* + * Copyright 2021, University of Oslo + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ +#pragma once + +// #include "sift_constants.h" + +namespace popsift { + +/** + * @brief This is an internal data structure. + * Separated from the final Extremum data structure to implement + * grid filtering in a space-efficient manner. In grid filtering, + * extrema are first found, after that some may be discarded in + * some spatial regions of the image. Avoid waste of space by + * allocating Extremum structures only for the remaining ones. + */ +class InitialExtremum +{ +public: + __device__ inline + void set( const int& oct, const float& x, const float& y, const float& z, const int& cellidx ) + { + ignore = false; + octave = oct; + xpos = x; + ypos = y; + if( z == 0 ) + { + lpos = 0; + sigma = 0.0f; + } + else + { + lpos = (int)roundf(z); + sigma = d_consts.sigma0 * pow(d_consts.sigma_k, z); + } + + cell = cellidx; + } + + __device__ inline const int& getOctave( ) const { return octave; } + __device__ inline const float& getX( ) const { return xpos; } + __device__ inline const float& getY( ) const { return ypos; } + __device__ inline const int& getLevel( ) const { return lpos; } + __device__ inline const float& getSigma( ) const { return sigma; } + __device__ inline const int& getCell( ) const { return cell; } + + /** Compute the accurate floating-point level within the image + * pyramid. It does not compensate for up- and downscaling of the + * input image but returns the value relative to the first octave + * that is actually computed. + * note: Online used once in grid filtering. Therefore not pre-computed. + */ + __device__ inline float getScale( ) const { + return sigma * powf( 2.0f, octave ); + } + + /** Set the ignore flag. Only used in grid filtering. */ + __host__ __device__ inline void setIgnore( ) { ignore = true; } + + /** Test the ignore flag. Only used in grid filtering. */ + __host__ __device__ inline bool isIgnored( ) const { return ignore; } + +private: + /// true if this extremum has been filtered + bool ignore; + + /** The octave where this point was found. */ + int octave; + + float xpos; + float ypos; + + /** This is the closest level of the octave for this initial + * extremum. It is rounded from the computed continuous level. + */ + int lpos; + + /** This is the accurate floating-point level within this octave. + * It requires a transformation of the "z" value that is derived + * from refining the z-position in the octave. + */ + float sigma; + + /// index into the grid for grid-based extrema filtering + int cell; +}; + +} // namespace popsift diff --git a/src/popsift/s_extrema.cu b/src/popsift/s_extrema.cu index e7541e7a..0e1ab18b 100644 --- a/src/popsift/s_extrema.cu +++ b/src/popsift/s_extrema.cu @@ -308,10 +308,7 @@ __device__ inline bool find_extrema_in_dog_sub(cudaTextureObject_t dog, int grid_width, InitialExtremum& ec) { - ec.xpos = 0.0f; - ec.ypos = 0.0f; - ec.lpos = 0; - ec.sigma = 0.0f; + ec.set( octave, 0.0f, 0.0f, 0.0f, 0 ); /* * First consideration: extrema cannot be found on any outermost edge, @@ -499,13 +496,9 @@ __device__ inline bool find_extrema_in_dog_sub(cudaTextureObject_t dog, } } - ec.xpos = xn; - ec.ypos = yn; - ec.lpos = (int)roundf(sn); - ec.sigma = d_consts.sigma0 * pow(d_consts.sigma_k, sn); // * 2; - ec.scale = ec.sigma * powf( 2.0f, octave ); - ec.cell = floorf( yn / h_grid_divider ) * grid_width + floorf( xn / w_grid_divider ); - // const float sigma_k = powf(2.0f, 1.0f / levels ); + ec.set( octave, + xn, yn, sn, + floorf( yn / h_grid_divider ) * grid_width + floorf( xn / w_grid_divider ) ); return true; } @@ -527,7 +520,6 @@ void find_extrema_in_dog( cudaTextureObject_t dog, const int grid_width ) { InitialExtremum ec; - ec.ignore = false; bool indicator = find_extrema_in_dog_sub( dog, octave, @@ -545,7 +537,6 @@ void find_extrema_in_dog( cudaTextureObject_t dog, int* d_ext_off = buf->i_ext_off[octave]; if( indicator && write_index < d_consts.max_extrema ) { - ec.write_index = write_index; // store the initial extremum in an array d_extrema[write_index] = ec; diff --git a/src/popsift/s_orientation.cu b/src/popsift/s_orientation.cu index 0c7a6dfe..ab8f2261 100644 --- a/src/popsift/s_orientation.cu +++ b/src/popsift/s_orientation.cu @@ -75,10 +75,10 @@ void ori_par( const int octave, hist[threadIdx.x+32] = 0.0f; /* keypoint fractional geometry */ - const float x = iext->xpos; - const float y = iext->ypos; - const int level = iext->lpos; // old_level; - const float sig = iext->sigma; + const float& x = iext->getX(); + const float& y = iext->getY(); + const int& level = iext->getLevel(); + const float& sig = iext->getSigma(); /* orientation histogram radius */ const float sigw = ORI_WINFACTOR * sig; @@ -215,11 +215,11 @@ void ori_par( const int octave, int angles = __popc( popsift::ballot( written ) ); if( threadIdx.x == 0 ) { - ext->xpos = iext->xpos; - ext->ypos = iext->ypos; - ext->lpos = iext->lpos; - ext->sigma = iext->sigma; - ext->octave = octave; + ext->xpos = iext->getX(); + ext->ypos = iext->getY(); + ext->lpos = iext->getLevel(); + ext->sigma = iext->getSigma(); + ext->octave = iext->getOctave(); ext->num_ori = angles; } } diff --git a/src/popsift/sift_constants.h b/src/popsift/sift_constants.h index 20b3012d..305e8658 100755 --- a/src/popsift/sift_constants.h +++ b/src/popsift/sift_constants.h @@ -9,6 +9,8 @@ #include +#include "sift_config.h" + #ifndef INF #define INF (1<<29) #endif diff --git a/src/popsift/sift_extremum.h b/src/popsift/sift_extremum.h index 0af257cc..138ef865 100755 --- a/src/popsift/sift_extremum.h +++ b/src/popsift/sift_extremum.h @@ -9,43 +9,11 @@ #include "sift_constants.h" -#include -#include +// #include +// #include namespace popsift { -/** - * @brief This is an internal data structure. - * Separated from the final Extremum data structure to implement - * grid filtering in a space-efficient manner. In grid filtering, - * extrema are first found, after that some may be discarded in - * some spatial regions of the image. Avoid waste of space by - * allocating Extremum structures only for the remaining ones. - */ -struct InitialExtremum -{ - float xpos; - float ypos; - /// extremum refined into this level - int lpos; - /// scale - this is the accurate floating-point level within this octave - float sigma; - - /// index into the grid for grid-based extrema filtering - int cell; - /// scale is sigma * powf( 2.0f, octave ) - sigma in the entire pyramid - float scale; - - /** during histogram creation, assign a unique number to this - * specific extremum in the cell (using the result of atomicAdd) - */ - int rank_in_cell; - /// true if this extremum has been filtered - bool ignore; - /// if any initial extrema are ignored, new index for Extremum - int write_index; -}; - /** * @brief This is an internal data structure. * For performance reasons, it would be appropriate to split diff --git a/src/popsift/sift_pyramid.h b/src/popsift/sift_pyramid.h index 3bb7f7a9..5065ccfa 100755 --- a/src/popsift/sift_pyramid.h +++ b/src/popsift/sift_pyramid.h @@ -13,6 +13,7 @@ #include "sift_conf.h" #include "sift_constants.h" #include "sift_octave.h" +#include "initial_extremum.h" #include #include From 79de1d52e3b47008754e4358d34056ed93a57d65 Mon Sep 17 00:00:00 2001 From: Carsten Griwodz Date: Sun, 28 Feb 2021 11:17:45 +0100 Subject: [PATCH 13/30] [popsift] simplify extrema prefix sum in ExtremaCounters --- src/popsift/filtergrid.cu | 20 ++++++------- src/popsift/s_desc_grid.cu | 2 +- src/popsift/s_desc_igrid.cu | 2 +- src/popsift/s_desc_iloop.cu | 2 +- src/popsift/s_desc_loop.cu | 2 +- src/popsift/s_desc_notile.cu | 2 +- src/popsift/s_desc_vlfeat.cu | 2 +- src/popsift/s_orientation.cu | 51 +++++++++----------------------- src/popsift/sift_desc.cu | 8 +++--- src/popsift/sift_pyramid.cu | 24 ++++++++-------- src/popsift/sift_pyramid.h | 56 ++++++++++++++++++++++++++++++------ 11 files changed, 92 insertions(+), 79 deletions(-) diff --git a/src/popsift/filtergrid.cu b/src/popsift/filtergrid.cu index 315b4419..b4b86e5b 100755 --- a/src/popsift/filtergrid.cu +++ b/src/popsift/filtergrid.cu @@ -70,7 +70,7 @@ fg_init( const int octave, if( my_idx_in_octave > extrema_in_octave ) return; - const int base_index = ct->ext_ps[octave]; + const int base_index = ct->getExtremaBase( octave ); const int my_idx = base_index + my_idx_in_octave; /* This is the array we are going to use for sorting and filtering and so on. */ @@ -95,7 +95,7 @@ void FilterGrid::init( ExtremaBuffers* buf, ExtremaCounters* ct ) _ct->make_extrema_prefix_sums(); - if( _alloc_size < _ct->ext_total ) + if( _alloc_size < _ct->getTotalExtrema() ) { if( _alloc_size > 0 ) { @@ -104,7 +104,7 @@ void FilterGrid::init( ExtremaBuffers* buf, ExtremaCounters* ct ) cudaFree( _scales ); cudaFree( _initial_extrema_pointers ); } - _alloc_size = _ct->ext_total; + _alloc_size = _ct->getTotalExtrema(); _sorting_index = popsift::cuda::malloc_mgdT( _alloc_size, __FILE__, __LINE__); _cells = popsift::cuda::malloc_mgdT( _alloc_size, __FILE__, __LINE__); _scales = popsift::cuda::malloc_mgdT( _alloc_size, __FILE__, __LINE__); @@ -160,11 +160,11 @@ fg_countcells( const int ext_total, void FilterGrid::count_cells( ) { dim3 block( 32 ); - dim3 grid( grid_divide( _ct->ext_total, block.x ) ); + dim3 grid( grid_divide( _ct->getTotalExtrema(), block.x ) ); fg_countcells <<>> - ( _ct->ext_total, + ( _ct->getTotalExtrema(), _cells, _slots, _histogram_full ); @@ -216,10 +216,10 @@ void FilterGrid::sort_by_cell_and_scale( ) #if 0 cudaDeviceSynchronize(); int* ptr = _sorting_index; - std::sort( ptr, ptr + _ct->ext_total, tc ); + std::sort( ptr, ptr + _ct->getTotalExtrema(), tc ); #else thrust::device_ptr ptr = thrust::device_pointer_cast( _sorting_index ); - thrust::sort( ptr, ptr + _ct->ext_total, tc ); + thrust::sort( ptr, ptr + _ct->getTotalExtrema(), tc ); #endif } @@ -327,7 +327,6 @@ int FilterGrid::filter( const Config& conf, ExtremaCounters* ct, ExtremaBuffers* level_histogram( max_extrema ); prune_extrema( conf.getFilterSorting() ); - int filtered_ext_total = 0; for( int o=0; oext_ct[o]; @@ -342,10 +341,11 @@ int FilterGrid::filter( const Config& conf, ExtremaCounters* ct, ExtremaBuffers* } } ct->ext_ct[o] = counter; - filtered_ext_total += counter; } - return filtered_ext_total; + ct->make_extrema_prefix_sums(); + + return ct->getTotalExtrema(); } }; // namespace popsift diff --git a/src/popsift/s_desc_grid.cu b/src/popsift/s_desc_grid.cu index 8ecea81a..c1525166 100644 --- a/src/popsift/s_desc_grid.cu +++ b/src/popsift/s_desc_grid.cu @@ -169,7 +169,7 @@ bool start_ext_desc_grid( const ExtremaCounters* ct, ExtremaBuffers* buf, const <<>> ( buf, octave, - ct->ori_ps[octave], + ct->getOrientationBase( octave ), oct_obj.getDataTexPoint( ) ); POP_SYNC_CHK; diff --git a/src/popsift/s_desc_igrid.cu b/src/popsift/s_desc_igrid.cu index 2d8119fb..3424a968 100644 --- a/src/popsift/s_desc_igrid.cu +++ b/src/popsift/s_desc_igrid.cu @@ -132,7 +132,7 @@ bool start_ext_desc_igrid( const ExtremaCounters* ct, ExtremaBuffers* buf, const ext_desc_igrid <<>> ( buf, - ct->ori_ps[octave], + ct->getOrientationBase( octave ), oct_obj.getDataTexLinear( ).tex ); POP_SYNC_CHK; diff --git a/src/popsift/s_desc_iloop.cu b/src/popsift/s_desc_iloop.cu index a7e67252..3abd517e 100644 --- a/src/popsift/s_desc_iloop.cu +++ b/src/popsift/s_desc_iloop.cu @@ -168,7 +168,7 @@ bool start_ext_desc_iloop( const ExtremaCounters* ct, ExtremaBuffers* buf, const <<>> ( buf, octave, - ct->ori_ps[octave], + ct->getOrientationBase( octave ), oct_obj.getDataTexLinear( ).tex, oct_obj.getWidth(), oct_obj.getHeight() ); diff --git a/src/popsift/s_desc_loop.cu b/src/popsift/s_desc_loop.cu index f57cdabe..99b7790f 100644 --- a/src/popsift/s_desc_loop.cu +++ b/src/popsift/s_desc_loop.cu @@ -186,7 +186,7 @@ bool start_ext_desc_loop( const ExtremaCounters* ct, ExtremaBuffers* buf, const <<>> ( buf, octave, - ct->ori_ps[octave], + ct->getOrientationBase( octave ), oct_obj.getDataTexPoint( ), oct_obj.getWidth(), oct_obj.getHeight() ); diff --git a/src/popsift/s_desc_notile.cu b/src/popsift/s_desc_notile.cu index 861d11aa..992b0810 100644 --- a/src/popsift/s_desc_notile.cu +++ b/src/popsift/s_desc_notile.cu @@ -149,7 +149,7 @@ bool start_ext_desc_notile( const ExtremaCounters* ct, ExtremaBuffers* buf, int <<>> ( buf, ct->ori_ct[octave], - ct->ori_ps[octave], + ct->getOrientationBase( octave ), oct_obj.getDataTexLinear( ).tex ); cudaDeviceSynchronize(); cudaError_t err = cudaGetLastError( ); diff --git a/src/popsift/s_desc_vlfeat.cu b/src/popsift/s_desc_vlfeat.cu index 6aff7e48..350a8198 100644 --- a/src/popsift/s_desc_vlfeat.cu +++ b/src/popsift/s_desc_vlfeat.cu @@ -202,7 +202,7 @@ bool start_ext_desc_vlfeat( const ExtremaCounters* ct, ExtremaBuffers* buf, cons ext_desc_vlfeat <<>> ( buf, - ct->ori_ps[octave], + ct->getOrientationBase( octave ), oct_obj.getDataTexPoint( ), oct_obj.getWidth(), oct_obj.getHeight() ); diff --git a/src/popsift/s_orientation.cu b/src/popsift/s_orientation.cu index ab8f2261..dc1d3029 100644 --- a/src/popsift/s_orientation.cu +++ b/src/popsift/s_orientation.cu @@ -58,7 +58,7 @@ void ori_par( const int octave, const int w, const int h ) { - const int& ext_ct_prefix_sum = ct->ext_ps[octave]; + const int& ext_ct_prefix_sum = ct->getExtremaBase( octave ); const int extremum_index = blockIdx.x * blockDim.y; if( popsift::all( extremum_index >= ct->ext_ct[octave] ) ) return; // a few trailing warps @@ -284,7 +284,7 @@ public: }; __global__ -void ori_prefix_sum( ExtremaCounters* ct, ExtremaBuffers* buf, const int total_ext_ct, const int num_octaves ) +void ori_prefix_sum( ExtremaCounters* ct, ExtremaBuffers* buf ) { int total_ori = 0; Extremum* extremum = buf->extrema; @@ -294,25 +294,20 @@ void ori_prefix_sum( ExtremaCounters* ct, ExtremaBuffers* buf, const int total_e ExtremaWrt w( extremum ); ExtremaTot t( total_ori ); ExtremaWrtMap wrtm( feat_to_ext_map, max( d_consts.max_orientations, buf->ori_allocated ) ); - ExclusivePrefixSum::Block( total_ext_ct, r, w, t, wrtm ); + ExclusivePrefixSum::Block( ct->getTotalExtrema(), r, w, t, wrtm ); __syncthreads(); if( threadIdx.x == 0 && threadIdx.y == 0 ) { - ct->make_extrema_prefix_sums( ); - /* replaced by the __device__ method above: - ct->ext_ps[0] = 0; - for( int o=1; oext_ps[o] = ct->ext_ps[o-1] + ct->ext_ct[o-1]; - } - */ + // this is actually an input to the function + // ct->make_extrema_prefix_sums( ); for( int o=0; oext_ct[o] == 0 ) { ct->ori_ct[o] = 0; } else { - int fe = ct->ext_ps[o ]; /* first extremum for this octave */ - int le = ct->ext_ps[o+1]-1; /* last extremum for this octave */ + int fe = ct->getExtremaBase(o ); /* first extremum for this octave */ + int le = ct->getExtremaBase(o+1)-1; /* last extremum for this octave */ int lo_ori_index = buf->extrema[fe].idx_ori; int num_ori = buf->extrema[le].num_ori; int hi_ori_index = buf->extrema[le].idx_ori + num_ori; @@ -320,13 +315,7 @@ void ori_prefix_sum( ExtremaCounters* ct, ExtremaBuffers* buf, const int total_e } } - ct->ori_ps[0] = 0; - for( int o=1; oori_ps[o] = ct->ori_ps[o-1] + ct->ori_ct[o-1]; - } - - ct->ori_total = ct->ori_ps[MAX_OCTAVES-1] + ct->ori_ct[MAX_OCTAVES-1]; - ct->ext_total = ct->ext_ps[MAX_OCTAVES-1] + ct->ext_ct[MAX_OCTAVES-1]; + ct->make_orientation_prefix_sums( ); } } @@ -335,30 +324,16 @@ void Pyramid::orientation( const Config& conf ) { readDescCountersFromDevice( ); - int ext_total = 0; - for(int o : _ct->ext_ct) - { - if( o > 0 ) - { - ext_total += o; - } - } + _ct->make_extrema_prefix_sums(); // Filter functions are only called if necessary. They are very expensive, // therefore add 10% slack. - if( conf.getFilterMaxExtrema() > 0 && int(conf.getFilterMaxExtrema()*1.1) < ext_total ) + if( conf.getFilterMaxExtrema() > 0 && int(conf.getFilterMaxExtrema()*1.1) < _ct->getTotalExtrema() ) { - ext_total = _fg.filter( conf, _ct, _buf, ext_total ); + _fg.filter( conf, _ct, _buf, _ct->getTotalExtrema() ); } - reallocExtrema( ext_total ); - - int ext_ct_prefix_sum = 0; - for( int octave=0; octave<_num_octaves; octave++ ) { - _ct->ext_ps[octave] = ext_ct_prefix_sum; - ext_ct_prefix_sum += _ct->ext_ct[octave]; - } - _ct->ext_total = ext_ct_prefix_sum; + reallocExtrema( _ct->getTotalExtrema() ); cudaStream_t oct_0_str = _octaves[0].getStream(); @@ -404,7 +379,7 @@ void Pyramid::orientation( const Config& conf ) grid.x = 1; ori_prefix_sum <<>> - ( _ct, _buf, ext_ct_prefix_sum, _num_octaves ); + ( _ct, _buf ); POP_SYNC_CHK; cudaDeviceSynchronize(); diff --git a/src/popsift/sift_desc.cu b/src/popsift/sift_desc.cu index 5fb81c89..ecda3571 100644 --- a/src/popsift/sift_desc.cu +++ b/src/popsift/sift_desc.cu @@ -88,7 +88,7 @@ void Pyramid::descriptors( const Config& conf ) } } - if( _ct->ori_total == 0 ) + if( _ct->getTotalOrientations() == 0 ) { cerr << "Warning: no descriptors extracted" << endl; return; @@ -100,19 +100,19 @@ void Pyramid::descriptors( const Config& conf ) block.z = 1; dim3 grid; - grid.x = grid_divide( _ct->ori_total, block.y ); + grid.x = grid_divide( _ct->getTotalOrientations(), block.y ); if( conf.getUseRootSift() ) { normalize_histogram <<>> ( _buf->desc, - _ct->ori_total ); + _ct->getTotalOrientations() ); POP_SYNC_CHK; } else { normalize_histogram <<>> ( _buf->desc, - _ct->ori_total ); + _ct->getTotalOrientations() ); POP_SYNC_CHK; } diff --git a/src/popsift/sift_pyramid.cu b/src/popsift/sift_pyramid.cu index 5dbf5f13..e8c6a6ce 100644 --- a/src/popsift/sift_pyramid.cu +++ b/src/popsift/sift_pyramid.cu @@ -185,7 +185,7 @@ __global__ void prep_features( Descriptor* descriptor_base, ExtremaCounters* ct, ExtremaBuffers* buf, int up_fac ) { int offset = blockIdx.x * 32 + threadIdx.x; - if( offset >= ct->ext_total ) return; + if( offset >= ct->getTotalExtrema() ) return; const Extremum& ext = buf->extrema [offset]; Feature& fet = buf->features[offset]; @@ -220,15 +220,15 @@ FeaturesHost* Pyramid::get_descriptors( const Config& conf ) readDescCountersFromDevice(); nvtxRangePushA( "download descriptors" ); - FeaturesHost* features = new FeaturesHost( _ct->ext_total, _ct->ori_total ); + FeaturesHost* features = new FeaturesHost( _ct->getTotalExtrema(), _ct->getTotalOrientations() ); - if( _ct->ext_total == 0 || _ct->ori_total == 0 ) + if( _ct->getTotalExtrema() == 0 || _ct->getTotalOrientations() == 0 ) { nvtxRangePop(); return features; } - dim3 grid( grid_divide( _ct->ext_total, 32 ) ); + dim3 grid( grid_divide( _ct->getTotalExtrema(), 32 ) ); prep_features<<>>( features->getDescriptors(), _ct, _buf, up_fac ); POP_SYNC_CHK; @@ -237,13 +237,13 @@ FeaturesHost* Pyramid::get_descriptors( const Config& conf ) nvtxRangePop(); popcuda_memcpy_async( features->getFeatures(), _buf->features, - _ct->ext_total * sizeof(Feature), + _ct->getTotalExtrema() * sizeof(Feature), cudaMemcpyDeviceToHost, _download_stream ); popcuda_memcpy_async( features->getDescriptors(), _buf->desc, - _ct->ori_total * sizeof(Descriptor), + _ct->getTotalOrientations() * sizeof(Descriptor), cudaMemcpyDeviceToHost, _download_stream ); cudaStreamSynchronize( _download_stream ); @@ -259,25 +259,25 @@ void Pyramid::clone_device_descriptors_sub( const Config& conf, FeaturesDev* fea { const float up_fac = conf.getUpscaleFactor(); - dim3 grid( grid_divide( _ct->ext_total, 32 ) ); + dim3 grid( grid_divide( _ct->getTotalExtrema(), 32 ) ); prep_features<<>>( features->getDescriptors(), _ct, _buf, up_fac ); POP_SYNC_CHK; popcuda_memcpy_async( features->getFeatures(), _buf->features, - _ct->ext_total * sizeof(Feature), + _ct->getTotalExtrema() * sizeof(Feature), cudaMemcpyDeviceToDevice, _download_stream ); popcuda_memcpy_async( features->getDescriptors(), _buf->desc, - _ct->ori_total * sizeof(Descriptor), + _ct->getTotalOrientations() * sizeof(Descriptor), cudaMemcpyDeviceToDevice, _download_stream ); popcuda_memcpy_async( features->getReverseMap(), _buf->feat_to_ext_map, - _ct->ori_total * sizeof(int), + _ct->getTotalOrientations() * sizeof(int), cudaMemcpyDeviceToDevice, _download_stream ); } @@ -286,7 +286,7 @@ FeaturesDev* Pyramid::clone_device_descriptors( const Config& conf ) { readDescCountersFromDevice(); - FeaturesDev* features = new FeaturesDev( _ct->ext_total, _ct->ori_total ); + FeaturesDev* features = new FeaturesDev( _ct->getTotalExtrema(), _ct->getTotalOrientations() ); clone_device_descriptors_sub( conf, features ); @@ -339,7 +339,7 @@ void Pyramid::writeDescriptor( const Config& conf, ostream& ostr, FeaturesHost* const float up_fac = conf.getUpscaleFactor(); - for( int ext_idx = 0; ext_idx<_ct->ext_total; ext_idx++ ) { + for( int ext_idx = 0; ext_idx<_ct->getTotalExtrema(); ext_idx++ ) { const Feature& ext = features->getFeatures()[ext_idx]; const int octave = ext.debug_octave; const float xpos = ext.xpos * pow(2.0f, octave - up_fac); diff --git a/src/popsift/sift_pyramid.h b/src/popsift/sift_pyramid.h index 5065ccfa..4751b814 100755 --- a/src/popsift/sift_pyramid.h +++ b/src/popsift/sift_pyramid.h @@ -30,14 +30,13 @@ struct ExtremaCounters /* The number of orientation found per octave */ int ori_ct[MAX_OCTAVES]; +private: /* Exclusive prefix sum of ext_ct */ - int ext_ps[MAX_OCTAVES]; + int ext_ps[MAX_OCTAVES+1]; /* Exclusive prefix sum of ori_ct */ - int ori_ps[MAX_OCTAVES]; - - int ext_total; - int ori_total; + int ori_ps[MAX_OCTAVES+1]; +public: /** host and device function helper function that updates the exclusive * prefix sum for extrema counts, updates ext_total and returns the total * number of extrema. @@ -48,14 +47,53 @@ struct ExtremaCounters int make_extrema_prefix_sums( ) { ext_ps[0] = 0; - for( int o=1; o Date: Sun, 7 Mar 2021 09:09:29 +0100 Subject: [PATCH 14/30] [testing] scripted comparison with vlfeat --- CHANGES.md | 1 + src/CMakeLists.txt | 4 + src/compareSiftFiles/CMakeLists.txt | 38 +++ src/compareSiftFiles/compareSiftFiles.cpp | 131 ++++++++++ src/compareSiftFiles/csf_feat.cpp | 240 ++++++++++++++++++ src/compareSiftFiles/csf_feat.h | 38 +++ src/compareSiftFiles/csf_options.cpp | 100 ++++++++ src/compareSiftFiles/csf_options.h | 21 ++ testScripts/CMakeLists.txt | 3 + testScripts/compareVersions/CMakeLists.txt | 46 ++++ testScripts/compareVersions/Compare.mk.in | 14 + testScripts/compareVersions/awk/compute.awk | 18 ++ .../compareVersions/awk/desc-to-heat.awk | 23 ++ .../compareVersions/compareWithVLFeat.sh.in | 191 ++++++++++++++ 14 files changed, 868 insertions(+) create mode 100755 src/compareSiftFiles/CMakeLists.txt create mode 100755 src/compareSiftFiles/compareSiftFiles.cpp create mode 100755 src/compareSiftFiles/csf_feat.cpp create mode 100755 src/compareSiftFiles/csf_feat.h create mode 100755 src/compareSiftFiles/csf_options.cpp create mode 100755 src/compareSiftFiles/csf_options.h create mode 100755 testScripts/compareVersions/CMakeLists.txt create mode 100755 testScripts/compareVersions/Compare.mk.in create mode 100644 testScripts/compareVersions/awk/compute.awk create mode 100644 testScripts/compareVersions/awk/desc-to-heat.awk create mode 100755 testScripts/compareVersions/compareWithVLFeat.sh.in diff --git a/CHANGES.md b/CHANGES.md index 9608b25d..4a5ee214 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -24,6 +24,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Support for Cuda CC 7 cards (RTX 2080) [PR](https://github.com/alicevision/popsift/pull/67) - Support for Boost 1.70 [PR](https://github.com/alicevision/popsift/pull/65) - Support for device selection and multiple GPUs [PR](https://github.com/alicevision/popsift/pull/121) +- Test: adding descriptor comparator [PR](https://github.com/alicevision/popsift/pull/115) ### Fixed - CMake: fixes to allow building on Windows using vcpkg [PR](https://github.com/alicevision/popsift/pull/92) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index db7aeadc..1f6bf77c 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -150,3 +150,7 @@ if(PopSift_BUILD_EXAMPLES) add_subdirectory(application) endif() +if(PopSift_USE_TEST_CMD) + add_subdirectory(compareSiftFiles) +endif() + diff --git a/src/compareSiftFiles/CMakeLists.txt b/src/compareSiftFiles/CMakeLists.txt new file mode 100755 index 00000000..1eb1d625 --- /dev/null +++ b/src/compareSiftFiles/CMakeLists.txt @@ -0,0 +1,38 @@ +if(NOT CMAKE_SOURCE_DIR STREQUAL PROJECT_SOURCE_DIR) + # I am top-level project, i.e. I am not being include by another project + cmake_minimum_required(VERSION 3.12) + project(PopSiftCompareDescriptors LANGUAGES CXX) + + include(GNUInstallDirs) + + set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/${CMAKE_SYSTEM_NAME}-${CMAKE_SYSTEM_PROCESSOR}") + set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/${CMAKE_SYSTEM_NAME}-${CMAKE_SYSTEM_PROCESSOR}") +endif() + +if(PopSift_BOOST_USE_STATIC_LIBS) + set(Boost_USE_STATIC_LIBS ON) +endif() +find_package(Boost 1.53.0 REQUIRED COMPONENTS filesystem program_options system) +if(WIN32) + add_definitions("-DBOOST_ALL_NO_LIB") +endif(WIN32) + +############################################################# +# compareSiftFiles +############################################################# + +add_executable(popsift-compareSiftFiles + compareSiftFiles.cpp + csf_feat.cpp csf_feat.h + csf_options.cpp csf_options.h) + +target_include_directories(popsift-compareSiftFiles PUBLIC ${Boost_INCLUDE_DIRS}) +target_link_libraries(popsift-compareSiftFiles PUBLIC ${Boost_LIBRARIES}) + +set_property(TARGET popsift-compareSiftFiles PROPERTY CXX_STANDARD 14) + +############################################################# +# installation +############################################################# + +install(TARGETS popsift-compareSiftFiles DESTINATION ${CMAKE_INSTALL_BINDIR}) diff --git a/src/compareSiftFiles/compareSiftFiles.cpp b/src/compareSiftFiles/compareSiftFiles.cpp new file mode 100755 index 00000000..b52ad9a7 --- /dev/null +++ b/src/compareSiftFiles/compareSiftFiles.cpp @@ -0,0 +1,131 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "csf_feat.h" +#include "csf_options.h" + +using namespace std; + +typedef vector desc_t; + +/** Read a file containing SIFT descriptors + */ +void readFile( vector& vec, const std::string& filename ); + +/** Write the average descriptor differences to file + * @param file The output file + * @param stats A 128-float vector containing the sum of differences + * @param sz The number of samples that have been collected in stats + */ +void writeSummary( ostream& file, const vector& stats, int sz ); + +/** Open a new stream or return the default stream. The default stream is + * returned if the name is empty or opening the stream fails. + * @param name A string containing the name of the file to open or empty + * @param default_stream The default stream to return + */ +ostream* openOutputFile( const string& name, ostream* default_stream ); + +int main( int argc, char* argv[] ) +{ + Parameters param; + + parseargs( argc, argv, param ); + + vector l_one; + vector l_two; + + readFile( l_one, param.input[0] ); + readFile( l_two, param.input[1] ); + + ostream* outfile = openOutputFile( param.outfile_name, &cout ); + ostream* descfile = openOutputFile( param.descfile_name, nullptr ); + + int len = l_one.size(); + int ct = 0; + float nextpercent = 10; + + vector desc_stats( 128, 0.0f ); + + ostream* print_dists = param.descfile_verbose ? descfile : 0; + + for( auto l : l_one ) + { + l.compareBestMatch( *outfile, print_dists, l_two, desc_stats, param.briefinfo ); + ct++; + if( float(ct * 100) / len >= float(nextpercent) ) + { + cerr << nextpercent << "% " << ct << endl; + nextpercent += 10; + } + } + + if( descfile ) + { + writeSummary( *descfile, desc_stats, l_one.size() ); + } + + if( ! param.outfile_name.empty() ) + { + delete outfile; + } +} + +void writeSummary( ostream& descfile, const vector& desc_stats, int sz ) +{ + descfile << "========== Summary Stats ==========" << endl + << "Average values:" << endl + << setprecision(3); + for( int i=0; i<128; i++ ) + { + if( i%32==0 ) descfile << "X=0 | "; + if( i%32==8 ) descfile << "X=1 | "; + if( i%32==16 ) descfile << "X=2 | "; + if( i%32==24 ) descfile << "X=3 | "; + float d = desc_stats[i] / sz; + descfile << setw(8) << d << " "; + if( i%8==7 ) descfile << endl; + } + descfile << endl; +} + +void readFile( vector& vec, const std::string& filename ) +{ + ifstream infile( filename ); + + if( ! infile.good() ) + { + cerr << "File " << filename << " is not open." << endl; + exit( -1 ); + } + + int lines_read = readFeats( vec, infile ); + cerr << "Read " << lines_read << " lines from " << filename << endl; +} + +ostream* openOutputFile( const string& outfile_name, ostream* default_stream ) +{ + ostream* outfile = default_stream; + if( outfile_name.empty() ) return outfile; + + ostream* o = new ofstream( outfile_name ); + if( o->good() ) + { + outfile = o; + } + else + { + delete o; + } + + return outfile; +} + diff --git a/src/compareSiftFiles/csf_feat.cpp b/src/compareSiftFiles/csf_feat.cpp new file mode 100755 index 00000000..2dfe067b --- /dev/null +++ b/src/compareSiftFiles/csf_feat.cpp @@ -0,0 +1,240 @@ +#include +#include +#include +#include + +#include "csf_feat.h" + +#if __cplusplus >= 201703L +#define PAR_UNSEQ std::execution::par_unseq, +#else +#define PAR_UNSEQ +#endif + +#undef HAVE_STD_TRANSFORM_REDUCE + +using namespace std; + +const float M_PI2 = 2.0f * 3.14159265358979323846f; + +bool feat_t::_use_l2_distance = true; + +static float dist( const feat_t& l, const feat_t& r ) +{ + if( feat_t::_use_l2_distance ) + { +#ifdef HAVE_STD_TRANSFORM_REDUCE +/* From C++17 numeric: +template +T transform_reduce(ExecutionPolicy&& policy, + ForwardIt1 first1, ForwardIt1 last1, ForwardIt2 first2, + T init, BinaryOp1 binary_op1, BinaryOp2 binary_op2); +*/ + const float sum = std::transform_reduce( + PAR_UNSEQ + l.desc, l.desc+DESC_SIZE, + r.desc, + 0.0f, + std::plus<>(), + [](float l, float r){ float v=l-r; return (v*v); }, + std::plus ); +#else + float sum = 0.0f; + + for( int i=0; i(), + [](float l, float r){ float v=l-r; return fabsf(v); }, + std::plus ); +#else + float sum = 0.0f; + + for( int i=0; i& l_one, ifstream& f_one ) +{ + char buffer[1024]; + int lines_read; + + lines_read = 0; + while( f_one.good() ) + { + f_one.getline( buffer, 1024 ); + if( f_one.good() ) + { + bool success = addFeat( l_one, buffer ); + if( success ) + { + lines_read++; + } + } + } + return lines_read; +} + +bool addFeat( vector& features, char* line ) +{ + vector values(5+DESC_SIZE); // 4 or 5 values followed by DESC_SIZE desc values + + int i = 0; + istringstream s( line ); + while( s >> values[i] ) + { + i++; + } + + // cerr << "Found " << i << " floats in line" << endl; + features.emplace_back( i, values ); + + if( i == 0 ) return false; + return true; +} + +feat_t::feat_t( int num, const vector& input ) + : desc(DESC_SIZE) +{ + auto it = input.begin(); + auto to = desc.begin(); + if( num == DESC_SIZE+4 ) + { + x = *it++; + y = *it++; + sigma = *it++; + ori = *it++; + for( int i=0; i& l_one, vector& desc_stats, bool minOnly ) const +{ + const int l_one_sz = l_one.size(); + + vector distances( l_one_sz ); + + if( !minOnly ) ostr << "==========" << endl; + + const feat_t& left( *this ); + + for( int j=0; j& left = desc; + + for( int j=0; j *m ) + { + second = min( second, *it ); + } + it++; + } + + ostr << " 2best " << second << endl; + } + else + { + auto it = distances.begin(); + + for( int j=0; j +#include +#include + +#define DESC_SIZE 128 + +typedef std::vector desc_t; + +class feat_t +{ +public: + static bool _use_l2_distance; + + float x; + float y; + float sigma; + float ori; + desc_t desc; + + feat_t( int num, const std::vector& input ); + + void print( std::ostream& ostr ) const; + + void compareBestMatch( std::ostream& ostr, + std::ostream* dstr, + const std::vector& l_one, + std::vector& desc_stats, + bool minOnly ) const; + + static void setL2Distance( bool onoff ); +}; + +int readFeats( std::vector& l_one, + std::ifstream& f_one ); +bool addFeat( std::vector& features, char* line ); + diff --git a/src/compareSiftFiles/csf_options.cpp b/src/compareSiftFiles/csf_options.cpp new file mode 100755 index 00000000..922916ea --- /dev/null +++ b/src/compareSiftFiles/csf_options.cpp @@ -0,0 +1,100 @@ +#include + +#include + +#include "csf_options.h" + +using namespace std; + +void parseargs( int argc, char** argv, Parameters& param ) +{ + using namespace boost::program_options; + + bool longinfo = false; + + positional_options_description positional; + positional.add( "input", -1 ); + + options_description hidden_options("Hidden options"); + options_description options("Options"); + + { + hidden_options.add_options() + ("input,i", value >()->multitoken()->composing(), + "A descriptor file. 2 descriptor files must be passed"); + + options.add_options() + ("help,h", "Print usage") + ("verbose,v", value( &longinfo )->default_value(false), + "verbose, longer output") + ("output,o", value(¶m.outfile_name), + "print essential diff info to (default is cout)") + ("desc,d", value(¶m.descfile_name), + "print descriptor distance per cell to ") + ("dv", value(¶m.descfile_verbose)->default_value(false), + "prints every comparison into descfile, instead of summary only") + ("l1", bool_switch()->notifier( [&](bool b) + { + feat_t::setL2Distance( !b ); + } ) + ->default_value( false ), + "use L1 for distance computation instead of L2" ) + ; + } + + options_description all("Allowed options"); + + all.add(hidden_options) + .add(options); + + variables_map vm; + + try + { + + store( command_line_parser(argc, argv).options( all ) + .positional( positional ) + .run( ), + vm ); + + param.input = vm["input"].as >(); + + if( vm.count("help") ) + { + usage( argv[0], options ); + exit(EXIT_SUCCESS); + } + + if( param.input.size() != 2 ) + { + cerr << "Error: Exactly 2 input file for comparison must be provided." << endl; + usage( argv[0], options ); + exit(EXIT_FAILURE); + } + + notify(vm); // Notify does processing (e.g., raise exceptions if required args are missing) + } + catch(boost::program_options::error& e) + { + std::cerr << "Error: " << e.what() << std::endl << std::endl; + usage( argv[0], options ); + exit(EXIT_FAILURE); + } + + param.briefinfo = !longinfo; +} + +void usage( char* name, const boost::program_options::options_description& options ) +{ + cerr << endl + << "Usage: " << name << " [Options] " << endl + << endl + << "Description\n" + " Compute the L1 or L2 distance between the descriptors of closest\n" + " coordinate pairs.\n" + " When a coordinate has several orientations, the closest distance\n" + " is reported. Summary information at the end." << endl + << endl + << options << endl; +} + diff --git a/src/compareSiftFiles/csf_options.h b/src/compareSiftFiles/csf_options.h new file mode 100755 index 00000000..a82ae9c4 --- /dev/null +++ b/src/compareSiftFiles/csf_options.h @@ -0,0 +1,21 @@ +#pragma once + +#include +#include + +#include + +#include "csf_feat.h" + +struct Parameters +{ + std::vector input; + std::string outfile_name; + std::string descfile_name; + bool descfile_verbose; + bool briefinfo; +}; + +void parseargs( int argc, char** argv, Parameters& param ); +void usage( char* name, const boost::program_options::options_description& all ); + diff --git a/testScripts/CMakeLists.txt b/testScripts/CMakeLists.txt index 411a8e30..28185847 100755 --- a/testScripts/CMakeLists.txt +++ b/testScripts/CMakeLists.txt @@ -26,3 +26,6 @@ add_custom_target( DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/testOxfordDataset.sh DEPENDS popsift-demo ) + +add_subdirectory(compareVersions) + diff --git a/testScripts/compareVersions/CMakeLists.txt b/testScripts/compareVersions/CMakeLists.txt new file mode 100755 index 00000000..43af4c8b --- /dev/null +++ b/testScripts/compareVersions/CMakeLists.txt @@ -0,0 +1,46 @@ +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/${CMAKE_SYSTEM_NAME}-${CMAKE_SYSTEM_PROCESSOR}") + +find_program(AWK awk mawk gawk) +find_program(SORT sort) +find_program(GNUPLOT gnuplot) +find_program(CONVERT convert) +find_program(VLFEAT sift PATH_SUFFIXES glnxa64 bin/glnxa64) + +if(GNUPLOT MATCHES ".+-NOTFOUND") + message(WARNING "WARNING: gnupot could not be found (${GNUPLOT}).") +else() + message(STATUS "gnuplot found (${GNUPLOT}).") +endif() + +if(CONVERT MATCHES ".+-NOTFOUND") + message(WARNING "WARNING: ImageMagick's convert could not be found (${CONVERT}).") +else() + message(STATUS "convert found (${CONVERT}).") +endif() + +if(VLFEAT MATCHES ".+-NOTFOUND") + message(WARNING "WARNING: vlfeat could not be found (${VLFEAT}).") +else() + message(STATUS "vlfeat found (${VLFEAT}).") +endif() + +if(SORT MATCHES ".+-NOTFOUND") + message(WARNING "WARNING: sort could not be found (${SORT}). Compare tests will fail.") +else() + message(STATUS "sort found (${SORT}).") +endif() + +if(AWK MATCHES ".+-NOTFOUND") + message(WARNING "WARNING: awk (and mawk and gawk) could not be found (${AWK}). Compare tests will fail") +else() + message(STATUS "awk found (${AWK}).") +endif() + +configure_file( ${CMAKE_CURRENT_SOURCE_DIR}/compareWithVLFeat.sh.in + ${CMAKE_CURRENT_BINARY_DIR}/compareWithVLFeat.sh + @ONLY) + +configure_file( ${CMAKE_CURRENT_SOURCE_DIR}/Compare.mk.in + Compare.mk + @ONLY) + diff --git a/testScripts/compareVersions/Compare.mk.in b/testScripts/compareVersions/Compare.mk.in new file mode 100755 index 00000000..fbe4b42c --- /dev/null +++ b/testScripts/compareVersions/Compare.mk.in @@ -0,0 +1,14 @@ +all: test + +test: @CMAKE_RUNTIME_OUTPUT_DIRECTORY@/popsift-compareSiftFiles + sh ./compareWithVLFeat.sh + +clean: + rm -f sort*.txt + rm -f sort*.png + rm -f *.sift + rm -f descdist*.txt + rm -f descdist*.png + rm -f tmp/*.sift tmp/UML*.txt tmp/sort*.txt tmp/coord*.txt tmp/sum*.txt + rm -f tmp/descdist*.txt + diff --git a/testScripts/compareVersions/awk/compute.awk b/testScripts/compareVersions/awk/compute.awk new file mode 100644 index 00000000..aadd528d --- /dev/null +++ b/testScripts/compareVersions/awk/compute.awk @@ -0,0 +1,18 @@ +{ + printf("%f %f -- ", $1, $2); + total = 0 + for(i=5; i<=NF; i++) { total = total + $i; } + printf("%d -- ", total); + i = 5; + for(r=0; r<16; r++) + { + total = 0 + for(i=0; i<8; i++) + { + idx = 5 + r*8 + i + total = total + $idx; + } + printf("%d ", total/8); + } + printf("\n"); +} diff --git a/testScripts/compareVersions/awk/desc-to-heat.awk b/testScripts/compareVersions/awk/desc-to-heat.awk new file mode 100644 index 00000000..b84e7ef1 --- /dev/null +++ b/testScripts/compareVersions/awk/desc-to-heat.awk @@ -0,0 +1,23 @@ +BEGIN { + x = -1.5; + y = -1.5; +} + +{ + if( NF == 10 ) + { + printf("%.1f %.1f %f\n", x, y, $COL); + x = x + 1; + if( x > 2 ) + { + printf("\n"); + x = -1.5; + y = y + 1; + } + } +} + +END { + printf("\n"); +} + diff --git a/testScripts/compareVersions/compareWithVLFeat.sh.in b/testScripts/compareVersions/compareWithVLFeat.sh.in new file mode 100755 index 00000000..7018d490 --- /dev/null +++ b/testScripts/compareVersions/compareWithVLFeat.sh.in @@ -0,0 +1,191 @@ +#!/bin/bash + +CMD_VLF=@VLFEAT@ +CMD_POP=@CMAKE_RUNTIME_OUTPUT_DIRECTORY@/popsift-demo +CMD_AWK=@AWK@ +CMD_SORT=@SORT@ +CMD_GP=@GNUPLOT@ +CMD_CONVERT=@CONVERT@ +CMD_COMPARE=@CMAKE_RUNTIME_OUTPUT_DIRECTORY@/popsift-compareSiftFiles +DIR_AWK=@CMAKE_CURRENT_SOURCE_DIR@/awk +DIR_TEST=@PopSift_TESTFILE_PATH@ + +run_vlfeat() { + file="$1" + + echo ${CMD_VLF} -v ${file}.pgm + ${CMD_VLF} -v ${file}.pgm + ${CMD_SORT} -n ${file}.sift > tmp/${file}-vlfeat.sift + rm ${file}.sift + ${CMD_AWK} -e '{printf("%f %f %f %f\n",$1,$2,$3,$4);}' < tmp/${file}-vlfeat.sift > tmp/coord-${file}-vlfeat.txt + ${CMD_AWK} -f ${DIR_AWK}/compute.awk tmp/${file}-vlfeat.sift > tmp/sum-${file}-vlfeat.txt + echo "LEAVE VLFEAT" +} + +run_popsift() { + file="$1" + par="$2" + + # VLFeat multiplies the descriptors with 512 before converting to uchar, + # so we use 9 for a 2**9 multiplier as well. + PAR0="--pgmread-loading --norm-mode=classic --norm-multi 9 --write-as-uchar --write-with-ori --initial-blur 0.5 --sigma 1.6 --threshold 0 --edge-threshold 10.0" + + echo ${CMD_POP} ${PAR0} ${par} -i ${file}.pgm + ${CMD_POP} ${PAR0} ${par} -i ${file}.pgm + ${CMD_SORT} -n output-features.txt > tmp/${file}-popsift.sift + rm output-features.txt + ${CMD_AWK} -e '{printf("%f %f %f %f\n",$1,$2,$3,$4);}' < tmp/${file}-popsift.sift > tmp/coord-${file}-popsift.txt + ${CMD_AWK} -f ${DIR_AWK}/compute.awk tmp/${file}-popsift.sift > tmp/sum-${file}-popsift.txt + echo "LEAVE POPSIFT" +} + +mkdir -p tmp + +# for TESTNAME in default new twobins vlfeatdesc vlfeat +for TESTNAME in new vlfeatdesc +do + + rm -f hash.sift* + rm -f level1.sift* + rm -f coord-* + rm -f sum-* + + # FILES="hash level1 boat" + # FILES="level1" + # FILES="hash" + FILES="boat" + # FILES="boat1" + + for file in ${FILES} ; do + if [ ! -f ${file}.pgm ] + then + ${CMD_CONVERT} ${DIR_TEST}/$file/img1.ppm ${file}.pgm + if [ $? -ne 0 ] + then + echo "${CMD_CONVERT} failed, test image not converted" + exit -1 + fi + fi + + if [ "$TESTNAME" = "default" ]; then + echo "Test is default" + PAR1=" " + elif [ "$TESTNAME" = "new" ]; then + echo "Test is new: loopdescriptor and BestBin" + PAR1="--desc-mode=loop" + elif [ "$TESTNAME" = "vlfeatdesc" ]; then + echo "Test is vlfeatdesc: vlfeat descriptor and BestBin" + PAR1="--desc-mode=vlfeat" + else + echo "Test is undefined, $TESTNAME" + exit + fi + + run_vlfeat ${file} + + run_popsift ${file} ${PAR1} + + echo "Perform brute force matching" + echo ${CMD_COMPARE} ${file}-popsift.sift ${file}-vlfeat.sift + ${CMD_COMPARE} -o tmp/UML-${file}.txt \ + -d tmp/descdist-${file}-${TESTNAME}.txt \ + tmp/${file}-popsift.sift \ + tmp/${file}-vlfeat.sift + + echo "Sorting" + ${CMD_SORT} -k3 -g tmp/UML-${file}.txt > tmp/sort-${file}-by-1st-match.txt + ${CMD_SORT} -k6 -g tmp/UML-${file}.txt > tmp/sort-${file}-by-pixdist.txt + ${CMD_SORT} -k8 -g tmp/UML-${file}.txt > tmp/sort-${file}-by-angle.txt + ${CMD_SORT} -k10 -g tmp/UML-${file}.txt > tmp/sort-${file}-by-2nd-match.txt + + echo "Converting descriptor distance stats" + awk -vCOL=3 -f ${DIR_AWK}/desc-to-heat.awk tmp/descdist-${file}-${TESTNAME}.txt > descdist-${file}-${TESTNAME}.txt + awk -vCOL=4 -f ${DIR_AWK}/desc-to-heat.awk tmp/descdist-${file}-${TESTNAME}.txt >> descdist-${file}-${TESTNAME}.txt + awk -vCOL=5 -f ${DIR_AWK}/desc-to-heat.awk tmp/descdist-${file}-${TESTNAME}.txt >> descdist-${file}-${TESTNAME}.txt + awk -vCOL=6 -f ${DIR_AWK}/desc-to-heat.awk tmp/descdist-${file}-${TESTNAME}.txt >> descdist-${file}-${TESTNAME}.txt + awk -vCOL=7 -f ${DIR_AWK}/desc-to-heat.awk tmp/descdist-${file}-${TESTNAME}.txt >> descdist-${file}-${TESTNAME}.txt + awk -vCOL=8 -f ${DIR_AWK}/desc-to-heat.awk tmp/descdist-${file}-${TESTNAME}.txt >> descdist-${file}-${TESTNAME}.txt + awk -vCOL=9 -f ${DIR_AWK}/desc-to-heat.awk tmp/descdist-${file}-${TESTNAME}.txt >> descdist-${file}-${TESTNAME}.txt + awk -vCOL=10 -f ${DIR_AWK}/desc-to-heat.awk tmp/descdist-${file}-${TESTNAME}.txt >> descdist-${file}-${TESTNAME}.txt + + echo "Calling gnuplot (pixdist)" + echo "set title \"L2 distance between pixels, PopSift ${TESTNAME} vs VLFeat" > cmd.gp + echo "set xlabel \"Keypoint index sorted by closest best match\"" >> cmd.gp + echo "set logscale y" >> cmd.gp + echo "set terminal png" >> cmd.gp + echo "set output \"sort-${file}-by-pixdist-${TESTNAME}.png\"" >> cmd.gp + echo "plot \"tmp/sort-${file}-by-pixdist.txt\" using (\$6+0.00001) notitle" >> cmd.gp + ${CMD_GP} cmd.gp + + echo "Calling gnuplot (1st dist)" + echo "set title \"L2 distance between descriptors, PopSift ${TESTNAME} vs VLFeat" > cmd.gp + echo "set xlabel \"Keypoint index sorted by closest best match\"" >> cmd.gp + echo "set terminal png" >> cmd.gp + echo "set output \"/dev/null\"" >> cmd.gp + echo "plot \"tmp/sort-${file}-by-1st-match.txt\" using 3 title \"best distance\"" >> cmd.gp + echo "replot \"tmp/sort-${file}-by-1st-match.txt\" using 10 title \"2nd best distance\"" >> cmd.gp + echo "set output \"sort-${file}-by-1st-match-${TESTNAME}.png\"" >> cmd.gp + echo "replot" >> cmd.gp + ${CMD_GP} cmd.gp + + echo "Calling gnuplot for angular diff (1st dist)" + echo "set title \"Distance in degree between orientations, PopSift ${TESTNAME} vs VLFeat" > cmd.gp + echo "set ylabel \"Difference (degree)\"" >> cmd.gp + echo "set xlabel \"Keypoint index sorted by orientation difference\"" >> cmd.gp + echo "set grid" >> cmd.gp + echo "set logscale y" >> cmd.gp + echo "set yrange [0.001:*]" >> cmd.gp + echo "set style data lines" >> cmd.gp + echo "set terminal png" >> cmd.gp + echo "set output \"sort-${file}-by-angle-${TESTNAME}.png\"" >> cmd.gp + echo "plot \"tmp/sort-${file}-by-angle.txt\" using 8 notitle" >> cmd.gp + ${CMD_GP} cmd.gp + + echo "Calling gnuplot (2nd dist)" + echo "set title \"L2 distance between descriptors, PopSift ${TESTNAME} vs VLFeat" > cmd.gp + echo "set xlabel \"Keypoint index sorted by 2nd best match\"" >> cmd.gp + echo "set terminal png" >> cmd.gp + echo "set output \"/dev/null\"" >> cmd.gp + echo "plot \"tmp/sort-${file}-by-2nd-match.txt\" using 3 title \"best distance\"" >> cmd.gp + echo "replot \"tmp/sort-${file}-by-2nd-match.txt\" using 10 title \"2nd best distance\"" >> cmd.gp + echo "set output \"sort-${file}-by-2nd-match-${TESTNAME}.png\"" >> cmd.gp + echo "replot" >> cmd.gp + ${CMD_GP} cmd.gp + + echo "Calling gnuplot (descriptor summary)" + echo "set view 80, 20, 1, 1.48" > cmd.gp + echo "set xrange [ -1.5 : 1.5 ]" >> cmd.gp + echo "set yrange [ -1.5 : 1.5 ]" >> cmd.gp + echo "set zrange [ -5 : 5 ]" >> cmd.gp + echo "set xtics -1.5,3 offset 0,-0.5" >> cmd.gp + echo "set ytics 1.5,3 offset 0.5" >> cmd.gp + echo "set ztics -3,3" >> cmd.gp + echo "set ticslevel 0" >> cmd.gp + echo "set format cb '%4.1f'" >> cmd.gp + echo "unset colorbox" >> cmd.gp + echo "set pm3d implicit at s" >> cmd.gp + echo "set terminal png size 1080, 250" >> cmd.gp + echo "set output \"descdist-${file}-${TESTNAME}.png\"" >> cmd.gp + echo "set multiplot layout 1,8 title '8 bins in each of 16 sections'" >> cmd.gp + echo "set title 'bin 0 '" >> cmd.gp + echo "splot \"descdist-${file}-${TESTNAME}.txt\" index 0 using 1:2:3:3 with pm3d notitle" >> cmd.gp + echo "set title 'bin 1 '" >> cmd.gp + echo "splot \"descdist-${file}-${TESTNAME}.txt\" index 1 using 1:2:3:3 with pm3d notitle" >> cmd.gp + echo "set title 'bin 2 '" >> cmd.gp + echo "splot \"descdist-${file}-${TESTNAME}.txt\" index 2 using 1:2:3:3 with pm3d notitle" >> cmd.gp + echo "set title 'bin 3 '" >> cmd.gp + echo "splot \"descdist-${file}-${TESTNAME}.txt\" index 3 using 1:2:3:3 with pm3d notitle" >> cmd.gp + echo "set title 'bin 4 '" >> cmd.gp + echo "splot \"descdist-${file}-${TESTNAME}.txt\" index 4 using 1:2:3:3 with pm3d notitle" >> cmd.gp + echo "set title 'bin 5 '" >> cmd.gp + echo "splot \"descdist-${file}-${TESTNAME}.txt\" index 5 using 1:2:3:3 with pm3d notitle" >> cmd.gp + echo "set title 'bin 6 '" >> cmd.gp + echo "splot \"descdist-${file}-${TESTNAME}.txt\" index 6 using 1:2:3:3 with pm3d notitle" >> cmd.gp + echo "set title 'bin 7 '" >> cmd.gp + echo "splot \"descdist-${file}-${TESTNAME}.txt\" index 7 using 1:2:3:3 with pm3d notitle" >> cmd.gp + ${CMD_GP} cmd.gp + + rm -f cmd.gp + done +done + From 45247e7ca43b62afb30aaa5c4f342e6d2882e20d Mon Sep 17 00:00:00 2001 From: Carsten Griwodz Date: Sat, 27 Feb 2021 16:53:27 +0100 Subject: [PATCH 15/30] [testing] more flexible mode comparison script --- testScripts/compareVersions/Compare.mk.in | 16 +- .../compareVersions/compareWithVLFeat.sh.in | 362 +++++++++++++----- 2 files changed, 275 insertions(+), 103 deletions(-) diff --git a/testScripts/compareVersions/Compare.mk.in b/testScripts/compareVersions/Compare.mk.in index fbe4b42c..ffa8f517 100755 --- a/testScripts/compareVersions/Compare.mk.in +++ b/testScripts/compareVersions/Compare.mk.in @@ -1,9 +1,19 @@ -all: test +all: double -test: @CMAKE_RUNTIME_OUTPUT_DIRECTORY@/popsift-compareSiftFiles - sh ./compareWithVLFeat.sh +single: @CMAKE_RUNTIME_OUTPUT_DIRECTORY@/popsift-compareSiftFiles + bash ./compareWithVLFeat.sh -1 popsift vlfeat + +double: @CMAKE_RUNTIME_OUTPUT_DIRECTORY@/popsift-compareSiftFiles + bash ./compareWithVLFeat.sh -1 vlfeat default -2 popsift vlfeat + +triple: @CMAKE_RUNTIME_OUTPUT_DIRECTORY@/popsift-compareSiftFiles + bash ./compareWithVLFeat.sh -1 vlfeat default -2 popsift default -3 popsift vlfeat clean: + rm -rf popsift-*-popsift-* + rm -rf vlfeat-*-vlfeat-* + rm -rf vlfeat-*-popsift-* + rm -rf popsift-*-vlfeat-* rm -f sort*.txt rm -f sort*.png rm -f *.sift diff --git a/testScripts/compareVersions/compareWithVLFeat.sh.in b/testScripts/compareVersions/compareWithVLFeat.sh.in index 7018d490..cbabc282 100755 --- a/testScripts/compareVersions/compareWithVLFeat.sh.in +++ b/testScripts/compareVersions/compareWithVLFeat.sh.in @@ -10,126 +10,187 @@ CMD_COMPARE=@CMAKE_RUNTIME_OUTPUT_DIRECTORY@/popsift-compareSiftFiles DIR_AWK=@CMAKE_CURRENT_SOURCE_DIR@/awk DIR_TEST=@PopSift_TESTFILE_PATH@ +VLFEAT_DEFAULT_PARAMETERS="" + +# VLFeat multiplies the descriptors with 512 before converting to uchar, +# so we use 9 for a 2**9 multiplier as well. +POPSIFT_DEFAULT_PARAMETERS="--pgmread-loading --norm-mode=classic --norm-multi 9 --write-as-uchar --write-with-ori --initial-blur 0.5 --sigma 1.6 --threshold 0 --edge-threshold 10.0" + +# FILES="hash level1 boat" +# FILES="level1" +# FILES="hash" +# FILES="boat1 boat" +# FILES="boat1" +FILES="boat" + +run_all_extract() { + local file="$1" + local testname="$2" + local srcfile="tmp/${file}-${testname}.sift" + local dstfile="tmp/sum-${file}-${testname}.txt" + local coordfile="tmp/coord-${file}-${testname}.txt" + + ${CMD_AWK} -e '{printf("%f %f %f %f\n",$1,$2,$3,$4);}' < ${srcfile} > ${coordfile} + ${CMD_AWK} -f ${DIR_AWK}/compute.awk ${srcfile} > ${dstfile} +} + run_vlfeat() { - file="$1" + local testname="$1" + local file="$2" + local par="$3" + local dstfile="tmp/${file}-${testname}.sift" - echo ${CMD_VLF} -v ${file}.pgm - ${CMD_VLF} -v ${file}.pgm - ${CMD_SORT} -n ${file}.sift > tmp/${file}-vlfeat.sift + local PAR0="${VLFEAT_DEFAULT_PARAMETERS}" + + echo ${CMD_VLF} ${par} -v ${file}.pgm + ${CMD_VLF} ${PAR0} ${par} -v ${file}.pgm + ${CMD_SORT} -n ${file}.sift > ${dstfile} rm ${file}.sift - ${CMD_AWK} -e '{printf("%f %f %f %f\n",$1,$2,$3,$4);}' < tmp/${file}-vlfeat.sift > tmp/coord-${file}-vlfeat.txt - ${CMD_AWK} -f ${DIR_AWK}/compute.awk tmp/${file}-vlfeat.sift > tmp/sum-${file}-vlfeat.txt - echo "LEAVE VLFEAT" + run_all_extract $file $testname } run_popsift() { - file="$1" - par="$2" + local testname="$1" + local file="$2" + local par="$3" + local dstfile="tmp/${file}-${testname}.sift" - # VLFeat multiplies the descriptors with 512 before converting to uchar, - # so we use 9 for a 2**9 multiplier as well. - PAR0="--pgmread-loading --norm-mode=classic --norm-multi 9 --write-as-uchar --write-with-ori --initial-blur 0.5 --sigma 1.6 --threshold 0 --edge-threshold 10.0" + local PAR0="${POPSIFT_DEFAULT_PARAMETERS}" echo ${CMD_POP} ${PAR0} ${par} -i ${file}.pgm ${CMD_POP} ${PAR0} ${par} -i ${file}.pgm - ${CMD_SORT} -n output-features.txt > tmp/${file}-popsift.sift + ${CMD_SORT} -n output-features.txt > ${dstfile} rm output-features.txt - ${CMD_AWK} -e '{printf("%f %f %f %f\n",$1,$2,$3,$4);}' < tmp/${file}-popsift.sift > tmp/coord-${file}-popsift.txt - ${CMD_AWK} -f ${DIR_AWK}/compute.awk tmp/${file}-popsift.sift > tmp/sum-${file}-popsift.txt - echo "LEAVE POPSIFT" + run_all_extract $file $testname } -mkdir -p tmp - -# for TESTNAME in default new twobins vlfeatdesc vlfeat -for TESTNAME in new vlfeatdesc -do - - rm -f hash.sift* - rm -f level1.sift* - rm -f coord-* - rm -f sum-* - - # FILES="hash level1 boat" - # FILES="level1" - # FILES="hash" - FILES="boat" - # FILES="boat1" - - for file in ${FILES} ; do +prepare_testfiles() { + for file in $1 ; do if [ ! -f ${file}.pgm ] then ${CMD_CONVERT} ${DIR_TEST}/$file/img1.ppm ${file}.pgm - if [ $? -ne 0 ] - then - echo "${CMD_CONVERT} failed, test image not converted" - exit -1 + if [ $? -ne 0 ] + then + echo " ${CMD_CONVERT} failed, test image not converted" + exit -1 + else + echo " converted ${file} to PGM" + fi + fi + done +} + +parse_commandline() { + local i=0 + while [ $# -ne 0 ] + do + # echo " #: $# p$i: p0=$0 p1=$1 p*=$*" + if [[ "$1" =~ ^\-[0-9]$ ]] ; then + shift 1 + if [ "$1" == "vlfeat" ] ; then + configurations["$i"]="vlfeat-$2" + echo "Valid configuration vlfeat-$2" + shift 1 + elif [ "$1" == "popsift" ] ; then + configurations["$i"]="popsift-$2" + echo "Valid configuration popsift-$2" + shift 1 + else + echo "Invalid configuration $1, skipping" fi + elif [ $1 == "-h" ] ; then + echo "Usage: $0 [ - ]+" + echo " is a single-digital integer" + echo " is either 'vlfeat ' or 'popsift '" + echo " where for vlfeat is 'default'" + echo " and for popsift is 'default', 'loop' or 'vlfeat'" + exit 0 fi + i=$((i+1)) + shift 1 + done +} - if [ "$TESTNAME" = "default" ]; then - echo "Test is default" - PAR1=" " - elif [ "$TESTNAME" = "new" ]; then - echo "Test is new: loopdescriptor and BestBin" - PAR1="--desc-mode=loop" - elif [ "$TESTNAME" = "vlfeatdesc" ]; then - echo "Test is vlfeatdesc: vlfeat descriptor and BestBin" - PAR1="--desc-mode=vlfeat" - else - echo "Test is undefined, $TESTNAME" - exit - fi +compare_configurations() { + local left=$1 + local right=$2 + local dest="$left-$right" + mkdir -p ${dest}/tmp + mkdir -p ${dest} + + for file in ${FILES} ; do + echo " ----------------------------------------------------------------------------" + echo " BEGIN brute force matching ${file} for ${dest}" + echo " comparison tool: ${CMD_COMPARE}" + echo " left input: tmp/${file}-${left}.sift" + echo " right input: tmp/${file}-${right}.sift" + ${CMD_COMPARE} -o ${dest}/tmp/UML-${file}.txt \ + -d ${dest}/tmp/descdist-${file}.txt \ + tmp/${file}-${left}.sift \ + tmp/${file}-${right}.sift + + echo " END brute force matching ${file} for ${dest}" + echo " ----------------------------------------------------------------------------" + done +} + +sort_configuration() { + local dest=$1 + + for file in ${FILES} ; do + echo " Sorting ${dest} for ${file}" + ${CMD_SORT} -k3 -g ${dest}/tmp/UML-${file}.txt > ${dest}/tmp/sort-${file}-by-1st-match.txt + ${CMD_SORT} -k6 -g ${dest}/tmp/UML-${file}.txt > ${dest}/tmp/sort-${file}-by-pixdist.txt + ${CMD_SORT} -k8 -g ${dest}/tmp/UML-${file}.txt > ${dest}/tmp/sort-${file}-by-angle.txt + ${CMD_SORT} -k10 -g ${dest}/tmp/UML-${file}.txt > ${dest}/tmp/sort-${file}-by-2nd-match.txt + done +} + +make_distance_stats() { + local dest=$1 + local awkfile="${DIR_AWK}/desc-to-heat.awk" + local srcfile="${dest}/tmp/descdist-${file}.txt" + local dstfile="${dest}/descdist-${file}.txt" + + for file in ${FILES} ; do + echo " Converting descriptor distance stats" + awk -vCOL=3 -f ${awkfile} ${srcfile} > ${dstfile} + awk -vCOL=4 -f ${awkfile} ${srcfile} >> ${dstfile} + awk -vCOL=5 -f ${awkfile} ${srcfile} >> ${dstfile} + awk -vCOL=6 -f ${awkfile} ${srcfile} >> ${dstfile} + awk -vCOL=7 -f ${awkfile} ${srcfile} >> ${dstfile} + awk -vCOL=8 -f ${awkfile} ${srcfile} >> ${dstfile} + awk -vCOL=9 -f ${awkfile} ${srcfile} >> ${dstfile} + awk -vCOL=10 -f ${awkfile} ${srcfile} >> ${dstfile} + done +} - run_vlfeat ${file} - - run_popsift ${file} ${PAR1} - - echo "Perform brute force matching" - echo ${CMD_COMPARE} ${file}-popsift.sift ${file}-vlfeat.sift - ${CMD_COMPARE} -o tmp/UML-${file}.txt \ - -d tmp/descdist-${file}-${TESTNAME}.txt \ - tmp/${file}-popsift.sift \ - tmp/${file}-vlfeat.sift - - echo "Sorting" - ${CMD_SORT} -k3 -g tmp/UML-${file}.txt > tmp/sort-${file}-by-1st-match.txt - ${CMD_SORT} -k6 -g tmp/UML-${file}.txt > tmp/sort-${file}-by-pixdist.txt - ${CMD_SORT} -k8 -g tmp/UML-${file}.txt > tmp/sort-${file}-by-angle.txt - ${CMD_SORT} -k10 -g tmp/UML-${file}.txt > tmp/sort-${file}-by-2nd-match.txt - - echo "Converting descriptor distance stats" - awk -vCOL=3 -f ${DIR_AWK}/desc-to-heat.awk tmp/descdist-${file}-${TESTNAME}.txt > descdist-${file}-${TESTNAME}.txt - awk -vCOL=4 -f ${DIR_AWK}/desc-to-heat.awk tmp/descdist-${file}-${TESTNAME}.txt >> descdist-${file}-${TESTNAME}.txt - awk -vCOL=5 -f ${DIR_AWK}/desc-to-heat.awk tmp/descdist-${file}-${TESTNAME}.txt >> descdist-${file}-${TESTNAME}.txt - awk -vCOL=6 -f ${DIR_AWK}/desc-to-heat.awk tmp/descdist-${file}-${TESTNAME}.txt >> descdist-${file}-${TESTNAME}.txt - awk -vCOL=7 -f ${DIR_AWK}/desc-to-heat.awk tmp/descdist-${file}-${TESTNAME}.txt >> descdist-${file}-${TESTNAME}.txt - awk -vCOL=8 -f ${DIR_AWK}/desc-to-heat.awk tmp/descdist-${file}-${TESTNAME}.txt >> descdist-${file}-${TESTNAME}.txt - awk -vCOL=9 -f ${DIR_AWK}/desc-to-heat.awk tmp/descdist-${file}-${TESTNAME}.txt >> descdist-${file}-${TESTNAME}.txt - awk -vCOL=10 -f ${DIR_AWK}/desc-to-heat.awk tmp/descdist-${file}-${TESTNAME}.txt >> descdist-${file}-${TESTNAME}.txt +make_plots() { + local dest=$1 + for file in ${FILES} ; do echo "Calling gnuplot (pixdist)" - echo "set title \"L2 distance between pixels, PopSift ${TESTNAME} vs VLFeat" > cmd.gp + echo "set title \"L2 distance between pixels, ${dest}" > cmd.gp echo "set xlabel \"Keypoint index sorted by closest best match\"" >> cmd.gp echo "set logscale y" >> cmd.gp echo "set terminal png" >> cmd.gp - echo "set output \"sort-${file}-by-pixdist-${TESTNAME}.png\"" >> cmd.gp - echo "plot \"tmp/sort-${file}-by-pixdist.txt\" using (\$6+0.00001) notitle" >> cmd.gp + echo "set output \"${dest}/sort-${file}-by-pixdist.png\"" >> cmd.gp + echo "plot \"${dest}/tmp/sort-${file}-by-pixdist.txt\" using (\$6+0.00001) notitle" >> cmd.gp ${CMD_GP} cmd.gp echo "Calling gnuplot (1st dist)" - echo "set title \"L2 distance between descriptors, PopSift ${TESTNAME} vs VLFeat" > cmd.gp + echo "set title \"L2 distance between descriptors, ${dest}" > cmd.gp echo "set xlabel \"Keypoint index sorted by closest best match\"" >> cmd.gp echo "set terminal png" >> cmd.gp echo "set output \"/dev/null\"" >> cmd.gp - echo "plot \"tmp/sort-${file}-by-1st-match.txt\" using 3 title \"best distance\"" >> cmd.gp - echo "replot \"tmp/sort-${file}-by-1st-match.txt\" using 10 title \"2nd best distance\"" >> cmd.gp - echo "set output \"sort-${file}-by-1st-match-${TESTNAME}.png\"" >> cmd.gp + echo "plot \"${dest}/tmp/sort-${file}-by-1st-match.txt\" using 3 title \"best distance\"" >> cmd.gp + echo "replot \"${dest}/tmp/sort-${file}-by-1st-match.txt\" using 10 title \"2nd best distance\"" >> cmd.gp + echo "set output \"${dest}/sort-${file}-by-1st-match.png\"" >> cmd.gp echo "replot" >> cmd.gp ${CMD_GP} cmd.gp echo "Calling gnuplot for angular diff (1st dist)" - echo "set title \"Distance in degree between orientations, PopSift ${TESTNAME} vs VLFeat" > cmd.gp + echo "set title \"Distance in degree between orientations, ${dest}" > cmd.gp echo "set ylabel \"Difference (degree)\"" >> cmd.gp echo "set xlabel \"Keypoint index sorted by orientation difference\"" >> cmd.gp echo "set grid" >> cmd.gp @@ -137,18 +198,18 @@ do echo "set yrange [0.001:*]" >> cmd.gp echo "set style data lines" >> cmd.gp echo "set terminal png" >> cmd.gp - echo "set output \"sort-${file}-by-angle-${TESTNAME}.png\"" >> cmd.gp - echo "plot \"tmp/sort-${file}-by-angle.txt\" using 8 notitle" >> cmd.gp + echo "set output \"${dest}/sort-${file}-by-angle.png\"" >> cmd.gp + echo "plot \"${dest}/tmp/sort-${file}-by-angle.txt\" using 8 notitle" >> cmd.gp ${CMD_GP} cmd.gp echo "Calling gnuplot (2nd dist)" - echo "set title \"L2 distance between descriptors, PopSift ${TESTNAME} vs VLFeat" > cmd.gp + echo "set title \"L2 distance between descriptors, ${dest}" > cmd.gp echo "set xlabel \"Keypoint index sorted by 2nd best match\"" >> cmd.gp echo "set terminal png" >> cmd.gp echo "set output \"/dev/null\"" >> cmd.gp - echo "plot \"tmp/sort-${file}-by-2nd-match.txt\" using 3 title \"best distance\"" >> cmd.gp - echo "replot \"tmp/sort-${file}-by-2nd-match.txt\" using 10 title \"2nd best distance\"" >> cmd.gp - echo "set output \"sort-${file}-by-2nd-match-${TESTNAME}.png\"" >> cmd.gp + echo "plot \"${dest}/tmp/sort-${file}-by-2nd-match.txt\" using 3 title \"best distance\"" >> cmd.gp + echo "replot \"${dest}/tmp/sort-${file}-by-2nd-match.txt\" using 10 title \"2nd best distance\"" >> cmd.gp + echo "set output \"${dest}/sort-${file}-by-2nd-match.png\"" >> cmd.gp echo "replot" >> cmd.gp ${CMD_GP} cmd.gp @@ -165,27 +226,128 @@ do echo "unset colorbox" >> cmd.gp echo "set pm3d implicit at s" >> cmd.gp echo "set terminal png size 1080, 250" >> cmd.gp - echo "set output \"descdist-${file}-${TESTNAME}.png\"" >> cmd.gp + echo "set output \"${dest}/descdist-${file}.png\"" >> cmd.gp echo "set multiplot layout 1,8 title '8 bins in each of 16 sections'" >> cmd.gp echo "set title 'bin 0 '" >> cmd.gp - echo "splot \"descdist-${file}-${TESTNAME}.txt\" index 0 using 1:2:3:3 with pm3d notitle" >> cmd.gp + echo "splot \"${dest}/descdist-${file}.txt\" index 0 using 1:2:3:3 with pm3d notitle" >> cmd.gp echo "set title 'bin 1 '" >> cmd.gp - echo "splot \"descdist-${file}-${TESTNAME}.txt\" index 1 using 1:2:3:3 with pm3d notitle" >> cmd.gp + echo "splot \"${dest}/descdist-${file}.txt\" index 1 using 1:2:3:3 with pm3d notitle" >> cmd.gp echo "set title 'bin 2 '" >> cmd.gp - echo "splot \"descdist-${file}-${TESTNAME}.txt\" index 2 using 1:2:3:3 with pm3d notitle" >> cmd.gp + echo "splot \"${dest}/descdist-${file}.txt\" index 2 using 1:2:3:3 with pm3d notitle" >> cmd.gp echo "set title 'bin 3 '" >> cmd.gp - echo "splot \"descdist-${file}-${TESTNAME}.txt\" index 3 using 1:2:3:3 with pm3d notitle" >> cmd.gp + echo "splot \"${dest}/descdist-${file}.txt\" index 3 using 1:2:3:3 with pm3d notitle" >> cmd.gp echo "set title 'bin 4 '" >> cmd.gp - echo "splot \"descdist-${file}-${TESTNAME}.txt\" index 4 using 1:2:3:3 with pm3d notitle" >> cmd.gp + echo "splot \"${dest}/descdist-${file}.txt\" index 4 using 1:2:3:3 with pm3d notitle" >> cmd.gp echo "set title 'bin 5 '" >> cmd.gp - echo "splot \"descdist-${file}-${TESTNAME}.txt\" index 5 using 1:2:3:3 with pm3d notitle" >> cmd.gp + echo "splot \"${dest}/descdist-${file}.txt\" index 5 using 1:2:3:3 with pm3d notitle" >> cmd.gp echo "set title 'bin 6 '" >> cmd.gp - echo "splot \"descdist-${file}-${TESTNAME}.txt\" index 6 using 1:2:3:3 with pm3d notitle" >> cmd.gp + echo "splot \"${dest}/descdist-${file}.txt\" index 6 using 1:2:3:3 with pm3d notitle" >> cmd.gp echo "set title 'bin 7 '" >> cmd.gp - echo "splot \"descdist-${file}-${TESTNAME}.txt\" index 7 using 1:2:3:3 with pm3d notitle" >> cmd.gp + echo "splot \"${dest}/descdist-${file}.txt\" index 7 using 1:2:3:3 with pm3d notitle" >> cmd.gp ${CMD_GP} cmd.gp rm -f cmd.gp done +} + +mkdir -p tmp + +declare -A configurations +parse_commandline $* + +for i in "${!configurations[@]}" +do + echo " Conf $i : ${configurations[$i]}" done +prepare_testfiles "${FILES}" + +echo "################################################################################" +echo "All configurations: ${configurations[@]}" +echo "################################################################################" + +for TESTNAME in "${configurations[@]}" ; do + rm -f hash.sift* + rm -f level1.sift* + rm -f coord-* + rm -f sum-* + + for file in ${FILES} ; do + if [ "$TESTNAME" = "vlfeat-default" ]; then + echo "Test is vlfeat" + PAR1=" " + run_vlfeat ${TESTNAME} ${file} ${PAR1} + elif [ "$TESTNAME" = "popsift-default" ]; then + echo "Test is popsift-default" + PAR1=" " + run_popsift ${TESTNAME} ${file} ${PAR1} + elif [ "$TESTNAME" = "popsift-loop" ]; then + echo "Test is popsift-loop: --desc-mode=loop" + PAR1="--desc-mode=loop" + run_popsift ${TESTNAME} ${file} ${PAR1} + elif [ "$TESTNAME" = "popsift-vlfeat" ]; then + echo "Test is popsift-vlfeat: --desc-mode=vlfeat" + PAR1="--desc-mode=vlfeat" + run_popsift ${TESTNAME} ${file} ${PAR1} + else + echo "Test is undefined, $TESTNAME" + exit + fi + done +done + +echo "################################################################################" +echo "Configurations valid for comparison" +echo "################################################################################" + +for left in "${configurations[@]}" ; do + mustCompare=0 + for right in "${configurations[@]}" ; do + if [ ${mustCompare} -eq 1 ] ; then + echo " $left vs $right" + else + if [ "$left" == "$right" ] ; then + mustCompare=1 + fi + fi + done +done + +echo "################################################################################" +echo "Performing brute-force matching" +echo "################################################################################" + +for left in "${configurations[@]}" ; do + mustCompare=0 + for right in "${configurations[@]}" ; do + if [ ${mustCompare} -eq 1 ] ; then + compare_configurations $left $right + else + if [ "$left" == "$right" ] ; then + mustCompare=1 + fi + fi + done +done + +echo "################################################################################" +echo "Creating output" +echo "################################################################################" + +for left in "${configurations[@]}" ; do + mustCompare=0 + for right in "${configurations[@]}" ; do + if [ ${mustCompare} -eq 1 ] ; then + sort_configuration "$left-$right" + make_distance_stats "$left-$right" + make_plots "$left-$right" + else + if [ "$left" == "$right" ] ; then + mustCompare=1 + fi + fi + done +done + +exit 0 + From 9ed7e695638858621f43139949052824eb68afd0 Mon Sep 17 00:00:00 2001 From: Carsten Griwodz Date: Tue, 9 Mar 2021 18:38:42 +0100 Subject: [PATCH 16/30] [popsift] introduce CUDA cooperative groups 4x speed vlfeat descriptor extraction --- CMakeLists.txt | 6 +- cmake/sift_config.h.in | 12 ++-- src/CMakeLists.txt | 1 + src/popsift/s_desc_vlfeat.cu | 98 +++++++++++++++++++++--------- src/popsift/s_gradiant.h | 37 +++++++++++ src/popsift/s_gradiant_cuda9plus.h | 54 ++++++++++++++++ 6 files changed, 173 insertions(+), 35 deletions(-) create mode 100644 src/popsift/s_gradiant_cuda9plus.h diff --git a/CMakeLists.txt b/CMakeLists.txt index f39f2fec..4e530169 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -181,9 +181,11 @@ else() endif() if(CUDA_VERSION VERSION_GREATER_EQUAL "9.0") - set(HAVE_SHFL_DOWN_SYNC 1) + set(PopSift_HAVE_SHFL_DOWN_SYNC 1) + set(PopSift_HAVE_COOPERATIVE_GROUPS 1) else() - set(HAVE_SHFL_DOWN_SYNC 0) + set(PopSift_HAVE_SHFL_DOWN_SYNC 0) + set(PopSift_HAVE_COOPERATIVE_GROUPS 0) endif() if(NOT PopSift_USE_GRID_FILTER) diff --git a/cmake/sift_config.h.in b/cmake/sift_config.h.in index 427cfe42..beeed700 100644 --- a/cmake/sift_config.h.in +++ b/cmake/sift_config.h.in @@ -8,11 +8,13 @@ #pragma once -#define POPSIFT_IS_DEFINED(F) F() == 1 +#define POPSIFT_IS_DEFINED(F) F() == 1 +#define POPSIFT_IS_UNDEFINED(F) F() == 0 -#define POPSIFT_HAVE_SHFL_DOWN_SYNC() @HAVE_SHFL_DOWN_SYNC@ -#define POPSIFT_HAVE_NORMF() @PopSift_HAVE_NORMF@ -#define POPSIFT_DISABLE_GRID_FILTER() @DISABLE_GRID_FILTER@ -#define POPSIFT_USE_NVTX() @PopSift_USE_NVTX@ +#define POPSIFT_HAVE_SHFL_DOWN_SYNC() @PopSift_HAVE_SHFL_DOWN_SYNC@ +#define POPSIFT_HAVE_NORMF() @PopSift_HAVE_NORMF@ +#define POPSIFT_DISABLE_GRID_FILTER() @DISABLE_GRID_FILTER@ +#define POPSIFT_USE_NVTX() @PopSift_USE_NVTX@ +#define POPSIFT_HAVE_COOPERATIVE_GROUPS() @PopSift_HAVE_COOPERATIVE_GROUPS@ diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index db7aeadc..301c6cd5 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -28,6 +28,7 @@ CUDA_ADD_LIBRARY(popsift popsift/s_desc_norm_l2.h popsift/s_desc_normalize.h popsift/s_gradiant.h + popsift/s_gradiant_cuda9plus.h popsift/s_solve.h popsift/common/assist.cu popsift/common/assist.h popsift/common/clamp.h diff --git a/src/popsift/s_desc_vlfeat.cu b/src/popsift/s_desc_vlfeat.cu index 375fb7eb..f714f85d 100644 --- a/src/popsift/s_desc_vlfeat.cu +++ b/src/popsift/s_desc_vlfeat.cu @@ -6,6 +6,8 @@ * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ +#include "popsift/sift_config.h" + #include "common/assist.h" #include "common/debug_macros.h" #include "common/vec_macros.h" @@ -62,27 +64,62 @@ void ext_desc_vlfeat_sub( const float ang, const int ymin = max(1, (int)floorf(y - pty - bsz)); const int xmax = min(width - 2, (int)floorf(x + ptx + bsz)); const int ymax = min(height - 2, (int)floorf(y + pty + bsz)); - __shared__ float dpt[4][128]; + + __shared__ float dpt[128]; + +#if POPSIFT_IS_DEFINED(POPSIFT_HAVE_COOPERATIVE_GROUPS) + cg::thread_block block = cg::this_thread_block(); + cg::thread_block_tile<32> tile = cg::tiled_partition<32>( block ); + + for( int i=tile.thread_rank(); i<128; i+=tile.size() ) + { + dpt[i] = 0.0f; + } + + tile.sync(); +#else for( int i=threadIdx.x; i<128; i+=blockDim.x ) { - dpt[0][i] = 0.0f; - dpt[1][i] = 0.0f; - dpt[2][i] = 0.0f; - dpt[3][i] = 0.0f; + dpt[i] = 0.0f; } - __syncthreads(); - // we have 32 threads in a warp, how to use them? - const int loop = threadIdx.x >> 3; // 0 - 3 - const int xstart = ( threadIdx.x & 0x7 ); - const int xstep = 8; + __syncthreads(); +#endif - for( int pix_y = ymin+loop; popsift::any(pix_y <= ymax); pix_y += 4 ) + for( int pix_y = ymin; pix_y <= ymax; pix_y += 1 ) { - for( int pix_x = xmin+xstart; popsift::any(pix_x <= xmax); pix_x+=xstep ) + for( int base_x = xmin; base_x <= xmax; base_x += 32 ) { + float mod; + float th; + +#if POPSIFT_IS_DEFINED(POPSIFT_HAVE_COOPERATIVE_GROUPS) + get_gradiant32( tile, mod, th, base_x, pix_y, layer_tex, level ); +#else + get_gradiant32( mod, th, base_x, pix_y, layer_tex, level ); +#endif + + mod /= 2.0f; // Our mod is double that of vlfeat. Huh. + + th -= ang; + while( th > M_PI2 ) th -= M_PI2; + while( th < 0.0f ) th += M_PI2; +#if POPSIFT_IS_DEFINED(POPSIFT_HAVE_COOPERATIVE_GROUPS) + tile.sync(); + + const int pix_x = base_x + tile.thread_rank(); +#else + __syncthreads(); + + const int pix_x = base_x + threadIdx.x; +#endif + if( ( pix_y <= ymax ) && ( pix_x <= xmax ) ) { +#if POPSIFT_IS_DEFINED(POPSIFT_HAVE_COOPERATIVE_GROUPS) + cg::coalesced_group co_tile = cg::coalesced_threads(); +#endif + // d : distance from keypoint const float2 d = make_float2( pix_x - x, pix_y - y ); @@ -90,26 +127,15 @@ void ext_desc_vlfeat_sub( const float ang, const float2 n = make_float2( ::fmaf( crsbp, d.x, srsbp * d.y ), ::fmaf( crsbp, d.y, -srsbp * d.x ) ); - float mod; - float th; - - get_gradiant( mod, th, pix_x, pix_y, layer_tex, level ); - - mod /= 2; // Our mod is double that of vlfeat. Huh. - const float ww = __expf( -scalbnf(n.x*n.x + n.y*n.y, -3)); - th -= ang; - while( th > M_PI2 ) th -= M_PI2; - while( th < 0.0f ) th += M_PI2; - const float nt = 8.0f * th / M_PI2; // neighbouring tile on the lower side: -2, -1, 0 or 1 // (must use floorf because casting rounds towards zero const int3 t0 = make_int3( (int)floorf(n.x - 0.5f), - (int)floorf(n.y - 0.5f), - (int)nt ); + (int)floorf(n.y - 0.5f), + (int)nt ); const float wgt_x = - ( n.x - ( t0.x + 0.5f ) ); const float wgt_y = - ( n.y - ( t0.y + 0.5f ) ); const float wgt_t = - ( nt - t0.z ); @@ -144,21 +170,37 @@ void ext_desc_vlfeat_sub( const float ang, + ( t0.x + tx ) * 8 + ( t0.z + tt ) % 8; - atomicAdd( &dpt[loop][offset], val ); + atomicAdd( &dpt[offset], val ); } + +#if POPSIFT_IS_DEFINED(POPSIFT_HAVE_COOPERATIVE_GROUPS) + co_tile.sync(); +#else + // cannot be done before CUDA 9 +#endif } } } } +#if POPSIFT_IS_DEFINED(POPSIFT_HAVE_COOPERATIVE_GROUPS) + tile.sync(); +#else __syncthreads(); +#endif } } +#if POPSIFT_IS_DEFINED(POPSIFT_HAVE_COOPERATIVE_GROUPS) + for( int i=tile.thread_rank(); i<128; i+=tile.size() ) + { + features[i] = dpt[i]; + } +#else for( int i=threadIdx.x; i<128; i+=blockDim.x ) { - float f = dpt[0][i] + dpt[1][i] + dpt[2][i] + dpt[3][i]; - features[i] = f; + features[i] = dpt[i]; } +#endif } __global__ void ext_desc_vlfeat(int octave, cudaTextureObject_t layer_tex, int w, int h) diff --git a/src/popsift/s_gradiant.h b/src/popsift/s_gradiant.h index aaec9e2d..3f648cf5 100644 --- a/src/popsift/s_gradiant.h +++ b/src/popsift/s_gradiant.h @@ -14,6 +14,10 @@ #include #include +#include "popsift/sift_config.h" + +#include "s_gradiant_cuda9plus.h" // functions that work with CUDA SDK 9 and later + namespace popsift { /* @@ -68,6 +72,39 @@ void get_gradiant( float& grad, theta = atan2f(dy, dx); } +#if POPSIFT_IS_UNDEFINED(POPSIFT_HAVE_COOPERATIVE_GROUPS) +/* A version of get_gradiant that works for a (32,1,1) threadblock + * and pulls data to shared memory before computing. Data is pulled + * less frequently, meaning that we do not rely on the L1 cache. + */ +__device__ static inline +void get_gradiant32( float& grad, + float& theta, + const int x, + const int y, + cudaTextureObject_t layer, + const int level ) +{ + const int idx = threadIdx.x; + + __shared__ float x_array[34]; + + for( int i=idx; i<34; i += blockDim.x ) + { + x_array[i] = readTex( layer, x+i-1.0f, y, level ); + } + __syncthreads(); + + const float dx = x_array[idx+2] - x_array[idx]; + + const float dy = readTex( layer, x+idx, y+1.0f, level ) + - readTex( layer, x+idx, y-1.0f, level ); + + grad = hypotf( dx, dy ); // __fsqrt_rz(dx*dx + dy*dy); + theta = atan2f(dy, dx); +} +#endif + __device__ static inline void get_gradiant( float& grad, float& theta, diff --git a/src/popsift/s_gradiant_cuda9plus.h b/src/popsift/s_gradiant_cuda9plus.h new file mode 100644 index 00000000..6fe24623 --- /dev/null +++ b/src/popsift/s_gradiant_cuda9plus.h @@ -0,0 +1,54 @@ +/* + * Copyright 2021, University of Oslo + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ +#pragma once + +#if POPSIFT_IS_DEFINED(POPSIFT_HAVE_COOPERATIVE_GROUPS) + +#include + +namespace cg = cooperative_groups; + +namespace popsift +{ + +/* A version of get_gradiant that works for a (32,1,1) threadblock + * and pulls data to shared memory before computing. Data is pulled + * less frequently, meaning that we do not rely on the L1 cache. + */ +__device__ static inline +void get_gradiant32( cg::thread_block_tile<32>& tile, + float& grad, + float& theta, + const int x, + const int y, + cudaTextureObject_t layer, + const int level ) +{ + const int idx = tile.thread_rank(); + + __shared__ float x_array[34]; + + for( int i=idx; i<34; i += tile.size() ) + { + x_array[i] = readTex( layer, x+i-1.0f, y, level ); + } + tile.sync(); + + const float dx = x_array[idx+2] - x_array[idx]; + + const float dy = readTex( layer, x+idx, y+1.0f, level ) + - readTex( layer, x+idx, y-1.0f, level ); + + grad = hypotf( dx, dy ); // __fsqrt_rz(dx*dx + dy*dy); + theta = atan2f(dy, dx); +} + +}; // namespace popsift + +#endif // POPSIFT_HAVE_COOPERATIVE_GROUPS + From 46c63a6e53b10588cf408a6cb139307cd9fcd096 Mon Sep 17 00:00:00 2001 From: Carsten Griwodz Date: Wed, 10 Mar 2021 10:22:28 +0100 Subject: [PATCH 17/30] [testing] minor speed improvements by manually finding min1/min2 --- src/compareSiftFiles/csf_feat.cpp | 128 +++++++++++++++++++----------- 1 file changed, 80 insertions(+), 48 deletions(-) diff --git a/src/compareSiftFiles/csf_feat.cpp b/src/compareSiftFiles/csf_feat.cpp index 2dfe067b..1a3758c8 100755 --- a/src/compareSiftFiles/csf_feat.cpp +++ b/src/compareSiftFiles/csf_feat.cpp @@ -19,10 +19,8 @@ const float M_PI2 = 2.0f * 3.14159265358979323846f; bool feat_t::_use_l2_distance = true; -static float dist( const feat_t& l, const feat_t& r ) +inline static float sum_of_squares( const feat_t& l, const feat_t& r ) { - if( feat_t::_use_l2_distance ) - { #ifdef HAVE_STD_TRANSFORM_REDUCE /* From C++17 numeric: template& l_one, vector& desc_stats, bool minOnly ) const +struct Find2Best { - const int l_one_sz = l_one.size(); + float min1 { std::numeric_limits::infinity() }; + float min2 { std::numeric_limits::infinity() }; - vector distances( l_one_sz ); + int idx1; + int idx2; - if( !minOnly ) ostr << "==========" << endl; - - const feat_t& left( *this ); + inline void update( const int& idx, const float& val ) + { + if( val < min1 ) + { + min2 = min1; + idx2 = idx1; + min1 = val; + idx1 = idx; + } + else if( val < min2 ) + { + min2 = val; + idx2 = idx; + } + } - for( int j=0; j& l_one, vector& desc_stats, bool minOnly ) const +{ + const int l_one_sz = l_one.size(); + + if( !minOnly ) ostr << "==========" << endl; + + const feat_t& left( *this ); if( minOnly ) { - auto it = distances.begin(); - float second = INFINITY; - const vector& left = desc; + Find2Best find2best; for( int j=0; j *m ) + find2best.takeSqrt(); + + const feat_t& r = l_one[find2best.idx1]; + + ostr << "desc dist " << find2best.min1 + << " MIN" + << " pixdist " << sqrtf( (x-r.x)*(x-r.x) + (y-r.y)*(y-r.y) ) + << " angledist " << fabsf( ori/M_PI2*360.0f - r.ori/M_PI2*360.0f ); + + if( dstr ) + { + auto right = r.desc; + for( int i=0; i( second, *it ); + const float d = left.desc[i] - right[i]; + (*dstr) << d << " "; + desc_stats[i] += d; } - it++; + (*dstr) << endl; } - ostr << " 2best " << second << endl; + ostr << " 2best " << find2best.min2 << endl; } else { + vector distances( l_one_sz ); + + for( int j=0; j Date: Tue, 16 Mar 2021 13:51:30 +0100 Subject: [PATCH 18/30] [debug] moved kernels out of namespace to make profiling possible --- CMakeLists.txt | 2 ++ src/popsift/s_pyramid_build.cu | 4 ++-- src/popsift/s_pyramid_build_aa.cu | 32 +++++++++++++++---------------- src/popsift/s_pyramid_build_aa.h | 8 ++++---- 4 files changed, 24 insertions(+), 22 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 4e530169..ccb416f5 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -145,6 +145,8 @@ if(MSVC) list(APPEND CUDA_NVCC_FLAGS -Xcompiler ${PopSift_MVSC_LINKER}) endif() +list(APPEND CUDA_NVCC_FLAGS "-lineinfo") + # default stream per-thread implies that each host thread has one non-synchronizing 0-stream # currently, the code requires legacy mode list(APPEND CUDA_NVCC_FLAGS "--default-stream;legacy") diff --git a/src/popsift/s_pyramid_build.cu b/src/popsift/s_pyramid_build.cu index 8873ca5c..c8a3a3ee 100755 --- a/src/popsift/s_pyramid_build.cu +++ b/src/popsift/s_pyramid_build.cu @@ -279,7 +279,7 @@ inline void Pyramid::horiz_from_prev_level( int octave, int level, cudaStream_t grid.x = grid_divide( width, 32 ); grid.y = grid_divide( height, block.y ); - gauss::absoluteSource::horiz + as_horiz <<>> ( oct_obj.getDataTexPoint( ), oct_obj.getIntermediateSurface( ), @@ -345,7 +345,7 @@ inline void Pyramid::vert_from_interm( int octave, int level, cudaStream_t strea grid.x = (unsigned int)grid_divide( width, block.x ); grid.y = (unsigned int)grid_divide( height, block.y ); - gauss::absoluteSource::vert + as_vert <<>> ( oct_obj.getIntermDataTexPoint( ), oct_obj.getDataSurface( ), diff --git a/src/popsift/s_pyramid_build_aa.cu b/src/popsift/s_pyramid_build_aa.cu index c026a8b7..bc8382ed 100755 --- a/src/popsift/s_pyramid_build_aa.cu +++ b/src/popsift/s_pyramid_build_aa.cu @@ -10,24 +10,20 @@ #include "s_pyramid_build_aa.h" #include "sift_constants.h" -namespace popsift { -namespace gauss { -namespace absoluteSource { - -__global__ void horiz(cudaTextureObject_t src_point_texture, cudaSurfaceObject_t dst_data, int dst_level) +__global__ void as_horiz(cudaTextureObject_t src_point_texture, cudaSurfaceObject_t dst_data, int dst_level) { const int src_level = dst_level - 1; - const int span = d_gauss.inc.span[dst_level]; - const float* filter = &d_gauss.inc.filter[dst_level*GAUSS_ALIGN]; + const int span = popsift::d_gauss.inc.span[dst_level]; + const float* filter = &popsift::d_gauss.inc.filter[dst_level*GAUSS_ALIGN]; const int off_x = blockIdx.x * blockDim.x + threadIdx.x; const int off_y = blockIdx.y * blockDim.y + threadIdx.y; float out = 0.0f; - float A = readTex( src_point_texture, off_x - span, off_y, src_level ); - float B = readTex( src_point_texture, off_x + span, off_y, src_level ); - float C = readTex( src_point_texture, off_x , off_y, src_level ); + float A = popsift::readTex( src_point_texture, off_x - span, off_y, src_level ); + float B = popsift::readTex( src_point_texture, off_x + span, off_y, src_level ); + float C = popsift::readTex( src_point_texture, off_x , off_y, src_level ); float g = filter[0]; out += C * g; g = filter[span]; @@ -49,10 +45,10 @@ __global__ void horiz(cudaTextureObject_t src_point_texture, cudaSurfaceObject_t surf2DLayeredwrite( out, dst_data, off_x*4, off_y, dst_level, cudaBoundaryModeZero ); } -__global__ void vert(cudaTextureObject_t src_point_texture, cudaSurfaceObject_t dst_data, int dst_level) +__global__ void as_vert(cudaTextureObject_t src_point_texture, cudaSurfaceObject_t dst_data, int dst_level) { - const int span = d_gauss.inc.span[dst_level]; - const float* filter = &d_gauss.inc.filter[dst_level*GAUSS_ALIGN]; + const int span = popsift::d_gauss.inc.span[dst_level]; + const float* filter = &popsift::d_gauss.inc.filter[dst_level*GAUSS_ALIGN]; int block_x = blockIdx.x * blockDim.x; int block_y = blockIdx.y * blockDim.y; int idx = threadIdx.x; @@ -66,17 +62,17 @@ __global__ void vert(cudaTextureObject_t src_point_texture, cudaSurfaceObject_t g = filter[offset]; idy = threadIdx.y - offset; - val = readTex( src_point_texture, block_x + idx, block_y + idy, dst_level ); + val = popsift::readTex( src_point_texture, block_x + idx, block_y + idy, dst_level ); out += ( val * g ); idy = threadIdx.y + offset; - val = readTex( src_point_texture, block_x + idx, block_y + idy, dst_level ); + val = popsift::readTex( src_point_texture, block_x + idx, block_y + idy, dst_level ); out += ( val * g ); } g = filter[0]; idy = threadIdx.y; - val = readTex( src_point_texture, block_x + idx, block_y + idy, dst_level ); + val = popsift::readTex( src_point_texture, block_x + idx, block_y + idy, dst_level ); out += ( val * g ); idx = block_x+threadIdx.x; @@ -85,6 +81,10 @@ __global__ void vert(cudaTextureObject_t src_point_texture, cudaSurfaceObject_t surf2DLayeredwrite( out, dst_data, idx*4, idy, dst_level, cudaBoundaryModeZero ); } +namespace popsift { +namespace gauss { +namespace absoluteSource { + __global__ void vert_abs0(cudaTextureObject_t src_point_texture, cudaSurfaceObject_t dst_data, int dst_level) { const int span = d_gauss.abs_o0.span[dst_level]; diff --git a/src/popsift/s_pyramid_build_aa.h b/src/popsift/s_pyramid_build_aa.h index 4d3423cf..c09d0ef2 100755 --- a/src/popsift/s_pyramid_build_aa.h +++ b/src/popsift/s_pyramid_build_aa.h @@ -7,14 +7,14 @@ */ #include "common/plane_2d.h" +__global__ void as_horiz(cudaTextureObject_t src_point_texture, cudaSurfaceObject_t dst_data, int dst_level); + +__global__ void as_vert(cudaTextureObject_t src_point_texture, cudaSurfaceObject_t dst_data, int dst_level); + namespace popsift { namespace gauss { namespace absoluteSource { -__global__ void horiz(cudaTextureObject_t src_point_texture, cudaSurfaceObject_t dst_data, int dst_level); - -__global__ void vert(cudaTextureObject_t src_point_texture, cudaSurfaceObject_t dst_data, int dst_level); - __global__ void vert_abs0(cudaTextureObject_t src_point_texture, cudaSurfaceObject_t dst_data, int dst_level); __global__ void vert_all_abs0(cudaTextureObject_t src_point_texture, From 116eb2501622498a48dbd503725098a76dc9124e Mon Sep 17 00:00:00 2001 From: Carsten Griwodz Date: Fri, 2 Jul 2021 09:34:46 +0200 Subject: [PATCH 19/30] [feature] FeaturesDev::matchAndReturn gives you brute-forced matches on CPU and GPU --- src/popsift/features.cu | 35 +++++++++++++++++++++++++++++------ src/popsift/features.h | 24 +++++++++++++++++++++--- 2 files changed, 50 insertions(+), 9 deletions(-) diff --git a/src/popsift/features.cu b/src/popsift/features.cu index d789c441..87245af7 100755 --- a/src/popsift/features.cu +++ b/src/popsift/features.cu @@ -151,9 +151,9 @@ void FeaturesDev::reset( int num_ext, int num_ori ) if( _ori != nullptr ) { cudaFree( _ori ); _ori = nullptr; } if( _rev != nullptr ) { cudaFree( _rev ); _rev = nullptr; } - _ext = popsift::cuda::malloc_devT ( num_ext, __FILE__, __LINE__ ); - _ori = popsift::cuda::malloc_devT( num_ori, __FILE__, __LINE__ ); - _rev = popsift::cuda::malloc_devT ( num_ori, __FILE__, __LINE__ ); + _ext = popsift::cuda::malloc_mgdT ( num_ext, __FILE__, __LINE__ ); + _ori = popsift::cuda::malloc_mgdT( num_ori, __FILE__, __LINE__ ); + _rev = popsift::cuda::malloc_mgdT ( num_ori, __FILE__, __LINE__ ); setFeatureCount( num_ext ); setDescriptorCount( num_ori ); @@ -240,9 +240,9 @@ show_distance( int3* match_matrix, const float4* lptr = (const float4*)( &l_ori[i] ); const float4* rptr1 = (const float4*)( &r_ori[match_matrix[i].x] ); const float4* rptr2 = (const float4*)( &r_ori[match_matrix[i].y] ); - float d1 = l2_in_t0( lptr, rptr1 ); - float d2 = l2_in_t0( lptr, rptr2 ); - if( threadIdx.x == 0 ) + float d1 = l2_in_t0( lptr, rptr1 ); + float d2 = l2_in_t0( lptr, rptr2 ); + if( threadIdx.x == 0 ) { if( match_matrix[i].z ) printf( "accept feat %4d [%4d] matches feat %4d [%4d] ( 2nd feat %4d [%4d] ) dist %.3f vs %.3f\n", @@ -300,6 +300,29 @@ void FeaturesDev::match( FeaturesDev* other ) cudaFree( match_matrix ); } +int3* FeaturesDev::matchAndReturn( FeaturesDev* other ) +{ + int l_len = getDescriptorCount( ); + int r_len = other->getDescriptorCount( ); + + int3* match_matrix = popsift::cuda::malloc_mgdT( l_len, __FILE__, __LINE__ ); + + dim3 grid; + grid.x = l_len; + grid.y = 1; + grid.z = 1; + dim3 block; + block.x = 32; + block.y = 1; + block.z = 1; + + compute_distance + <<>> + ( match_matrix, getDescriptors(), l_len, other->getDescriptors(), r_len ); + + return match_matrix; +} + /************************************************************* * Feature *************************************************************/ diff --git a/src/popsift/features.h b/src/popsift/features.h index a05a8197..e6c721f0 100755 --- a/src/popsift/features.h +++ b/src/popsift/features.h @@ -103,9 +103,9 @@ std::ostream& operator<<( std::ostream& ostr, const FeaturesHost& feature ); class FeaturesDev : public FeaturesBase { - Feature* _ext; - Descriptor* _ori; - int* _rev; // the reverse map from descriptors to extrema + Feature* _ext; // array of extrema + Descriptor* _ori; // array of desciptors + int* _rev; // the reverse map from descriptors to extrema public: FeaturesDev( ); @@ -114,8 +114,26 @@ class FeaturesDev : public FeaturesBase void reset( int num_ext, int num_ori ); + /** This function performs one-directional brute force matching on + * the GPU between the Descriptors in this objects and the object + * other. + * The resulting matches are printed. + */ void match( FeaturesDev* other ); + /** This function performs one-directional brute force matching on + * the GPU between the Descriptors in this objects and the object + * other. + * The resulting matches are returned in an array of int3 that must + * be released with a call to cudaFree(). + * The length of the array is this->getDescriptorCount(). + * For each element: + * int3.x is the index of a descriptor in this->getDescriptors() + * int3.y is an index in other->getDescriptors() + * int3.z indicates if the match is valid (non-zero) or not (zero) + */ + int3* matchAndReturn( FeaturesDev* other ); + inline Feature* getFeatures() { return _ext; } inline Descriptor* getDescriptors() { return _ori; } inline int* getReverseMap() { return _rev; } From c992910b58dbf1ed0d4e896813eb5d40d1f6321f Mon Sep 17 00:00:00 2001 From: Carsten Griwodz Date: Fri, 2 Jul 2021 13:25:36 +0200 Subject: [PATCH 20/30] [demo] popsift-match changed for CPU-size match printing --- src/application/main.cpp | 2 ++ src/application/match.cpp | 20 +++++++++++++++++++- src/popsift/features.cu | 34 +++++++++++++++++++++++++++++++--- src/popsift/features.h | 12 +++++++++--- 4 files changed, 61 insertions(+), 7 deletions(-) diff --git a/src/application/main.cpp b/src/application/main.cpp index 46c99b20..31385992 100755 --- a/src/application/main.cpp +++ b/src/application/main.cpp @@ -232,6 +232,8 @@ int main(int argc, char **argv) std::cout << "PopSift version: " << POPSIFT_VERSION_STRING << std::endl; + config.setDescMode( popsift::Config::VLFeat_Desc ); + try { parseargs( argc, argv, config, inputFile ); // Parse command line std::cout << inputFile << std::endl; diff --git a/src/application/match.cpp b/src/application/match.cpp index 0898774f..400b6c7b 100755 --- a/src/application/match.cpp +++ b/src/application/match.cpp @@ -187,6 +187,8 @@ int main(int argc, char **argv) std::cout << "PopSift version: " << POPSIFT_VERSION_STRING << std::endl; + config.setDescMode( popsift::Config::VLFeat_Desc ); + try { parseargs( argc, argv, config, lFile, rFile ); // Parse command line std::cout << lFile << " <-> " << rFile << std::endl; @@ -227,7 +229,23 @@ int main(int argc, char **argv) cout << "Number of features: " << rFeatures->getFeatureCount() << endl; cout << "Number of descriptors: " << rFeatures->getDescriptorCount() << endl; - lFeatures->match( rFeatures ); + int3* matches = lFeatures->matchAndReturn( rFeatures ); + cudaDeviceSynchronize(); + + for( int i=0; igetDescriptorCount(); i++ ) + { + int3& match = matches[i]; + if( match.z ) + { + const popsift::Feature* l_f = lFeatures->getFeatureForDescriptor( i ); + const popsift::Feature* r_f = rFeatures->getFeatureForDescriptor( match.x ); + cout << setprecision(5) << showpoint + << "point (" << l_f->xpos << "," << l_f->ypos << ") in l matches " + << "point (" << r_f->xpos << "," << r_f->ypos << ") in r" << endl; + } + } + + cudaFree( matches ); delete lFeatures; delete rFeatures; diff --git a/src/popsift/features.cu b/src/popsift/features.cu index 87245af7..43837716 100755 --- a/src/popsift/features.cu +++ b/src/popsift/features.cu @@ -245,17 +245,25 @@ show_distance( int3* match_matrix, if( threadIdx.x == 0 ) { if( match_matrix[i].z ) - printf( "accept feat %4d [%4d] matches feat %4d [%4d] ( 2nd feat %4d [%4d] ) dist %.3f vs %.3f\n", + { + Feature* lx = &l_ext[l_fem[i]]; + Feature* rx = &r_ext[r_fem[match_matrix[i].x]]; + printf( "accept feat %4d [%4d] matches feat %4d [%4d] ( 2nd feat %4d [%4d] ) dist %.3f vs %.3f" + " (%.1f,%.1f)-(%.1f,%.1f)\n", l_fem[i], i, r_fem[match_matrix[i].x], match_matrix[i].x, r_fem[match_matrix[i].y], match_matrix[i].y, - d1, d2 ); - else + d1, d2, + lx->xpos, lx->ypos, rx->xpos, rx->ypos ); + } + else + { printf( "reject feat %4d [%4d] matches feat %4d [%4d] ( 2nd feat %4d [%4d] ) dist %.3f vs %.3f\n", l_fem[i], i, r_fem[match_matrix[i].x], match_matrix[i].x, r_fem[match_matrix[i].y], match_matrix[i].y, d1, d2 ); + } } __syncthreads(); } @@ -323,6 +331,26 @@ int3* FeaturesDev::matchAndReturn( FeaturesDev* other ) return match_matrix; } +Descriptor* FeaturesDev::getDescriptor( int descIndex ) +{ + return &_ori[descIndex]; +} + +const Descriptor* FeaturesDev::getDescriptor( int descIndex ) const +{ + return &_ori[descIndex]; +} + +Feature* FeaturesDev::getFeatureForDescriptor( int descIndex ) +{ + return &_ext[_rev[descIndex]]; +} + +const Feature* FeaturesDev::getFeatureForDescriptor( int descIndex ) const +{ + return &_ext[_rev[descIndex]]; +} + /************************************************************* * Feature *************************************************************/ diff --git a/src/popsift/features.h b/src/popsift/features.h index e6c721f0..b2c2e967 100755 --- a/src/popsift/features.h +++ b/src/popsift/features.h @@ -127,9 +127,10 @@ class FeaturesDev : public FeaturesBase * The resulting matches are returned in an array of int3 that must * be released with a call to cudaFree(). * The length of the array is this->getDescriptorCount(). - * For each element: - * int3.x is the index of a descriptor in this->getDescriptors() - * int3.y is an index in other->getDescriptors() + * For each element at position i + * i is the index of a descriptor in this->getDescriptors() + * int3.x is the index of the best match in other->getDescriptors() + * int3.y is the index of the second best match in other->getDescriptors() * int3.z indicates if the match is valid (non-zero) or not (zero) */ int3* matchAndReturn( FeaturesDev* other ); @@ -137,6 +138,11 @@ class FeaturesDev : public FeaturesBase inline Feature* getFeatures() { return _ext; } inline Descriptor* getDescriptors() { return _ori; } inline int* getReverseMap() { return _rev; } + + Descriptor* getDescriptor( int descIndex ); + const Descriptor* getDescriptor( int descIndex ) const; + Feature* getFeatureForDescriptor( int descIndex ); + const Feature* getFeatureForDescriptor( int descIndex ) const; }; } // namespace popsift From a893f439f8854219e602117e19ee02a49d08c30b Mon Sep 17 00:00:00 2001 From: Carsten Griwodz Date: Tue, 28 Sep 2021 15:22:04 +0200 Subject: [PATCH 21/30] [popsift] use CPU-side sorting in filtergrid --- src/popsift/filtergrid.cu | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/popsift/filtergrid.cu b/src/popsift/filtergrid.cu index b4b86e5b..6fa7e6b5 100755 --- a/src/popsift/filtergrid.cu +++ b/src/popsift/filtergrid.cu @@ -17,6 +17,7 @@ #include #include +#include namespace popsift { @@ -213,7 +214,7 @@ void FilterGrid::sort_by_cell_and_scale( ) * and never know the difference. */ CellScaleCompare tc( _cells, _scales ); -#if 0 +#if 1 cudaDeviceSynchronize(); int* ptr = _sorting_index; std::sort( ptr, ptr + _ct->getTotalExtrema(), tc ); From 9f78dc54bc38dddab8d6684a571521ea3c26e1ff Mon Sep 17 00:00:00 2001 From: Carsten Griwodz Date: Fri, 1 Oct 2021 08:20:13 +0200 Subject: [PATCH 22/30] [filtergrid] allow case 1x1 filter grid --- src/popsift/common/debug_macros.h | 4 ++++ src/popsift/filtergrid.cu | 31 +++++++++++++------------------ src/popsift/filtergrid.h | 2 +- src/popsift/s_orientation.cu | 2 +- 4 files changed, 19 insertions(+), 20 deletions(-) diff --git a/src/popsift/common/debug_macros.h b/src/popsift/common/debug_macros.h index 72eda129..26a92904 100755 --- a/src/popsift/common/debug_macros.h +++ b/src/popsift/common/debug_macros.h @@ -61,6 +61,10 @@ T* malloc_mgdT( int num, const char* file, int line ) { return (T*)ptr; } +inline void free_mgd( void* ptr ) { + cudaFree( ptr ); +} + void memcpy_sync( void* dst, const void* src, size_t sz, cudaMemcpyKind type, const char* file, size_t line ); diff --git a/src/popsift/filtergrid.cu b/src/popsift/filtergrid.cu index 6fa7e6b5..90f139f7 100755 --- a/src/popsift/filtergrid.cu +++ b/src/popsift/filtergrid.cu @@ -30,8 +30,6 @@ FilterGrid::FilterGrid( const Config& conf ) , _scales( 0 ) , _initial_extrema_pointers( 0 ) { - if( _slots > 1 ) - { /* refresh the exclsuive prefix sum over the per-octave extrema counts */ _histogram = popsift::cuda::malloc_mgdT( MAX_OCTAVES*_slots, __FILE__, __LINE__); _histogram_full = popsift::cuda::malloc_mgdT( _slots, __FILE__, __LINE__); @@ -40,21 +38,17 @@ FilterGrid::FilterGrid( const Config& conf ) // initialize histogram to zero, implicitly move to device popcuda_memset_sync( _histogram, 0, MAX_OCTAVES * _slots * sizeof(int) ); popcuda_memset_sync( _histogram_full, 0, _slots * sizeof(int) ); - } } FilterGrid::~FilterGrid( ) { - if( _slots > 1 ) - { - cudaFree( _histogram_eps ); - cudaFree( _histogram_full ); - cudaFree( _histogram ); - cudaFree( _sorting_index ); - cudaFree( _cells ); - cudaFree( _scales ); - cudaFree( _initial_extrema_pointers ); - } + popsift::cuda::free_mgd( _histogram_eps ); + popsift::cuda::free_mgd( _histogram_full ); + popsift::cuda::free_mgd( _histogram ); + popsift::cuda::free_mgd( _sorting_index ); + popsift::cuda::free_mgd( _cells ); + popsift::cuda::free_mgd( _scales ); + popsift::cuda::free_mgd( _initial_extrema_pointers ); } __global__ void @@ -96,14 +90,15 @@ void FilterGrid::init( ExtremaBuffers* buf, ExtremaCounters* ct ) _ct->make_extrema_prefix_sums(); + /* Allocate or reallocate memory to hold all indices */ if( _alloc_size < _ct->getTotalExtrema() ) { if( _alloc_size > 0 ) { - cudaFree( _sorting_index ); - cudaFree( _cells ); - cudaFree( _scales ); - cudaFree( _initial_extrema_pointers ); + popsift::cuda::free_mgd( _sorting_index ); + popsift::cuda::free_mgd( _cells ); + popsift::cuda::free_mgd( _scales ); + popsift::cuda::free_mgd( _initial_extrema_pointers ); } _alloc_size = _ct->getTotalExtrema(); _sorting_index = popsift::cuda::malloc_mgdT( _alloc_size, __FILE__, __LINE__); @@ -309,7 +304,7 @@ void FilterGrid::prune_extrema( GridFilterConfig::Mode mode ) /* Discard some extrema that exceed a conf.getFilterMaxExtrema(). * The caller should ensure that filtering is actually needed. */ __host__ -int FilterGrid::filter( const Config& conf, ExtremaCounters* ct, ExtremaBuffers* buf, int ext_total ) +int FilterGrid::filter( const Config& conf, ExtremaCounters* ct, ExtremaBuffers* buf ) { /* At this time, we have host-side information about ext_ct[o], the number * of extrema we have found in octave o, and we have summed it up on the diff --git a/src/popsift/filtergrid.h b/src/popsift/filtergrid.h index b4f987e0..405b4bcb 100755 --- a/src/popsift/filtergrid.h +++ b/src/popsift/filtergrid.h @@ -40,7 +40,7 @@ class FilterGrid ~FilterGrid( ); - int filter( const Config& conf, ExtremaCounters* ct, ExtremaBuffers* buf, int ext_total ); + int filter( const Config& conf, ExtremaCounters* ct, ExtremaBuffers* buf ); private: /** Initialize the FilterGrid structure with the current counters diff --git a/src/popsift/s_orientation.cu b/src/popsift/s_orientation.cu index dc1d3029..596bdbb3 100644 --- a/src/popsift/s_orientation.cu +++ b/src/popsift/s_orientation.cu @@ -330,7 +330,7 @@ void Pyramid::orientation( const Config& conf ) // therefore add 10% slack. if( conf.getFilterMaxExtrema() > 0 && int(conf.getFilterMaxExtrema()*1.1) < _ct->getTotalExtrema() ) { - _fg.filter( conf, _ct, _buf, _ct->getTotalExtrema() ); + _fg.filter( conf, _ct, _buf ); } reallocExtrema( _ct->getTotalExtrema() ); From eb927004e2d3d12bed370657e441356898e7db3f Mon Sep 17 00:00:00 2001 From: Carsten Griwodz Date: Fri, 1 Oct 2021 08:22:59 +0200 Subject: [PATCH 23/30] [filtergrid] generalize prefix sum of initial extrema count --- src/popsift/filtergrid.cu | 2 -- src/popsift/filtergrid.h | 1 - src/popsift/s_extrema.cu | 10 ++++++++++ src/popsift/s_orientation.cu | 4 ---- src/popsift/sift_pyramid.cu | 2 ++ src/popsift/sift_pyramid.h | 31 +++++++++++++++++++------------ 6 files changed, 31 insertions(+), 19 deletions(-) diff --git a/src/popsift/filtergrid.cu b/src/popsift/filtergrid.cu index 90f139f7..4e93ec5f 100755 --- a/src/popsift/filtergrid.cu +++ b/src/popsift/filtergrid.cu @@ -87,8 +87,6 @@ void FilterGrid::init( ExtremaBuffers* buf, ExtremaCounters* ct ) { _buf = buf; _ct = ct; - _ct->make_extrema_prefix_sums(); - /* Allocate or reallocate memory to hold all indices */ if( _alloc_size < _ct->getTotalExtrema() ) diff --git a/src/popsift/filtergrid.h b/src/popsift/filtergrid.h index 405b4bcb..e1b19255 100755 --- a/src/popsift/filtergrid.h +++ b/src/popsift/filtergrid.h @@ -45,7 +45,6 @@ class FilterGrid private: /** Initialize the FilterGrid structure with the current counters * and buffers. - * Compute the exclusive prefix sum of extrema per octave. */ void init( ExtremaBuffers* buf, ExtremaCounters* ct ); diff --git a/src/popsift/s_extrema.cu b/src/popsift/s_extrema.cu index 0e1ab18b..d79daf15 100644 --- a/src/popsift/s_extrema.cu +++ b/src/popsift/s_extrema.cu @@ -649,5 +649,15 @@ void Pyramid::find_extrema( const Config& conf ) } } +void Pyramid::finalize_extrema( ) +{ + /* Complete the extrema computation by computing the prefix sum of + * extrema counts for all octaves. + */ + readDescCountersFromDevice( ); + + _ct->make_extrema_prefix_sums(); +} + } // namespace popsift diff --git a/src/popsift/s_orientation.cu b/src/popsift/s_orientation.cu index 596bdbb3..401a897c 100644 --- a/src/popsift/s_orientation.cu +++ b/src/popsift/s_orientation.cu @@ -322,10 +322,6 @@ void ori_prefix_sum( ExtremaCounters* ct, ExtremaBuffers* buf ) __host__ void Pyramid::orientation( const Config& conf ) { - readDescCountersFromDevice( ); - - _ct->make_extrema_prefix_sums(); - // Filter functions are only called if necessary. They are very expensive, // therefore add 10% slack. if( conf.getFilterMaxExtrema() > 0 && int(conf.getFilterMaxExtrema()*1.1) < _ct->getTotalExtrema() ) diff --git a/src/popsift/sift_pyramid.cu b/src/popsift/sift_pyramid.cu index e8c6a6ce..9547028a 100644 --- a/src/popsift/sift_pyramid.cu +++ b/src/popsift/sift_pyramid.cu @@ -168,6 +168,8 @@ void Pyramid::step2( const Config& conf ) { find_extrema( conf ); + finalize_extrema( ); + orientation( conf ); descriptors( conf ); diff --git a/src/popsift/sift_pyramid.h b/src/popsift/sift_pyramid.h index 4751b814..24045114 100755 --- a/src/popsift/sift_pyramid.h +++ b/src/popsift/sift_pyramid.h @@ -32,9 +32,9 @@ struct ExtremaCounters private: /* Exclusive prefix sum of ext_ct */ - int ext_ps[MAX_OCTAVES+1]; + int ext_prefix_sum[MAX_OCTAVES+1]; /* Exclusive prefix sum of ori_ct */ - int ori_ps[MAX_OCTAVES+1]; + int ori_prefix_sum[MAX_OCTAVES+1]; public: /** host and device function helper function that updates the exclusive @@ -46,19 +46,25 @@ struct ExtremaCounters __device__ __host__ inline int make_extrema_prefix_sums( ) { - ext_ps[0] = 0; + ext_prefix_sum[0] = 0; for( int o=1; o<=MAX_OCTAVES; o++ ) { - ext_ps[o] = ext_ps[o-1] + ext_ct[o-1]; + ext_prefix_sum[o] = ext_prefix_sum[o-1] + ext_ct[o-1]; } - return ext_ps[MAX_OCTAVES]; + return ext_prefix_sum[MAX_OCTAVES]; } /** get total number of extrema */ __device__ __host__ inline int getTotalExtrema( ) const { - return ext_ps[MAX_OCTAVES]; + return ext_prefix_sum[MAX_OCTAVES]; + } + + __device__ __host__ inline + int getExtremaCount( int octave ) const + { + return ext_ct[octave]; } /** in a sorted array of extrema, get the base index for the entries @@ -66,26 +72,26 @@ struct ExtremaCounters __device__ __host__ inline int getExtremaBase( const int& octave ) const { - return ext_ps[octave]; + return ext_prefix_sum[octave]; } /** compute the prefix sum and total sum of orientation count per octave */ __device__ __host__ inline int make_orientation_prefix_sums( ) { - ori_ps[0] = 0; + ori_prefix_sum[0] = 0; for( int o=1; o<=MAX_OCTAVES; o++ ) { - ori_ps[o] = ori_ps[o-1] + ori_ct[o-1]; + ori_prefix_sum[o] = ori_prefix_sum[o-1] + ori_ct[o-1]; } - return ori_ps[MAX_OCTAVES]; + return ori_prefix_sum[MAX_OCTAVES]; } /** get total number of orientations */ __device__ __host__ inline int getTotalOrientations( ) const { - return ori_ps[MAX_OCTAVES]; + return ori_prefix_sum[MAX_OCTAVES]; } /** in a sorted array of orientations, get the base index for the entries @@ -93,7 +99,7 @@ struct ExtremaCounters __device__ __host__ inline int getOrientationBase( const int& octave ) const { - return ori_ps[octave]; + return ori_prefix_sum[octave]; } }; @@ -225,6 +231,7 @@ class Pyramid void reset_extrema_mgmt( ); void build_pyramid( const Config& conf, ImageBase* base ); void find_extrema( const Config& conf ); + void finalize_extrema( ); void reallocExtrema( int numExtrema ); void orientation( const Config& conf ); From 60def0ea522273ead72d1f65331a5067d9090f0d Mon Sep 17 00:00:00 2001 From: Carsten Griwodz Date: Fri, 1 Oct 2021 13:29:01 +0200 Subject: [PATCH 24/30] [filtergrid] detailed debugging added --- src/CMakeLists.txt | 1 + src/popsift/filtergrid.cu | 86 +++++++++- src/popsift/filtergrid_debug.cu | 278 ++++++++++++++++++++++++++++++++ src/popsift/filtergrid_debug.h | 32 ++++ 4 files changed, 389 insertions(+), 8 deletions(-) create mode 100755 src/popsift/filtergrid_debug.cu create mode 100755 src/popsift/filtergrid_debug.h diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index a6385fb8..a5391293 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -6,6 +6,7 @@ CUDA_ADD_LIBRARY(popsift popsift/sift_conf.cu popsift/sift_conf.h popsift/gauss_filter.cu popsift/gauss_filter.h popsift/filtergrid.cu popsift/filtergrid.h + popsift/filtergrid_debug.cu popsift/filtergrid_debug.h popsift/initial_extremum.h popsift/s_image.cu popsift/s_image.h popsift/sift_pyramid.cu popsift/sift_pyramid.h diff --git a/src/popsift/filtergrid.cu b/src/popsift/filtergrid.cu index 4e93ec5f..085032ca 100755 --- a/src/popsift/filtergrid.cu +++ b/src/popsift/filtergrid.cu @@ -7,6 +7,8 @@ */ #include "filtergrid.h" +#include "filtergrid_debug.h" + #include "sift_config.h" #include "sift_extremum.h" #include "sift_pyramid.h" @@ -88,17 +90,19 @@ void FilterGrid::init( ExtremaBuffers* buf, ExtremaCounters* ct ) _buf = buf; _ct = ct; + const int extrema_ct_total = _ct->getTotalExtrema(); + /* Allocate or reallocate memory to hold all indices */ - if( _alloc_size < _ct->getTotalExtrema() ) + if( _alloc_size < extrema_ct_total ) { if( _alloc_size > 0 ) { popsift::cuda::free_mgd( _sorting_index ); - popsift::cuda::free_mgd( _cells ); - popsift::cuda::free_mgd( _scales ); - popsift::cuda::free_mgd( _initial_extrema_pointers ); + popsift::cuda::free_mgd( _cells ); + popsift::cuda::free_mgd( _scales ); + popsift::cuda::free_mgd( _initial_extrema_pointers ); } - _alloc_size = _ct->getTotalExtrema(); + _alloc_size = extrema_ct_total; _sorting_index = popsift::cuda::malloc_mgdT( _alloc_size, __FILE__, __LINE__); _cells = popsift::cuda::malloc_mgdT( _alloc_size, __FILE__, __LINE__); _scales = popsift::cuda::malloc_mgdT( _alloc_size, __FILE__, __LINE__); @@ -109,10 +113,15 @@ void FilterGrid::init( ExtremaBuffers* buf, ExtremaCounters* ct ) for( int o=0; oext_ct[o] == 0 ) continue; + const int extrema_ct_in_octave = _ct->getExtremaCount(o); + const int extrema_base_in_octave = _ct->getExtremaBase(o); + + if( extrema_ct_in_octave == 0 ) continue; +std::cerr << " " << extrema_ct_in_octave << " extrema in octave " << o + << ", base offset " << extrema_base_in_octave << std::endl; dim3 block( 32 ); - dim3 grid( grid_divide( _ct->ext_ct[o], block.x ) ); + dim3 grid( grid_divide( extrema_ct_in_octave, block.x ) ); fg_init <<>> ( o, @@ -131,7 +140,7 @@ fg_countcells( const int ext_total, const int cell_histogram_size, int* cell_histogram ) { - // The size of this chared memory region is computed from _slots in the + // The size of this shared memory region is computed from _slots in the // calling host code. extern __shared__ int cellcounts[]; @@ -317,10 +326,71 @@ int FilterGrid::filter( const Config& conf, ExtremaCounters* ct, ExtremaBuffers* init( buf, ct ); count_cells( ); make_histogram_prefix_sum( ); + + debug_arrays( _sorting_index, _cells, _scales, _initial_extrema_pointers, _ct, + _slots, _histogram_full, _histogram_eps, + PrintHistogram, + "**********************************************\n" + "* Printing the histogram of cells *\n" + "**********************************************\n"); + debug_arrays( _sorting_index, _cells, _scales, _initial_extrema_pointers, _ct, + _slots, _histogram_full, _histogram_eps, + PrintUnsortedByOctave, + "**********************************************\n" + "* Printing all values sorted by their octave *\n" + "**********************************************\n"); + debug_arrays( _sorting_index, _cells, _scales, _initial_extrema_pointers, _ct, + _slots, _histogram_full, _histogram_eps, + PrintUnsortedFlat, + "****************************************************************************\n" + "* Printing all values flat and unsorted in the entire array before sorting *\n" + "****************************************************************************\n"); + debug_arrays( _sorting_index, _cells, _scales, _initial_extrema_pointers, _ct, + _slots, _histogram_full, _histogram_eps, + PrintSortedFlat, + "********************************************************************************\n" + "* Printing all values flat by sorting index in the entire array before sorting *\n" + "********************************************************************************\n"); + sort_by_cell_and_scale( ); + + debug_arrays( _sorting_index, _cells, _scales, _initial_extrema_pointers, _ct, + _slots, _histogram_full, _histogram_eps, + PrintSortedFlat, + "********************************************************************************\n" + "* Printing all values flat by sorting index in the entire array after sorting *\n" + "********************************************************************************\n"); + debug_arrays( _sorting_index, _cells, _scales, _initial_extrema_pointers, _ct, + _slots, _histogram_full, _histogram_eps, + PrintRest, + "********************************************************************************\n" + "* Printing all values flat by sorting index in the entire array after sorting *\n" + "********************************************************************************\n"); + level_histogram( max_extrema ); + + debug_arrays( _sorting_index, _cells, _scales, _initial_extrema_pointers, _ct, + _slots, _histogram_full, _histogram_eps, + PrintHistogram, + "**************************************************************\n" + "* Printing the histogram of cells after levelling *\n" + "**************************************************************\n"); + debug_arrays( _sorting_index, _cells, _scales, _initial_extrema_pointers, _ct, + _slots, _histogram_full, _histogram_eps, + PrintRest, + "********************************************************************************\n" + "* Printing all values flat by sorting index in the entire array after sorting *\n" + "********************************************************************************\n"); + prune_extrema( conf.getFilterSorting() ); + debug_arrays( _sorting_index, _cells, _scales, _initial_extrema_pointers, _ct, + _slots, _histogram_full, _histogram_eps, + PrintRest, + "********************************************************************************\n" + "* Printing all values flat by sorting index in the entire array after pruning *\n" + "********************************************************************************\n"); + for( int o=0; oext_ct[o]; diff --git a/src/popsift/filtergrid_debug.cu b/src/popsift/filtergrid_debug.cu new file mode 100755 index 00000000..abacdc31 --- /dev/null +++ b/src/popsift/filtergrid_debug.cu @@ -0,0 +1,278 @@ +/* + * Copyright 2021, University of Oslo + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ +#include "filtergrid_debug.h" + +#include +#include + +namespace popsift +{ + +void debug_arrays( int* sorting_index, int* cell_array, float* scale_array, InitialExtremum** ptr_array, + ExtremaCounters* ct, + int num_cells, int* samples_in_cell, int* samples_prefix_sum, + DebugArraysMode mode, + const std::string& intro ) +{ +std::cerr << " Enter " << __FUNCTION__ << std::endl; +std::cerr << intro << std::endl; + cudaDeviceSynchronize(); + + /***************************/ + /* printing cell histogram */ + /***************************/ + if( mode == PrintHistogram ) + { + int sum = 0; + for( int i=0; igetTotalExtrema(); + + /**************************/ + /* printing sorting index */ + /**************************/ + if( mode == PrintHistogram ) + { + } + else if( mode == PrintUnsortedByOctave ) + { + for( int o=0; ogetExtremaCount(o); + const int extrema_base_in_octave = ct->getExtremaBase(o); + + if( extrema_ct_in_octave == 0 ) continue; + + std::cerr << " Indexes in octave " << o << ": "; + for( int i=0; igetExtremaCount(o); + const int extrema_base_in_octave = ct->getExtremaBase(o); + + if( extrema_ct_in_octave == 0 ) continue; + + std::cerr << " cell values in octave " << o << ": "; + for( int i=0; igetExtremaCount(o); + const int extrema_base_in_octave = ct->getExtremaBase(o); + + if( extrema_ct_in_octave == 0 ) continue; + + std::cerr << " scale values in octave " << o << ": "; + for( int i=0; igetExtremaCount(o); + const int extrema_base_in_octave = ct->getExtremaBase(o); + + if( extrema_ct_in_octave == 0 ) continue; + + std::cerr << " ignore values in octave " << o << ": "; + for( int i=0; iisIgnored() << " "; + } + std::cerr << std::endl; + } + } + else if( mode == PrintUnsortedFlat ) + { + std::cerr << " All ignore values: "; + for( int i=0; iisIgnored() << " "; + } + std::cerr << std::endl; + } + else if( mode == PrintSortedFlat ) + { + std::cerr << " All ignore values by sorting index: "; + for( int i=0; iisIgnored() << " "; + } + std::cerr << std::endl + << std::endl; + } + else + { + for( int c=0; cisIgnored() << " "; + } + std::cerr << std::endl; + } + } +std::cerr << " Leave " << __FUNCTION__ << std::endl; +} + +}; // namespace popsift + diff --git a/src/popsift/filtergrid_debug.h b/src/popsift/filtergrid_debug.h new file mode 100755 index 00000000..f6d787f6 --- /dev/null +++ b/src/popsift/filtergrid_debug.h @@ -0,0 +1,32 @@ +/* + * Copyright 2021, University of Oslo + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ +#pragma once + +#include "sift_pyramid.h" + +#include + +namespace popsift +{ + +enum DebugArraysMode +{ + PrintHistogram, + PrintUnsortedByOctave, + PrintUnsortedFlat, + PrintSortedFlat, + PrintRest +}; + +void debug_arrays( int* sorting_index, int* cell_array, float* scale_array, InitialExtremum** ptr_array, + ExtremaCounters* ct, + int num_cells, int* samples_in_cell, int* samples_prefix_sum, + DebugArraysMode mode, + const std::string& intro ); +}; + From bb41846ae0b04802d25b9e8c5f2e8e184bab5fad Mon Sep 17 00:00:00 2001 From: Carsten Griwodz Date: Mon, 11 Oct 2021 09:36:21 +0200 Subject: [PATCH 25/30] [cuda] found the filtergrid bug --- src/popsift/filtergrid.cu | 37 ++++++-- src/popsift/filtergrid_debug.cu | 155 ++++++++++++++++++-------------- 2 files changed, 117 insertions(+), 75 deletions(-) diff --git a/src/popsift/filtergrid.cu b/src/popsift/filtergrid.cu index 085032ca..114a1d8b 100755 --- a/src/popsift/filtergrid.cu +++ b/src/popsift/filtergrid.cu @@ -117,7 +117,7 @@ void FilterGrid::init( ExtremaBuffers* buf, ExtremaCounters* ct ) const int extrema_base_in_octave = _ct->getExtremaBase(o); if( extrema_ct_in_octave == 0 ) continue; -std::cerr << " " << extrema_ct_in_octave << " extrema in octave " << o +std::cout << " " << extrema_ct_in_octave << " extrema in octave " << o << ", base offset " << extrema_base_in_octave << std::endl; dim3 block( 32 ); @@ -145,7 +145,13 @@ fg_countcells( const int ext_total, extern __shared__ int cellcounts[]; const int tid = threadIdx.x; - const int idx = blockIdx.x * blockDim.x + threadIdx.x; + const int idx = blockIdx.x * blockDim.x + tid; + + for( int c=tid; c= ext_total ) return; @@ -162,12 +168,16 @@ fg_countcells( const int ext_total, void FilterGrid::count_cells( ) { + const int total_extrema_count = _ct->getTotalExtrema(); + + std::cout << "Counting cells for a total of " << total_extrema_count << " extrema" << std::endl; + dim3 block( 32 ); - dim3 grid( grid_divide( _ct->getTotalExtrema(), block.x ) ); + dim3 grid( grid_divide( total_extrema_count, block.x ) ); fg_countcells <<>> - ( _ct->getTotalExtrema(), + ( total_extrema_count, _cells, _slots, _histogram_full ); @@ -379,11 +389,17 @@ int FilterGrid::filter( const Config& conf, ExtremaCounters* ct, ExtremaBuffers* _slots, _histogram_full, _histogram_eps, PrintRest, "********************************************************************************\n" - "* Printing all values flat by sorting index in the entire array after sorting *\n" + "* Printing all values flat by sorting index in the entire array after leveling *\n" "********************************************************************************\n"); prune_extrema( conf.getFilterSorting() ); + debug_arrays( _sorting_index, _cells, _scales, _initial_extrema_pointers, _ct, + _slots, _histogram_full, _histogram_eps, + PrintHistogram, + "**************************************************************\n" + "* Printing the histogram of cells after pruning *\n" + "**************************************************************\n"); debug_arrays( _sorting_index, _cells, _scales, _initial_extrema_pointers, _ct, _slots, _histogram_full, _histogram_eps, PrintRest, @@ -391,9 +407,12 @@ int FilterGrid::filter( const Config& conf, ExtremaCounters* ct, ExtremaBuffers* "* Printing all values flat by sorting index in the entire array after pruning *\n" "********************************************************************************\n"); + int debug_sum = 0; for( int o=0; oext_ct[o]; + // const int max_ct = ct->ext_ct[o]; + const int max_ct = ct->getExtremaCount(o); + int counter = 0; for( int i=0; iext_ct[o] = counter; } + std::cout << "The total number of initial extrema not ignored is " << debug_sum << std::endl; ct->make_extrema_prefix_sums(); diff --git a/src/popsift/filtergrid_debug.cu b/src/popsift/filtergrid_debug.cu index abacdc31..f08255d7 100755 --- a/src/popsift/filtergrid_debug.cu +++ b/src/popsift/filtergrid_debug.cu @@ -19,8 +19,8 @@ void debug_arrays( int* sorting_index, int* cell_array, float* scale_array, Init DebugArraysMode mode, const std::string& intro ) { -std::cerr << " Enter " << __FUNCTION__ << std::endl; -std::cerr << intro << std::endl; +std::cout << " Enter " << __FUNCTION__ << std::endl; +std::cout << intro << std::endl; cudaDeviceSynchronize(); /***************************/ @@ -28,19 +28,33 @@ std::cerr << intro << std::endl; /***************************/ if( mode == PrintHistogram ) { - int sum = 0; + int sum_extrema_in_octave = 0; + for( int o=0; ogetExtremaCount(o); + if( count != 0 ) + { + std::cout << " Extrema in octave " << o << ":" + << " base=" << ct->getExtremaBase(o) + << " count=" << count << std::endl; + sum_extrema_in_octave += count; + } + } + std::cout << " Extrema in image " << sum_extrema_in_octave << std::endl << std::endl; + + int sum_samples_in_cells = 0; for( int i=0; igetTotalExtrema(); @@ -60,29 +74,29 @@ std::cerr << intro << std::endl; if( extrema_ct_in_octave == 0 ) continue; - std::cerr << " Indexes in octave " << o << ": "; + std::cout << " Indexes in octave " << o << ": "; for( int i=0; iisIgnored() << " "; + if( i % 30 == 0 ) std::cout << std::endl << " "; + std::cout << ptr_array[extrema_base_in_octave + i]->isIgnored() << " "; } - std::cerr << std::endl; + std::cout << std::endl; } } else if( mode == PrintUnsortedFlat ) { - std::cerr << " All ignore values: "; + std::cout << " All ignore values: "; for( int i=0; iisIgnored() << " "; + if( i % 30 == 0 ) std::cout << std::endl << " "; + std::cout << ptr_array[i]->isIgnored() << " "; } - std::cerr << std::endl; + std::cout << std::endl; } else if( mode == PrintSortedFlat ) { - std::cerr << " All ignore values by sorting index: "; + std::cout << " All ignore values by sorting index: "; for( int i=0; iisIgnored() << " "; + if( i % 30 == 0 ) std::cout << std::endl << " "; + std::cout << ptr_array[sorting_index[i]]->isIgnored() << " "; } - std::cerr << std::endl + std::cout << std::endl << std::endl; } else { for( int c=0; cisIgnored() << " "; + if( i % 30 == 0 ) std::cout << std::endl << " "; + std::cout << ptr_array[sorting_index[samples_prefix_sum[c] + i]]->isIgnored() << " "; } - std::cerr << std::endl; + std::cout << std::endl; } } -std::cerr << " Leave " << __FUNCTION__ << std::endl; +std::cout << " Leave " << __FUNCTION__ << std::endl; } }; // namespace popsift From 752e88b9ea7f69f4a848b7a8d5c45a74f9a668ad Mon Sep 17 00:00:00 2001 From: Carsten Griwodz Date: Mon, 11 Oct 2021 09:38:23 +0200 Subject: [PATCH 26/30] [cuda] remove the filtergrid debug output --- src/CMakeLists.txt | 1 - src/popsift/filtergrid.cu | 78 --------- src/popsift/filtergrid_debug.cu | 295 -------------------------------- src/popsift/filtergrid_debug.h | 32 ---- 4 files changed, 406 deletions(-) delete mode 100755 src/popsift/filtergrid_debug.cu delete mode 100755 src/popsift/filtergrid_debug.h diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index a5391293..a6385fb8 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -6,7 +6,6 @@ CUDA_ADD_LIBRARY(popsift popsift/sift_conf.cu popsift/sift_conf.h popsift/gauss_filter.cu popsift/gauss_filter.h popsift/filtergrid.cu popsift/filtergrid.h - popsift/filtergrid_debug.cu popsift/filtergrid_debug.h popsift/initial_extremum.h popsift/s_image.cu popsift/s_image.h popsift/sift_pyramid.cu popsift/sift_pyramid.h diff --git a/src/popsift/filtergrid.cu b/src/popsift/filtergrid.cu index 114a1d8b..c8ed5e1b 100755 --- a/src/popsift/filtergrid.cu +++ b/src/popsift/filtergrid.cu @@ -7,8 +7,6 @@ */ #include "filtergrid.h" -#include "filtergrid_debug.h" - #include "sift_config.h" #include "sift_extremum.h" #include "sift_pyramid.h" @@ -117,8 +115,6 @@ void FilterGrid::init( ExtremaBuffers* buf, ExtremaCounters* ct ) const int extrema_base_in_octave = _ct->getExtremaBase(o); if( extrema_ct_in_octave == 0 ) continue; -std::cout << " " << extrema_ct_in_octave << " extrema in octave " << o - << ", base offset " << extrema_base_in_octave << std::endl; dim3 block( 32 ); dim3 grid( grid_divide( extrema_ct_in_octave, block.x ) ); @@ -170,8 +166,6 @@ void FilterGrid::count_cells( ) { const int total_extrema_count = _ct->getTotalExtrema(); - std::cout << "Counting cells for a total of " << total_extrema_count << " extrema" << std::endl; - dim3 block( 32 ); dim3 grid( grid_divide( total_extrema_count, block.x ) ); @@ -337,80 +331,14 @@ int FilterGrid::filter( const Config& conf, ExtremaCounters* ct, ExtremaBuffers* count_cells( ); make_histogram_prefix_sum( ); - debug_arrays( _sorting_index, _cells, _scales, _initial_extrema_pointers, _ct, - _slots, _histogram_full, _histogram_eps, - PrintHistogram, - "**********************************************\n" - "* Printing the histogram of cells *\n" - "**********************************************\n"); - debug_arrays( _sorting_index, _cells, _scales, _initial_extrema_pointers, _ct, - _slots, _histogram_full, _histogram_eps, - PrintUnsortedByOctave, - "**********************************************\n" - "* Printing all values sorted by their octave *\n" - "**********************************************\n"); - debug_arrays( _sorting_index, _cells, _scales, _initial_extrema_pointers, _ct, - _slots, _histogram_full, _histogram_eps, - PrintUnsortedFlat, - "****************************************************************************\n" - "* Printing all values flat and unsorted in the entire array before sorting *\n" - "****************************************************************************\n"); - debug_arrays( _sorting_index, _cells, _scales, _initial_extrema_pointers, _ct, - _slots, _histogram_full, _histogram_eps, - PrintSortedFlat, - "********************************************************************************\n" - "* Printing all values flat by sorting index in the entire array before sorting *\n" - "********************************************************************************\n"); - sort_by_cell_and_scale( ); - debug_arrays( _sorting_index, _cells, _scales, _initial_extrema_pointers, _ct, - _slots, _histogram_full, _histogram_eps, - PrintSortedFlat, - "********************************************************************************\n" - "* Printing all values flat by sorting index in the entire array after sorting *\n" - "********************************************************************************\n"); - debug_arrays( _sorting_index, _cells, _scales, _initial_extrema_pointers, _ct, - _slots, _histogram_full, _histogram_eps, - PrintRest, - "********************************************************************************\n" - "* Printing all values flat by sorting index in the entire array after sorting *\n" - "********************************************************************************\n"); - level_histogram( max_extrema ); - debug_arrays( _sorting_index, _cells, _scales, _initial_extrema_pointers, _ct, - _slots, _histogram_full, _histogram_eps, - PrintHistogram, - "**************************************************************\n" - "* Printing the histogram of cells after levelling *\n" - "**************************************************************\n"); - debug_arrays( _sorting_index, _cells, _scales, _initial_extrema_pointers, _ct, - _slots, _histogram_full, _histogram_eps, - PrintRest, - "********************************************************************************\n" - "* Printing all values flat by sorting index in the entire array after leveling *\n" - "********************************************************************************\n"); - prune_extrema( conf.getFilterSorting() ); - debug_arrays( _sorting_index, _cells, _scales, _initial_extrema_pointers, _ct, - _slots, _histogram_full, _histogram_eps, - PrintHistogram, - "**************************************************************\n" - "* Printing the histogram of cells after pruning *\n" - "**************************************************************\n"); - debug_arrays( _sorting_index, _cells, _scales, _initial_extrema_pointers, _ct, - _slots, _histogram_full, _histogram_eps, - PrintRest, - "********************************************************************************\n" - "* Printing all values flat by sorting index in the entire array after pruning *\n" - "********************************************************************************\n"); - - int debug_sum = 0; for( int o=0; oext_ct[o]; const int max_ct = ct->getExtremaCount(o); int counter = 0; @@ -423,14 +351,8 @@ int FilterGrid::filter( const Config& conf, ExtremaCounters* ct, ExtremaBuffers* counter += 1; } } - if( counter != 0 ) - { - std::cout << "The number of initial extrema not ignored in octave " << o << " is " << counter << std::endl; - debug_sum += counter; - } ct->ext_ct[o] = counter; } - std::cout << "The total number of initial extrema not ignored is " << debug_sum << std::endl; ct->make_extrema_prefix_sums(); diff --git a/src/popsift/filtergrid_debug.cu b/src/popsift/filtergrid_debug.cu deleted file mode 100755 index f08255d7..00000000 --- a/src/popsift/filtergrid_debug.cu +++ /dev/null @@ -1,295 +0,0 @@ -/* - * Copyright 2021, University of Oslo - * - * This Source Code Form is subject to the terms of the Mozilla Public - * License, v. 2.0. If a copy of the MPL was not distributed with this - * file, You can obtain one at http://mozilla.org/MPL/2.0/. - */ -#include "filtergrid_debug.h" - -#include -#include - -namespace popsift -{ - -void debug_arrays( int* sorting_index, int* cell_array, float* scale_array, InitialExtremum** ptr_array, - ExtremaCounters* ct, - int num_cells, int* samples_in_cell, int* samples_prefix_sum, - DebugArraysMode mode, - const std::string& intro ) -{ -std::cout << " Enter " << __FUNCTION__ << std::endl; -std::cout << intro << std::endl; - cudaDeviceSynchronize(); - - /***************************/ - /* printing cell histogram */ - /***************************/ - if( mode == PrintHistogram ) - { - int sum_extrema_in_octave = 0; - for( int o=0; ogetExtremaCount(o); - if( count != 0 ) - { - std::cout << " Extrema in octave " << o << ":" - << " base=" << ct->getExtremaBase(o) - << " count=" << count << std::endl; - sum_extrema_in_octave += count; - } - } - std::cout << " Extrema in image " << sum_extrema_in_octave << std::endl << std::endl; - - int sum_samples_in_cells = 0; - for( int i=0; igetTotalExtrema(); - - /**************************/ - /* printing sorting index */ - /**************************/ - if( mode == PrintHistogram ) - { - } - else if( mode == PrintUnsortedByOctave ) - { - for( int o=0; ogetExtremaCount(o); - const int extrema_base_in_octave = ct->getExtremaBase(o); - - if( extrema_ct_in_octave == 0 ) continue; - - std::cout << " Indexes in octave " << o << ": "; - for( int i=0; igetExtremaCount(o); - const int extrema_base_in_octave = ct->getExtremaBase(o); - - if( extrema_ct_in_octave == 0 ) continue; - - std::cout << " cell values in octave " << o << ": "; - for( int i=0; igetExtremaCount(o); - const int extrema_base_in_octave = ct->getExtremaBase(o); - - if( extrema_ct_in_octave == 0 ) continue; - - std::cout << " scale values in octave " << o << ": "; - for( int i=0; igetExtremaCount(o); - const int extrema_base_in_octave = ct->getExtremaBase(o); - - if( extrema_ct_in_octave == 0 ) continue; - - std::cout << " ignore values in octave " << o << ": "; - for( int i=0; iisIgnored() << " "; - } - std::cout << std::endl; - } - } - else if( mode == PrintUnsortedFlat ) - { - std::cout << " All ignore values: "; - for( int i=0; iisIgnored() << " "; - } - std::cout << std::endl; - } - else if( mode == PrintSortedFlat ) - { - std::cout << " All ignore values by sorting index: "; - for( int i=0; iisIgnored() << " "; - } - std::cout << std::endl - << std::endl; - } - else - { - for( int c=0; cisIgnored() << " "; - } - std::cout << std::endl; - } - } -std::cout << " Leave " << __FUNCTION__ << std::endl; -} - -}; // namespace popsift - diff --git a/src/popsift/filtergrid_debug.h b/src/popsift/filtergrid_debug.h deleted file mode 100755 index f6d787f6..00000000 --- a/src/popsift/filtergrid_debug.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright 2021, University of Oslo - * - * This Source Code Form is subject to the terms of the Mozilla Public - * License, v. 2.0. If a copy of the MPL was not distributed with this - * file, You can obtain one at http://mozilla.org/MPL/2.0/. - */ -#pragma once - -#include "sift_pyramid.h" - -#include - -namespace popsift -{ - -enum DebugArraysMode -{ - PrintHistogram, - PrintUnsortedByOctave, - PrintUnsortedFlat, - PrintSortedFlat, - PrintRest -}; - -void debug_arrays( int* sorting_index, int* cell_array, float* scale_array, InitialExtremum** ptr_array, - ExtremaCounters* ct, - int num_cells, int* samples_in_cell, int* samples_prefix_sum, - DebugArraysMode mode, - const std::string& intro ); -}; - From b804c055ae8bf1c037de5268aa4858dbf8ac831f Mon Sep 17 00:00:00 2001 From: Carsten Griwodz Date: Mon, 11 Oct 2021 10:22:11 +0200 Subject: [PATCH 27/30] [thrust] use thrust by default in filtergrid --- src/popsift/filtergrid.cu | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/src/popsift/filtergrid.cu b/src/popsift/filtergrid.cu index c8ed5e1b..a695d48f 100755 --- a/src/popsift/filtergrid.cu +++ b/src/popsift/filtergrid.cu @@ -15,9 +15,16 @@ #if ! POPSIFT_IS_DEFINED(POPSIFT_DISABLE_GRID_FILTER) +#include + +#define PREFER_CPU 0 + +#if PREFER_CPU == 0 +#include +#include #include #include -#include +#endif namespace popsift { @@ -180,6 +187,15 @@ void FilterGrid::count_cells( ) void FilterGrid::make_histogram_prefix_sum( ) { // We could to this on the device ! +#if PREFER_CPU == 0 + popcuda_memset_sync( _histogram_eps, 0, (_slots+1) * sizeof(int) ); + + thrust::device_ptr src_begin = thrust::device_pointer_cast( &_histogram_full[0] ); + thrust::device_ptr src_end = thrust::device_pointer_cast( &_histogram_full[_slots] ); + thrust::device_ptr dst_begin = thrust::device_pointer_cast( &_histogram_eps[1] ); + + thrust::inclusive_scan( src_begin, src_end, dst_begin ); +#else cudaDeviceSynchronize(); _histogram_eps[0] = 0; @@ -188,6 +204,7 @@ void FilterGrid::make_histogram_prefix_sum( ) { _histogram_eps[i] = _histogram_eps[i-1] + _histogram_full[i-1]; } +#endif } @@ -220,13 +237,13 @@ void FilterGrid::sort_by_cell_and_scale( ) * and never know the difference. */ CellScaleCompare tc( _cells, _scales ); -#if 1 +#if PREFER_CPU == 0 + thrust::device_ptr ptr = thrust::device_pointer_cast( _sorting_index ); + thrust::sort( ptr, ptr + _ct->getTotalExtrema(), tc ); +#else cudaDeviceSynchronize(); int* ptr = _sorting_index; std::sort( ptr, ptr + _ct->getTotalExtrema(), tc ); -#else - thrust::device_ptr ptr = thrust::device_pointer_cast( _sorting_index ); - thrust::sort( ptr, ptr + _ct->getTotalExtrema(), tc ); #endif } From ed12e2c919a7d7be463cc801dea0db79eb0f92d9 Mon Sep 17 00:00:00 2001 From: Carsten Griwodz Date: Mon, 11 Oct 2021 10:31:15 +0200 Subject: [PATCH 28/30] [thrust] add include apparently required by some --- src/popsift/filtergrid.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/src/popsift/filtergrid.cu b/src/popsift/filtergrid.cu index a695d48f..f74ea8b5 100755 --- a/src/popsift/filtergrid.cu +++ b/src/popsift/filtergrid.cu @@ -23,6 +23,7 @@ #include #include #include +#include #include #endif From f9ba2749d8b6f498d00106c90e1177f3bd4c3e0d Mon Sep 17 00:00:00 2001 From: Simone Gasparini Date: Thu, 28 Jul 2022 16:49:49 +0200 Subject: [PATCH 29/30] [doc] update changelog --- CHANGES.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGES.md b/CHANGES.md index 4a5ee214..dff76886 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -25,10 +25,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Support for Boost 1.70 [PR](https://github.com/alicevision/popsift/pull/65) - Support for device selection and multiple GPUs [PR](https://github.com/alicevision/popsift/pull/121) - Test: adding descriptor comparator [PR](https://github.com/alicevision/popsift/pull/115) +- Support descriptor computation compliant with VLFeat [PR](https://github.com/alicevision/popsift/pull/133) +- Support for ignoring Edge and Peak Thresholds [PR](https://github.com/alicevision/popsift/pull/133) +- Support for grid filtering on CUDA Managed Memory and sorting initial extrema on the CPU side [PR](https://github.com/alicevision/popsift/pull/133) ### Fixed - CMake: fixes to allow building on Windows using vcpkg [PR](https://github.com/alicevision/popsift/pull/92) - Fix race condition [PR](https://github.com/alicevision/popsift/pull/82) +- Fixed a bug in L2 normalization [PR](https://github.com/alicevision/popsift/pull/133) ### Changed - Improved resource releasing [PR](https://github.com/alicevision/popsift/pull/71) From 604295a2d9b049d30a6aaf225ec2ef1eb386b790 Mon Sep 17 00:00:00 2001 From: Simone Gasparini Date: Thu, 28 Jul 2022 16:52:15 +0200 Subject: [PATCH 30/30] [doc] fix changelog --- CHANGES.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGES.md b/CHANGES.md index dff76886..1ec43666 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -24,7 +24,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Support for Cuda CC 7 cards (RTX 2080) [PR](https://github.com/alicevision/popsift/pull/67) - Support for Boost 1.70 [PR](https://github.com/alicevision/popsift/pull/65) - Support for device selection and multiple GPUs [PR](https://github.com/alicevision/popsift/pull/121) -- Test: adding descriptor comparator [PR](https://github.com/alicevision/popsift/pull/115) +- Test: adding descriptor comparator [PR](https://github.com/alicevision/popsift/pull/133) - Support descriptor computation compliant with VLFeat [PR](https://github.com/alicevision/popsift/pull/133) - Support for ignoring Edge and Peak Thresholds [PR](https://github.com/alicevision/popsift/pull/133) - Support for grid filtering on CUDA Managed Memory and sorting initial extrema on the CPU side [PR](https://github.com/alicevision/popsift/pull/133)