Skip to content
Permalink
Browse files

Huge speed increase by converting findAudioDuplicate algorithm to C++

Add a FingerprintManager class in C++ that has an addSongAndCompare
method, accepting a songID, a fingerprint and a cancellation similarity
threshold. This method iterates over all songs already added to the
object's internal map/dictionary, comparing each of them to the newly
added song. Finally, the song is added to the internal map and a list of
tuples is returned, where each tuple describes a matching song (songID,
offset, similarity).

This reduces the number of iterations/searches over the dictionary
of songs.

As a side effect, this fixes a problem with the original implementation
when two songs had exactly the same fingerprint, since the dictionary
was indexed by fingerprint, so only one of such songs was compared, thus
ignoring positive song matches with similarity = 1.0 .

Also, songs are added to the internal dictionary with a prepadding
of m_maxoffset zeros, so there's no need to manipulate data for each
comparison. In addition, the compareChromaprintFingerprintsAndOffset
method uses iterators to traverse the fingerprints (which is much
faster than index based accesses) and uses gcc's __builtin_popcount
since FingerprintManager already uses regular int values for
fingerprints.

Commit 8dffcdf decreased the time of a test run over 4759 songs from
215 seconds to 126 seconds; this commit speeds it up much further, so
the same test run now takes 1.7 seconds.

Note: bard_ext.cpp uses C++17 and boost::python.

>>> p.print_stats()
Mon Aug  7 14:17:39 2017    profile_bard_cpp

         242517 function calls (235919 primitive calls) in 1.987 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
    424/1    0.007    0.000    1.988    1.988 {built-in method builtins.exec}
        1    0.000    0.000    1.988    1.988 /usr/bin/bard:2(<module>)
        1    0.000    0.000    1.788    1.788 /usr/lib64/python3.6/site-packages/bard/bard.py:905(main)
        1    0.040    0.040    1.788    1.788 /usr/lib64/python3.6/site-packages/bard/bard.py:735(parseCommandLine)
        1    0.898    0.898    1.742    1.742 /usr/lib64/python3.6/site-packages/bard/bard.py:585(findAudioDuplicates2)
       55    0.643    0.012    0.643    0.012 {method 'execute' of 'sqlite3.Cursor' objects}
    451/2    0.001    0.000    0.200    0.100 <frozen importlib._bootstrap>:958(_find_and_load)
    451/2    0.001    0.000    0.200    0.100 <frozen importlib._bootstrap>:931(_find_and_load_unlocked)
    575/3    0.000    0.000    0.200    0.067 <frozen importlib._bootstrap>:197(_call_with_frames_removed)
    379/1    0.000    0.000    0.200    0.200 {built-in method builtins.__import__}
    438/2    0.001    0.000    0.199    0.099 <frozen importlib._bootstrap>:641(_load_unlocked)
    392/1    0.001    0.000    0.198    0.198 <frozen importlib._bootstrap_external>:672(exec_module)

Compare to the previous times:

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
    424/1    0.007    0.000  127.292  127.292 {built-in method builtins.exec}
        1    0.000    0.000  127.292  127.292 /usr/bin/bard:2(<module>)
        1    0.000    0.000  127.099  127.099 /home/antonio/git/antlarr/bard/bard/bard.py:847(main)
        1    0.128    0.128  127.099  127.099 /home/antonio/git/antlarr/bard/bard/bard.py:677(parseCommandLine)
        1    0.084    0.084  126.966  126.966 /home/antonio/git/antlarr/bard/bard/bard.py:509(findAudioDuplicates)
     3578    0.300    0.000  123.672    0.035 /home/antonio/git/antlarr/bard/bard/bard.py:105(compareChromaprintFingerprintsAndOffset2)
   354222   98.899    0.000  123.290    0.000 /home/antonio/git/antlarr/bard/bard/bard.py:38(compareChromaprintFingerprints)
299543702   24.271    0.000   24.271    0.000 {built-in method gmpy.popcount}
     3616    1.857    0.001    1.857    0.001 /home/antonio/git/antlarr/bard/bard/bard.py:526(<listcomp>)
  • Loading branch information...
antlarr committed Aug 7, 2017
1 parent 3ff36ae commit 1e9efbe8e054b4610f3f05f890fc286c9c086b2d
Showing with 301 additions and 2 deletions.
  1. +67 −1 bard/bard.py
  2. +222 −0 bard/bard_ext.cpp
  3. +12 −1 setup.py
@@ -582,6 +582,72 @@ def findAudioDuplicates(self, from_song_id=0):
decodedFPs[fingerprint] = dfp
info[songID] = (sha256sum, audioSha256sum, path, completeness)

def findAudioDuplicates2(self, from_song_id=None):
    """Find duplicated songs using the C++ FingerprintManager.

    Every song returned by the query is decoded and handed to a
    FingerprintManager configured with a maximum offset of 100.
    Songs whose id is lower than from_song_id are only stored; the
    rest are stored and compared against all previously stored songs.
    Each reported match is printed and saved with
    MusicDatabase.addSongsSimilarity, and matches at or above the
    match threshold (0.8) are additionally classified by checksum.

    from_song_id -- first song id to run comparisons for; a falsy
                    value (None, 0) compares every song.
    """
    cursor = MusicDatabase.conn.cursor()
    songInfo = {}
    matchThreshold = 0.8
    storeThreshold = 0.56
    from_song_id = from_song_id or 0
    from bard.bard_ext import FingerprintManager
    manager = FingerprintManager()
    manager.setMaxOffset(100)

    query = ('SELECT id, fingerprint, sha256sum, audio_sha256sum, path, '
             'completeness FROM fingerprints, songs, checksums, '
             'properties where songs.id=fingerprints.song_id and '
             'songs.id = checksums.song_id and '
             'songs.id = properties.song_id order by id')

    for row in cursor.execute(query):
        (songID, fingerprint, sha256sum, audioSha256sum, path,
         completeness) = row
        decoded = chromaprint.decode_fingerprint(fingerprint)[0]
        if not decoded:
            print("Error calculating fingerprint of song %s (%s)" %
                  (songID, path))
            continue

        if songID < from_song_id:
            # Older songs are only registered so later ones can be
            # compared against them.
            manager.addSong(songID, decoded)
            matches = []
        else:
            matches = manager.addSongAndCompare(songID, decoded,
                                                storeThreshold)

        for (otherID, offset, similarity) in matches:
            print('******** %d %d %d %f' % (otherID, songID,
                                            offset, similarity))
            MusicDatabase.addSongsSimilarity(otherID, songID,
                                             offset, similarity)

            if similarity >= matchThreshold:
                (otherSha256sum, otherAudioSha256sum, otherPath,
                 otherCompleteness) = songInfo[otherID]
                if sha256sum == otherSha256sum:
                    msg = ('Exactly the same files (sha256 = %s)' %
                           sha256sum)
                    print('Duplicate songs found: %s\n'
                          '%s\n and %s' % (msg, otherPath, path))
                elif audioSha256sum == otherAudioSha256sum:
                    msg = ('Same audio track with different tags '
                           '(completeness: %d <-> %d)' %
                           (otherCompleteness, completeness))
                    print('Duplicate songs found: %s\n'
                          '%s\n and %s' % (msg, otherPath, path))
                else:
                    # Merely similar songs are stored above but not
                    # reported on the console.
                    msg = 'Similarity %f' % similarity

        # Persist the similarities stored for this song, if any.
        if matches:
            MusicDatabase.commit()

        songInfo[songID] = (sha256sum, audioSha256sum, path, completeness)

def getSongsFromIDorPath(self, id_or_path):
try:
songID = int(id_or_path)
@@ -820,7 +886,7 @@ def parseCommandLine(self):
elif options.command == 'check-checksums':
self.checkChecksums(options.from_song_id)
elif options.command == 'find-audio-duplicates':
self.findAudioDuplicates(options.from_song_id)
self.findAudioDuplicates2(options.from_song_id)
elif options.command == 'compare-songs':
self.compareSongIDsOrPaths(options.song1, options.song2)
elif options.command == 'compare-files':
@@ -0,0 +1,222 @@
/*
This file is part of Bard.
Bard is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
*/

#include <boost/python/module.hpp>
#include <boost/python/def.hpp>
#include <boost/python/args.hpp>
#include <boost/python/extract.hpp>
#include <boost/python/stl_iterator.hpp>
#include <boost/python/list.hpp>
#include <boost/python/tuple.hpp>
#include <boost/python/class.hpp>
#include <algorithm>
#include <iostream>
#include <map>
#include <optional>
#include <stdexcept>
#include <utility>
#include <vector>

// Copies the elements of a Python iterable into a std::vector<T>,
// converting each element to T through Boost.Python's input iterator.
template<typename T>
inline std::vector<T> to_std_vector(const boost::python::object& iterable)
{
    using ElementIterator = boost::python::stl_input_iterator<T>;

    ElementIterator first(iterable);
    ElementIterator last;  // a default-constructed iterator marks the end
    return std::vector<T>(first, last);
}

// Test helper: converts the Python list to a vector of T and returns its
// first element.
//
// Throws std::out_of_range when the list is empty instead of reading past
// the end of an empty vector.
template<typename T>
long greet(boost::python::list &a)
{
    // Honour the template parameter rather than always converting to int.
    const auto values = to_std_vector<T>(a);
    return values.at(0);
}

// Test helper: returns the tuple (popcount of the XOR of two fixed
// integers, 2, 3).
boost::python::tuple greet2()
{
    const int lhs = -1442137496;
    const int rhs = 1786980374;
    const int differing_bits = __builtin_popcount(lhs ^ rhs);
    return boost::python::make_tuple(differing_bits, 2, 3);
}

// Keeps the fingerprints of all added songs in memory and compares newly
// added songs against them.
class FingerprintManager
{
public:
// Creates an empty manager; the constructor sets the maximum offset to 50.
FingerprintManager();
// Sets the maximum offset; this value is also the number of zeros
// prepended to every fingerprint that is added afterwards.
void setMaxOffset(int maxoffset);
// Returns the current maximum offset.
int maxOffset() const;

// Stores the fingerprint of songID (prepadded with maxOffset() zeros)
// without comparing it to anything.
void addSong(long songID, boost::python::list &fingerprint);
// Compares the fingerprint with every stored song, stores it, and returns
// a list of (songID, offset, similarity) tuples for the stored songs whose
// similarity is greater than cancelThreshold.
boost::python::list addSongAndCompare(long songID, boost::python::list &fingerprint, double cancelThreshold=0.55);
// Currently only returns the first stored value of songID1's fingerprint;
// songID2, maxoffset and cancelThreshold are not used by the definition.
long compareSongs(long songID1, long songID2, int maxoffset=50, double cancelThreshold=0.55);
// Returns the (offset, similarity) pair of the best alignment found between
// two prepadded fingerprints, or (-1, -1) when no alignment was accepted.
// NOTE(review): both fingerprints are passed by value, so each call copies them.
std::pair<int, double> compareChromaprintFingerprintsAndOffset(std::vector<int> fp1, std::vector<int> fp2, double cancelThreshold) const;

private:
// Maximum offset tried in each direction and length of the zero prepadding.
int m_maxoffset;
// Prepadded fingerprints indexed by song ID.
// NOTE(review): the key type is int while the public methods take long IDs.
std::map<int, std::vector<int> > m_fingerprints;
};

FingerprintManager::FingerprintManager(): m_maxoffset(50)
{
}


void FingerprintManager::setMaxOffset(int maxoffset)
{
m_maxoffset = maxoffset;
}

int FingerprintManager::maxOffset() const
{
return m_maxoffset;
}

// Stores the fingerprint of songID without comparing it to any other song.
//
// The fingerprint is prepadded with m_maxoffset zeros, so comparisons can
// shift it without building temporary vectors. An existing entry for the
// same songID is replaced.
void FingerprintManager::addSong(long songID, boost::python::list &fingerprint)
{
    auto padded = to_std_vector<int>(fingerprint);
    padded.insert(padded.begin(), m_maxoffset, 0);
    // Move the local vector into the map instead of copying it.
    m_fingerprints[songID] = std::move(padded);
}

// Compares the fingerprint of songID with every song already stored and
// then stores it.
//
// Returns a Python list with one (songID, offset, similarity) tuple for
// each stored song whose similarity is greater than cancelThreshold.
// The new song is inserted only after the comparisons, so it is compared
// against an older entry with the same songID (if any) before replacing it.
boost::python::list FingerprintManager::addSongAndCompare(long songID, boost::python::list &fingerprint, double cancelThreshold)
{
    boost::python::list result;

    // Same prepadding as addSong().
    auto padded = to_std_vector<int>(fingerprint);
    padded.insert(padded.begin(), m_maxoffset, 0);

    for (const auto & [storedSongID, storedFingerprint]: m_fingerprints)
    {
        const auto [offset, similarity] = compareChromaprintFingerprintsAndOffset(storedFingerprint, padded, cancelThreshold);
        if (similarity > cancelThreshold)
            result.append(boost::python::make_tuple(storedSongID, offset, similarity));
    }

    // Move the local vector into the map instead of copying it.
    m_fingerprints[songID] = std::move(padded);
    return result;
}

namespace
{

// Scores a single alignment of two fingerprints.
//
// Walks both ranges in lockstep until either one ends, counting the bits
// that are equal in each pair of 32-bit values. Returns the fraction of
// equal bits over the compared length, or std::nullopt when the ranges do
// not overlap or when the score can no longer reach cancelThreshold even if
// all remaining bits matched.
std::optional<double> alignedSimilarity(std::vector<int>::const_iterator it1,
                                        std::vector<int>::const_iterator end1,
                                        std::vector<int>::const_iterator it2,
                                        std::vector<int>::const_iterator end2,
                                        double cancelThreshold)
{
    const int total_bits = static_cast<int>(std::min(end1 - it1, end2 - it2)) * 32;
    if (total_bits == 0)
        return std::nullopt;  // nothing to compare; avoids computing 0/0

    const int threshold_bits = static_cast<int>(total_bits * cancelThreshold);
    int equal_bits = 0;
    int remaining = total_bits;
    for (; it1 != end1 && it2 != end2; ++it1, ++it2)
    {
        equal_bits += 32 - __builtin_popcount(*it1 ^ *it2);
        remaining -= 32;
        // Early exit: even a perfect match of the rest would stay below
        // the threshold.
        if (equal_bits + remaining < threshold_bits)
            return std::nullopt;
    }
    return equal_bits / static_cast<double>(total_bits);
}

} // namespace

// Finds the best alignment between two fingerprints that were prepadded
// with m_maxoffset zeros (as addSong and addSongAndCompare store them).
//
// Offsets 0 .. m_maxoffset-1 start fp1 that many positions inside its zero
// padding; offsets 1 .. m_maxoffset-1 then do the same for fp2 and are
// reported as negative values. Alignments that cannot reach cancelThreshold
// are abandoned early and ignored.
//
// Returns (offset, similarity) of the best alignment; on ties the first
// one tried wins. Returns (-1, -1) when no alignment was accepted, so
// callers must check the similarity before trusting the offset.
//
// NOTE(review): both fingerprints are taken by value, as declared in the
// class, so every call copies them; passing them by const reference would
// need a matching change to the declaration.
std::pair<int, double> FingerprintManager::compareChromaprintFingerprintsAndOffset(std::vector<int> fp1, std::vector<int> fp2, double cancelThreshold) const
{
    int best_offset = -1;
    double best_result = -1;

    // Without the expected padding the iterator arithmetic below would run
    // past the end of the vectors.
    if (m_maxoffset <= 0
        || fp1.size() < static_cast<std::vector<int>::size_type>(m_maxoffset)
        || fp2.size() < static_cast<std::vector<int>::size_type>(m_maxoffset))
        return std::make_pair(best_offset, best_result);

    // fp1 starts `offset` positions inside its padding.
    for (int offset = 0; offset < m_maxoffset; ++offset)
    {
        const auto similarity = alignedSimilarity(fp1.cbegin() + (m_maxoffset - offset), fp1.cend(),
                                                  fp2.cbegin() + m_maxoffset, fp2.cend(),
                                                  cancelThreshold);
        if (similarity && *similarity > best_result)
        {
            best_result = *similarity;
            best_offset = offset;
        }
    }

    // fp2 starts `offset` positions inside its padding (negative offsets).
    for (int offset = 1; offset < m_maxoffset; ++offset)
    {
        const auto similarity = alignedSimilarity(fp1.cbegin() + m_maxoffset, fp1.cend(),
                                                  fp2.cbegin() + (m_maxoffset - offset), fp2.cend(),
                                                  cancelThreshold);
        if (similarity && *similarity > best_result)
        {
            best_result = *similarity;
            best_offset = -offset;
        }
    }

    return std::make_pair(best_offset, best_result);
}

// Placeholder: returns the first stored value of songID1's fingerprint.
// songID2, maxoffset and cancelThreshold are accepted but not used yet.
//
// Throws std::out_of_range if songID1 has not been added or its stored
// fingerprint is empty. Looking the song up with at() also avoids inserting
// an empty, unpadded entry into the map for unknown IDs.
long FingerprintManager::compareSongs(long songID1, long songID2, int maxoffset, double cancelThreshold)
{
    (void)songID2;
    (void)maxoffset;
    (void)cancelThreshold;
    return m_fingerprints.at(songID1).at(0);
}

// Python module definition: exposes the two test helpers and the
// FingerprintManager class.
//
// Plain member-function pointers do not carry C++ default arguments, so the
// defaults declared in the class are repeated here as keyword arguments.
// Existing calls that pass every argument positionally keep working.
BOOST_PYTHON_MODULE(bard_ext)
{
    namespace bp = boost::python;

    bp::def("greet", greet<int>);
    bp::def("greet2", greet2);

    bp::class_<FingerprintManager>("FingerprintManager")
        .def("addSong", &FingerprintManager::addSong)
        .def("addSongAndCompare", &FingerprintManager::addSongAndCompare,
             (bp::arg("songID"), bp::arg("fingerprint"),
              bp::arg("cancelThreshold") = 0.55))
        .def("compareSongs", &FingerprintManager::compareSongs,
             (bp::arg("songID1"), bp::arg("songID2"),
              bp::arg("maxoffset") = 50, bp::arg("cancelThreshold") = 0.55))
        .def("setMaxOffset", &FingerprintManager::setMaxOffset)
        .def("maxOffset", &FingerprintManager::maxOffset);
}


@@ -1,4 +1,14 @@
from setuptools import setup
from setuptools import setup, Extension


# C++ extension with the fingerprint comparison code. It is built inside
# the bard package because bard.py imports it as bard.bard_ext; the module
# init function is still looked up by the last name component (bard_ext).
# NOTE(review): the include and library directories are hard-coded and may
# need adjusting on other distributions.
module1 = Extension('bard.bard_ext',
                    define_macros=[('MAJOR_VERSION', '1'),
                                   ('MINOR_VERSION', '0')],
                    include_dirs=['/usr/include/boost'],
                    libraries=['boost_python-py3'],
                    library_dirs=['/usr/lib'],
                    sources=['bard/bard_ext.cpp'],
                    extra_compile_args=['-std=c++1z'])

setup(
# Application name:
@@ -35,4 +45,5 @@
data_files=[('share/doc/packages/bard/', ['config/bard', 'README.md', 'LICENSE'])],
scripts=["scripts/bard"],
license="GPLv3",
ext_modules=[module1]
)

0 comments on commit 1e9efbe

Please sign in to comment.
You can’t perform that action at this time.