Skip to content

Commit

Permalink
~2x speed increase on findAudioDuplicates
Browse files Browse the repository at this point in the history
Speed increased by around 2x by using gmpy.popcount and better data handling.

Also added some test code to compareSongs that runs the comparison
twice to check that the results from the previous and the current core
are equivalent.

The test set is a collection of 4759 songs and the results are:
Before this commit:

>>> p.print_stats()
Fri Aug  4 10:17:41 2017    profile_bard_idx

         301527142 function calls (301522248 primitive calls) in 215.097 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
    330/1    0.007    0.000  215.097  215.097 {built-in method builtins.exec}
        1    0.000    0.000  215.097  215.097 /usr/bin/bard:2(<module>)
        1    0.000    0.000  214.944  214.944 /home/antonio/git/antlarr/bard/bard/bard.py:777(main)
        1    0.053    0.053  214.943  214.943 /home/antonio/git/antlarr/bard/bard/bard.py:607(parseCommandLine)
        1    0.059    0.059  214.885  214.885 /home/antonio/git/antlarr/bard/bard/bard.py:462(findAudioDuplicates)
     3578    0.162    0.000  213.436    0.060 /home/antonio/git/antlarr/bard/bard/bard.py:58(compareChromaprintFingerprintsAndOffset)
   354222  102.851    0.000  213.171    0.001 /home/antonio/git/antlarr/bard/bard/bard.py:35(compareChromaprintFingerprints)
299543702  110.207    0.000  110.207    0.000 /home/antonio/git/antlarr/bard/bard/bard.py:18(bitsoncount)
       54    0.650    0.012    0.650    0.012 {method 'execute' of 'sqlite3.Cursor' objects}

After this commit:

>>> p.print_stats()
Fri Aug  4 18:17:44 2017    profile_bard_gmpy

         301564162 function calls (301557572 primitive calls) in 127.292 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
    424/1    0.007    0.000  127.292  127.292 {built-in method builtins.exec}
        1    0.000    0.000  127.292  127.292 /usr/bin/bard:2(<module>)
        1    0.000    0.000  127.099  127.099 /home/antonio/git/antlarr/bard/bard/bard.py:847(main)
        1    0.128    0.128  127.099  127.099 /home/antonio/git/antlarr/bard/bard/bard.py:677(parseCommandLine)
        1    0.084    0.084  126.966  126.966 /home/antonio/git/antlarr/bard/bard/bard.py:509(findAudioDuplicates)
     3578    0.300    0.000  123.672    0.035 /home/antonio/git/antlarr/bard/bard/bard.py:105(compareChromaprintFingerprintsAndOffset2)
   354222   98.899    0.000  123.290    0.000 /home/antonio/git/antlarr/bard/bard/bard.py:38(compareChromaprintFingerprints)
299543702   24.271    0.000   24.271    0.000 {built-in method gmpy.popcount}
     3616    1.857    0.001    1.857    0.001 /home/antonio/git/antlarr/bard/bard/bard.py:526(<listcomp>)
       54    0.647    0.012    0.647    0.012 {method 'execute' of 'sqlite3.Cursor' objects}
  • Loading branch information
antlarr committed Aug 8, 2017
1 parent dce5716 commit 8dffcdf
Showing 1 changed file with 101 additions and 28 deletions.
129 changes: 101 additions & 28 deletions bard/bard.py
Expand Up @@ -8,6 +8,9 @@
import sys import sys
import os import os
import re import re
import ctypes
import numpy
from gmpy import popcount
import mutagen import mutagen
import argparse import argparse
import subprocess import subprocess
Expand All @@ -32,27 +35,74 @@ def compareBits(x, y):
return same_bits return same_bits




def compareChromaprintFingerprints(a, b, threshold=0.9, cancelThreshold=0.55): def compareChromaprintFingerprints(a, b, threshold=0.9, cancelThreshold=0.55, offset=None):
equal_bits = 0 equal_bits = 0
total_bits = 32.0 * min(len(a[0]), len(b[0])) total_idx = min(len(a[0]), len(b[0]))
total_bits = 32.0 * total_idx
remaining = total_bits remaining = total_bits
if cancelThreshold > threshold:
cancelThreshold = threshold
thresholdBits = int(total_bits * cancelThreshold) thresholdBits = int(total_bits * cancelThreshold)
for x, y in zip(a[0], b[0]): for i, (x, y) in enumerate(zip(a[0], b[0])):
# equal_bits += 32 - bin(x ^ y).count('1') #print('old', offset, i, x, y)
equal_bits += 32 - bitsoncount(x ^ y) equal_bits += 32 - popcount(x.value ^ y.value)
#print(equal_bits)
remaining -= 32 remaining -= 32
if remaining and equal_bits + remaining < thresholdBits: if equal_bits + remaining < thresholdBits:
return None return -1
# if equal_bits / total_bits < threshold:
# return equal_bits / total_bits
# print(equal_bits, total_bits, equal_bits / total_bits, threshold)
return equal_bits / total_bits return equal_bits / total_bits
# return equal_bits / total_bits >= threshold




def compareChromaprintFingerprintsAndOffset(a, b, maxoffset=60, debug=False): def compareChromaprintFingerprintsAndOffset(a, b, maxoffset=50, debug=False):
if not a[0] or not b[0]:
return (None, None)

cancelThreshold = 0.55
equal_bits = [0] * (2 * maxoffset)
result = equal_bits[:]
total_idx = ([min(len(a[0]) - maxoffset + idx,
len(b[0]) - maxoffset) for idx in range(maxoffset)] +
list(reversed([min(len(a[0]) - maxoffset,
len(b[0]) - maxoffset + idx)
for idx in range(1, maxoffset)])))
total_bits = [32.0 * x for x in total_idx]
remaining = total_bits[:]
thresholdBits = [int(x * cancelThreshold) for x in total_bits]
for offset in range(0, maxoffset):
remaining = total_bits[offset]
for i in range(total_idx[offset]):
# x = a[0][i - offset]
# y = b[0][i]
equal_bits[offset] += 32 - popcount(a[0][i-offset].value ^ b[0][i].value)
remaining -= 32
if equal_bits[offset] + remaining < thresholdBits[offset]:
result[offset] = -1
break
else:
result[offset] = equal_bits[offset] / total_bits[offset]
# print('new',offset, result[offset])

for offset in reversed(range(-maxoffset + 1, 0)):
remaining = total_bits[offset]
for i in range(total_idx[offset]):
# x = a[0][i]
# y = b[0][i + offset]
# print('new', offset, i, x, y)
equal_bits[offset] += 32 - popcount(a[0][i].value ^ b[0][i+offset].value)
remaining -= 32
if equal_bits[offset] + remaining < thresholdBits[offset]:
result[offset] = -1
break
else:
result[offset] = equal_bits[offset] / total_bits[offset]
# print('new',offset, result[offset])

max_idx = numpy.argmax(result)
max_val = result[max_idx]
if max_idx > maxoffset:
max_idx = -(maxoffset * 2 - max_idx)
return (max_idx, max_val)


def compareChromaprintFingerprintsAndOffset2(a, b, maxoffset=50, debug=False):
if not a[0] or not b[0]: if not a[0] or not b[0]:
return (None, None) return (None, None)
tmp = (a[0][:], a[1]) tmp = (a[0][:], a[1])
Expand All @@ -61,26 +111,24 @@ def compareChromaprintFingerprintsAndOffset(a, b, maxoffset=60, debug=False):
if debug: if debug:
print(0, result) print(0, result)
for i in range(1, maxoffset): for i in range(1, maxoffset):
tmp[0].insert(0, 0) tmp[0].insert(0, ctypes.c_uint32(0))
r = compareChromaprintFingerprints(tmp, b) r = compareChromaprintFingerprints(tmp, b)
if debug: if debug:
print(i, r) print(i, r)
if not r: if r > result:
continue
if not result or r > result:
result = r result = r
result_offset = i result_offset = i
tmp = (b[0][:], b[1]) tmp = (b[0][:], b[1])
for i in range(1, maxoffset): for i in range(1, maxoffset):
tmp[0].insert(0, 0) tmp[0].insert(0, ctypes.c_uint32(0))
r = compareChromaprintFingerprints(a, tmp) r = compareChromaprintFingerprints(a, tmp)
if debug: if debug:
print(-i, r) print(-i, r)
if not r: if r > result:
continue
if not result or r > result:
result = r result = r
result_offset = -i result_offset = -i
if result < 0:
result = None
return (result_offset, result) return (result_offset, result)




Expand Down Expand Up @@ -255,7 +303,7 @@ def list(self, path, long_ls=False):
songs = self.getSongs(path=path) songs = self.getSongs(path=path)
for song in songs: for song in songs:
if long_ls: if long_ls:
command = ['ls','-l', song.path()] command = ['ls', '-l', song.path()]
subprocess.run(command) subprocess.run(command)
else: else:
print("%s" % song.path()) print("%s" % song.path())
Expand Down Expand Up @@ -449,15 +497,18 @@ def findAudioDuplicates(self, from_song_id=0):
decodedFPs = {} decodedFPs = {}
matchThreshold = 0.8 matchThreshold = 0.8
storeThreshold = 0.55 storeThreshold = 0.55
maxoffset = 50
sql = ('SELECT id, fingerprint, sha256sum, audio_sha256sum, path, ' sql = ('SELECT id, fingerprint, sha256sum, audio_sha256sum, path, '
'completeness FROM fingerprints, songs, checksums, ' 'completeness FROM fingerprints, songs, checksums, '
'properties where songs.id=fingerprints.song_id and ' 'properties where songs.id=fingerprints.song_id and '
'songs.id = checksums.song_id and ' 'songs.id = checksums.song_id and '
'songs.id = properties.song_id order by id') 'songs.id = properties.song_id order by id')
for (songID, fingerprint, sha256sum, audioSha256sum, path, for (songID, fingerprint, sha256sum, audioSha256sum, path,
completeness) in c.execute(sql): completeness) in c.execute(sql):
# print('.', end='', flush=True) # print('.', songID, end='', flush=True)
dfp = chromaprint.decode_fingerprint(fingerprint) dfp = chromaprint.decode_fingerprint(fingerprint)
# dfp = ([ctypes.c_uint32(x) for x in dfp[0]], dfp[1])
dfp = ([ctypes.c_uint32(x) for x in dfp[0]] + [ctypes.c_uint32(0)] * maxoffset, dfp[1])
if not dfp[0]: if not dfp[0]:
print("Error calculating fingerprint of song %s (%s)" % print("Error calculating fingerprint of song %s (%s)" %
(songID, path)) (songID, path))
Expand All @@ -467,6 +518,8 @@ def findAudioDuplicates(self, from_song_id=0):
decodedFPs[fingerprint] = dfp decodedFPs[fingerprint] = dfp
info[songID] = (sha256sum, audioSha256sum, path, completeness) info[songID] = (sha256sum, audioSha256sum, path, completeness)
continue continue
if songID > from_song_id:
return


for fp, otherSongID in fingerprints.items(): for fp, otherSongID in fingerprints.items():
offset, similarity = \ offset, similarity = \
Expand Down Expand Up @@ -527,19 +580,39 @@ def getSongsFromIDorPath(self, id_or_path):
def compareSongs(self, song1, song2): def compareSongs(self, song1, song2):
matchThreshold = 0.8 matchThreshold = 0.8
storeThreshold = 0.55 storeThreshold = 0.55
maxoffset = 50
dfp1 = chromaprint.decode_fingerprint(song1.getAcoustidFingerprint()) dfp1 = chromaprint.decode_fingerprint(song1.getAcoustidFingerprint())
dfp1 = ([ctypes.c_uint32(x) for x in dfp1[0]] + [ctypes.c_uint32(0)] * maxoffset,
dfp1[1])
dfp2 = chromaprint.decode_fingerprint(song2.getAcoustidFingerprint()) dfp2 = chromaprint.decode_fingerprint(song2.getAcoustidFingerprint())
dfp2 = ([ctypes.c_uint32(x) for x in dfp2[0]] + [ctypes.c_uint32(0)] * maxoffset,
dfp2[1])
(offset, similarity) = compareChromaprintFingerprintsAndOffset(dfp1, (offset, similarity) = compareChromaprintFingerprintsAndOffset(dfp1,
dfp2, dfp2,
120, maxoffset,
True) True)
if similarity and similarity >= storeThreshold \ if similarity and similarity >= storeThreshold \
and song1.id and song2.id: and song1.id and song2.id:
print('******** %d %d %d %f' % (song1.id, song2.id, print('******** %d %d %d %f' % (song1.id, song2.id,
offset, similarity)) offset, similarity))
MusicDatabase.addSongsSimilarity(song1.id, song2.id, # MusicDatabase.addSongsSimilarity(song1.id, song2.id,
offset, similarity) # offset, similarity)
MusicDatabase.commit() # MusicDatabase.commit()

dfp1 = chromaprint.decode_fingerprint(song1.getAcoustidFingerprint())
dfp1 = ([ctypes.c_uint32(x) for x in dfp1[0]], dfp1[1])
dfp2 = chromaprint.decode_fingerprint(song2.getAcoustidFingerprint())
dfp2 = ([ctypes.c_uint32(x) for x in dfp2[0]], dfp2[1])
(offset, similarity) = compareChromaprintFingerprintsAndOffset2(dfp1,
dfp2,
maxoffset,
True)

if similarity and similarity >= storeThreshold \
and song1.id and song2.id:
print('******** %d %d %d %f' % (song1.id, song2.id,
offset, similarity))
return


if similarity and similarity >= matchThreshold: if similarity and similarity >= matchThreshold:
if song1.fileSha256sum() == song2.fileSha256sum(): if song1.fileSha256sum() == song2.fileSha256sum():
Expand Down

0 comments on commit 8dffcdf

Please sign in to comment.