From 3ed660994fe4718d948656b717e97b8d18473fe4 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Fri, 12 Apr 2024 12:06:19 -0700 Subject: [PATCH] Convert ngrams to bytes before combining #5 Signed-off-by: Jono Yang --- src/matchcode_toolkit/fingerprinting.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/matchcode_toolkit/fingerprinting.py b/src/matchcode_toolkit/fingerprinting.py index 2d5f1b3..b4cd05c 100644 --- a/src/matchcode_toolkit/fingerprinting.py +++ b/src/matchcode_toolkit/fingerprinting.py @@ -226,11 +226,18 @@ def get_file_fingerprint_hashes(location, ngram_length=8, **kwargs): # break content into words, then create ngrams from words words = tokenizer(content) ngs = ngrams(words, ngram_length) + # We convert each list of ngrams to a sequence of bytes - ngs = [b''.join(ng).encode('utf-8') for ng in ngs] + ngs_bytes = [] + for ng in ngs: + ng_bytes = [] + for g in ng: + ng_bytes.append(g.encode('utf-8')) + ngs_bytes.append(ng_bytes) + ngs_bytes = [b''.join(ng) for ng in ngs_bytes] # Create fingerprints and return fingerprint hashes - file_fingerprint = BitAverageHaloHash(ngs) if ngs else None + file_fingerprint = BitAverageHaloHash(ngs_bytes) if ngs_bytes else None return dict( halo1=file_fingerprint.hexdigest().decode('utf-8') if file_fingerprint else ''