Skip to content

Commit

Permalink
Merge pull request #6 from nexB/5-approximate-file-matching
Browse files Browse the repository at this point in the history
Implement approximate file matching #5
  • Loading branch information
JonoYang committed Apr 16, 2024
2 parents 3fc99a2 + 04523cb commit 842777f
Show file tree
Hide file tree
Showing 7 changed files with 4,690 additions and 1 deletion.
5 changes: 5 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
Changelog
=========

v4.1.0
------

*2024-04-15* -- Add new functions to compute fingerprints on text resources for approximate file matching (https://github.com/nexB/matchcode-toolkit/issues/5)

v4.0.0
------

Expand Down
77 changes: 77 additions & 0 deletions src/matchcode_toolkit/fingerprinting.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#

import binascii
import re

from matchcode_toolkit.halohash import BitAverageHaloHash

Expand Down Expand Up @@ -158,3 +159,79 @@ def create_halohash_chunks(bah128):
chunk4 = hexstring_to_binarray(chunk4)

return chunk1, chunk2, chunk3, chunk4


# Split on whitespace and punctuations: keep only characters and numbers
query_pattern = '[^_\\W]+'
word_splitter = re.compile(query_pattern, re.UNICODE).findall


def _tokenizer(text):
"""
Return an list of tokens from a unicode text.
"""
if not text:
return []
return [token for token in word_splitter(text) if token]


def tokenizer(text):
    """
    Return a list of lowercased tokens from a unicode ``text``.

    Return an empty list if ``text`` is empty or None (the previous behavior
    raised AttributeError on None).

    For example::
    >>> list(tokenizer(''))
    []
    >>> list(tokenizer(None))
    []
    >>> x = list(tokenizer('some Text with spAces! + _ -'))
    >>> assert x == ['some', 'text', 'with', 'spaces']
    >>> x = list(tokenizer('{{}some }}Text with spAces! + _ -'))
    >>> assert x == ['some', 'text', 'with', 'spaces']
    >>> x = list(tokenizer('{{Hi}}some {{}}Text with{{noth+-_!@ing}} {{junk}}spAces! + _ -{{}}'))
    >>> assert x == ['hi', 'some', 'text', 'with', 'noth', 'ing', 'junk', 'spaces']
    """
    # Guard before calling .lower() so None does not crash; mirrors the
    # empty-input handling in _tokenizer.
    if not text:
        return []
    return _tokenizer(text.lower())


def get_file_fingerprint_hashes(location, ngram_length=8, **kwargs):
    """
    Return a mapping of fingerprint hashes for the file at ``location``.

    The mapping contains a single key, ``halo1``, whose value is the hex
    digest of the file fingerprint, or an empty string for an empty file.

    - The file content is first broken into words (tokens).
    - ngrams of ``ngram_length`` tokens are then computed over that token list.

    Return an empty mapping if ``location`` is not a text file.
    """
    from commoncode import filetype
    from licensedcode.tokenize import ngrams
    from typecode.contenttype import get_type

    # Skip anything that is not a regular text file.
    resource_type = get_type(location)
    if not (filetype.is_file(location) and resource_type.is_text):
        return {}

    with open(location, encoding='utf-8') as f:
        content = f.read()

    # Tokenize the content, then flatten each ngram into one bytes value so
    # the whole sequence can be fed to BitAverageHaloHash.
    tokens = tokenizer(content)
    ngram_bytes = [
        b''.join(token.encode('utf-8') for token in ngram)
        for ngram in ngrams(tokens, ngram_length)
    ]

    # A file with fewer than ``ngram_length`` tokens yields no ngrams and
    # therefore an empty fingerprint string.
    fingerprint = BitAverageHaloHash(ngram_bytes) if ngram_bytes else None
    halo1 = fingerprint.hexdigest().decode('utf-8') if fingerprint else ''
    return dict(halo1=halo1)
3 changes: 2 additions & 1 deletion src/matchcode_toolkit/pipelines/fingerprint_codebase.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ def steps(cls):

def fingerprint_codebase(self):
    """
    Compute directory and resource fingerprints for matching purposes
    """
    matchcode.fingerprint_codebase_directories(self.project)
    matchcode.fingerprint_codebase_resources(self.project)
28 changes: 28 additions & 0 deletions tests/test_fingerprinting.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,9 @@
from matchcode_toolkit.fingerprinting import create_content_fingerprint
from matchcode_toolkit.fingerprinting import create_halohash_chunks
from matchcode_toolkit.fingerprinting import create_structure_fingerprint
from matchcode_toolkit.fingerprinting import get_file_fingerprint_hashes
from matchcode_toolkit.fingerprinting import split_fingerprint
from matchcode_toolkit.halohash import byte_hamming_distance


class Resource():
Expand Down Expand Up @@ -124,3 +126,29 @@ def test_do_not_compute_fingerprint_for_empty_dirs(self):
self.assertEqual({}, empty_dir_1.extra_data)
self.assertEqual({}, empty_dir_2.extra_data)
self.assertEqual({}, empty_dir_2.extra_data)

def test_get_file_fingerprint_hashes_one_line_removed(self):
    # Removing one line from the file should only flip a couple of bits in
    # the halohash fingerprint.
    original = self.get_test_loc('inflate.c')
    modified = self.get_test_loc('inflate-mod.c')
    fingerprint1 = get_file_fingerprint_hashes(original).get('halo1')
    fingerprint2 = get_file_fingerprint_hashes(modified).get('halo1')
    assert fingerprint1 == 'a23a49e4cd40718d1297be719e6564a4'
    assert fingerprint2 == 'aa3a49e4cd40718d1297be519e6564a4'
    assert byte_hamming_distance(fingerprint1, fingerprint2) == 2

def test_get_file_fingerprint_hashes_one_line_added(self):
    # Adding one line to the file should only flip a few bits in the
    # halohash fingerprint.
    original = self.get_test_loc('inflate.c')
    modified = self.get_test_loc('inflate-mod2.c')
    fingerprint1 = get_file_fingerprint_hashes(original).get('halo1')
    fingerprint2 = get_file_fingerprint_hashes(modified).get('halo1')
    assert fingerprint1 == 'a23a49e4cd40718d1297be719e6564a4'
    assert fingerprint2 == 'a23b49e4cd40708d1297be719c6564a4'
    assert byte_hamming_distance(fingerprint1, fingerprint2) == 3
Loading

0 comments on commit 842777f

Please sign in to comment.