Skip to content

Commit

Permalink
Merge pull request #6 from nexB/5-approximate-file-matching
Browse files Browse the repository at this point in the history
Implement approximate file matching #5
  • Loading branch information
JonoYang committed Apr 16, 2024
2 parents 3fc99a2 + 04523cb commit 842777f
Show file tree
Hide file tree
Showing 7 changed files with 4,690 additions and 1 deletion.
5 changes: 5 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
Changelog
=========

v4.1.0
------

*2024-04-15* -- Add new functions to compute fingerprints on text resources for approximate file matching (https://github.com/nexB/matchcode-toolkit/issues/5)

v4.0.0
------

Expand Down
77 changes: 77 additions & 0 deletions src/matchcode_toolkit/fingerprinting.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#

import binascii
import re

from matchcode_toolkit.halohash import BitAverageHaloHash

Expand Down Expand Up @@ -158,3 +159,79 @@ def create_halohash_chunks(bah128):
chunk4 = hexstring_to_binarray(chunk4)

return chunk1, chunk2, chunk3, chunk4


# Split on whitespace and punctuations: keep only characters and numbers
query_pattern = '[^_\\W]+'
word_splitter = re.compile(query_pattern, re.UNICODE).findall


def _tokenizer(text):
"""
Return an list of tokens from a unicode text.
"""
if not text:
return []
return [token for token in word_splitter(text) if token]


def tokenizer(text):
    """
    Return a list of lowercased tokens from a unicode ``text``.

    Return an empty list if ``text`` is empty or None (the previous behavior
    raised AttributeError on None).

    For example::
    >>> list(tokenizer(''))
    []
    >>> list(tokenizer(None))
    []
    >>> x = list(tokenizer('some Text with spAces! + _ -'))
    >>> assert x == ['some', 'text', 'with', 'spaces']
    >>> x = list(tokenizer('{{}some }}Text with spAces! + _ -'))
    >>> assert x == ['some', 'text', 'with', 'spaces']
    >>> x = list(tokenizer('{{Hi}}some {{}}Text with{{noth+-_!@ing}} {{junk}}spAces! + _ -{{}}'))
    >>> assert x == ['hi', 'some', 'text', 'with', 'noth', 'ing', 'junk', 'spaces']
    """
    # Guard before calling .lower() so None does not crash; mirrors the
    # empty-input handling in _tokenizer.
    if not text:
        return []
    return _tokenizer(text.lower())


def get_file_fingerprint_hashes(location, ngram_length=8, **kwargs):
    """
    Return a mapping of fingerprint hashes for the file at ``location``.

    The mapping contains a single key, ``halo1``, whose value is the hex
    digest of the file fingerprint, or an empty string for an empty file.

    - The file content is first broken into words (tokens).
    - ngrams of ``ngram_length`` tokens are then computed over that token list.

    Return an empty mapping if ``location`` is not a text file.
    """
    from commoncode import filetype
    from licensedcode.tokenize import ngrams
    from typecode.contenttype import get_type

    # Skip anything that is not a regular text file.
    resource_type = get_type(location)
    if not (filetype.is_file(location) and resource_type.is_text):
        return {}

    with open(location, encoding='utf-8') as f:
        content = f.read()

    # Tokenize the content, then flatten each ngram into one bytes value so
    # the whole sequence can be fed to BitAverageHaloHash.
    tokens = tokenizer(content)
    ngram_bytes = [
        b''.join(token.encode('utf-8') for token in ngram)
        for ngram in ngrams(tokens, ngram_length)
    ]

    # A file with fewer than ``ngram_length`` tokens yields no ngrams and
    # therefore an empty fingerprint string.
    fingerprint = BitAverageHaloHash(ngram_bytes) if ngram_bytes else None
    halo1 = fingerprint.hexdigest().decode('utf-8') if fingerprint else ''
    return dict(halo1=halo1)
3 changes: 2 additions & 1 deletion src/matchcode_toolkit/pipelines/fingerprint_codebase.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ def steps(cls):

def fingerprint_codebase(self):
    """
    Compute directory and resource fingerprints for matching purposes
    """
    matchcode.fingerprint_codebase_directories(self.project)
    matchcode.fingerprint_codebase_resources(self.project)
28 changes: 28 additions & 0 deletions tests/test_fingerprinting.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,9 @@
from matchcode_toolkit.fingerprinting import create_content_fingerprint
from matchcode_toolkit.fingerprinting import create_halohash_chunks
from matchcode_toolkit.fingerprinting import create_structure_fingerprint
from matchcode_toolkit.fingerprinting import get_file_fingerprint_hashes
from matchcode_toolkit.fingerprinting import split_fingerprint
from matchcode_toolkit.halohash import byte_hamming_distance


class Resource():
Expand Down Expand Up @@ -124,3 +126,29 @@ def test_do_not_compute_fingerprint_for_empty_dirs(self):
self.assertEqual({}, empty_dir_1.extra_data)
self.assertEqual({}, empty_dir_2.extra_data)
self.assertEqual({}, empty_dir_2.extra_data)

def test_get_file_fingerprint_hashes_one_line_removed(self):
    # Removing one line from the file should only flip a couple of bits in
    # the halohash fingerprint.
    original = self.get_test_loc('inflate.c')
    modified = self.get_test_loc('inflate-mod.c')
    fingerprint1 = get_file_fingerprint_hashes(original).get('halo1')
    fingerprint2 = get_file_fingerprint_hashes(modified).get('halo1')
    assert fingerprint1 == 'a23a49e4cd40718d1297be719e6564a4'
    assert fingerprint2 == 'aa3a49e4cd40718d1297be519e6564a4'
    assert byte_hamming_distance(fingerprint1, fingerprint2) == 2

def test_get_file_fingerprint_hashes_one_line_added(self):
    # Adding one line to the file should only flip a few bits in the
    # halohash fingerprint.
    original = self.get_test_loc('inflate.c')
    modified = self.get_test_loc('inflate-mod2.c')
    fingerprint1 = get_file_fingerprint_hashes(original).get('halo1')
    fingerprint2 = get_file_fingerprint_hashes(modified).get('halo1')
    assert fingerprint1 == 'a23a49e4cd40718d1297be719e6564a4'
    assert fingerprint2 == 'a23b49e4cd40708d1297be719c6564a4'
    assert byte_hamming_distance(fingerprint1, fingerprint2) == 3
Loading

0 comments on commit 842777f

Please sign in to comment.