Skip to content

Commit

Permalink
Use code files for testing #5
Browse files Browse the repository at this point in the history
Signed-off-by: Jono Yang <jyang@nexb.com>
  • Loading branch information
JonoYang committed Apr 16, 2024
1 parent a9316c2 commit 04523cb
Show file tree
Hide file tree
Showing 7 changed files with 4,606 additions and 17 deletions.
8 changes: 4 additions & 4 deletions src/matchcode_toolkit/fingerprinting.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,6 @@
import binascii
import re

from commoncode import filetype
from licensedcode.tokenize import ngrams
from typecode.contenttype import get_type

from matchcode_toolkit.halohash import BitAverageHaloHash


Expand Down Expand Up @@ -211,6 +207,10 @@ def get_file_fingerprint_hashes(location, ngram_length=8, **kwargs):
Return an empty mapping if `location` is not a text file
"""
from commoncode import filetype
from licensedcode.tokenize import ngrams
from typecode.contenttype import get_type

# Do not process `location` if it's not a text file
ft = get_type(location)
if not (filetype.is_file(location) and ft.is_text):
Expand Down
35 changes: 24 additions & 11 deletions tests/test_fingerprinting.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from matchcode_toolkit.fingerprinting import create_structure_fingerprint
from matchcode_toolkit.fingerprinting import get_file_fingerprint_hashes
from matchcode_toolkit.fingerprinting import split_fingerprint
from matchcode_toolkit.halohash import byte_hamming_distance


class Resource():
Expand Down Expand Up @@ -126,16 +127,28 @@ def test_do_not_compute_fingerprint_for_empty_dirs(self):
self.assertEqual({}, empty_dir_2.extra_data)
self.assertEqual({}, empty_dir_2.extra_data)

def test_get_file_fingerprint_hashes_same_halo1_with_exact_repetition(self):
"""
The test files contain the string 'z1 z2 z3 z4 z5 z6 z7 z8' repeated
some number of times.
In the case of test_file1, the string is repeated twice in a row. In the
case of test_file2, the string is repeated 8 times.
"""
test_file1 = self.get_test_loc('16words')
test_file2 = self.get_test_loc('64words')
def test_get_file_fingerprint_hashes_one_line_removed(self):
# Fingerprint two C test files and check that their `halo1` hashes are
# close but not identical.
# NOTE(review): going by the test name, 'inflate-mod.c' is presumably
# 'inflate.c' with one line removed -- confirm against the test files.
test_file1 = self.get_test_loc('inflate.c')
test_file2 = self.get_test_loc('inflate-mod.c')
result1 = get_file_fingerprint_hashes(test_file1)
result2 = get_file_fingerprint_hashes(test_file2)
# From here on result1/result2 hold the 'halo1' hash values rather than
# the mappings returned by get_file_fingerprint_hashes().
result1 = result1.get('halo1')
result2 = result2.get('halo1')
# Pinned hex fingerprints; the two expected values differ in two hex digits.
expected_result1 = 'a23a49e4cd40718d1297be719e6564a4'
expected_result2 = 'aa3a49e4cd40718d1297be519e6564a4'
assert result1 == expected_result1
assert result2 == expected_result2
# The small edit should keep the two fingerprints at a distance of 2.
assert byte_hamming_distance(result1, result2) == 2

def test_get_file_fingerprint_hashes_one_line_added(self):
# Fingerprint two C test files and check that their `halo1` hashes are
# close but not identical.
# NOTE(review): going by the test name, 'inflate-mod2.c' is presumably
# 'inflate.c' with one line added -- confirm against the test files.
test_file1 = self.get_test_loc('inflate.c')
test_file2 = self.get_test_loc('inflate-mod2.c')
result1 = get_file_fingerprint_hashes(test_file1)
result2 = get_file_fingerprint_hashes(test_file2)
# NOTE(review): the equality assertion below cannot hold together with the
# two different expected values asserted further down. In this diff view it
# appears to be a line the commit removed from the previous test -- confirm
# that it is absent from the committed file.
assert result1.get('halo1') == result2.get('halo1')
# From here on result1/result2 hold the 'halo1' hash values rather than
# the mappings returned by get_file_fingerprint_hashes().
result1 = result1.get('halo1')
result2 = result2.get('halo1')
# Pinned hex fingerprints; the two expected values differ in three hex digits.
expected_result1 = 'a23a49e4cd40718d1297be719e6564a4'
expected_result2 = 'a23b49e4cd40708d1297be719c6564a4'
assert result1 == expected_result1
assert result2 == expected_result2
# The small edit should keep the two fingerprints at a distance of 3.
assert byte_hamming_distance(result1, result2) == 3
1 change: 0 additions & 1 deletion tests/testfiles/fingerprinting/16words

This file was deleted.

1 change: 0 additions & 1 deletion tests/testfiles/fingerprinting/64words

This file was deleted.

Loading

0 comments on commit 04523cb

Please sign in to comment.