Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions matchcode-toolkit/CHANGELOG.rst
Original file line number Diff line number Diff line change
@@ -1,6 +1,16 @@
Changelog
=========

v1.1.1
------

*2023-06-29* -- Do not include empty files when computing directory fingerprints.

v1.1.0
------

*2023-06-22* -- Rename ``compute_directory_fingerprints`` to ``compute_codebase_directory_fingerprints`` and create a new version of ``compute_directory_fingerprints`` that works on Resource objects instead of codebases.

v1.0.0
------

Expand Down
2 changes: 1 addition & 1 deletion matchcode-toolkit/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "matchcode-toolkit"
version = "1.0.0"
version = "1.1.1"

[build-system]
requires = ["setuptools >= 50", "wheel", "setuptools_scm[toml] >= 6"]
Expand Down
2 changes: 1 addition & 1 deletion matchcode-toolkit/setup.cfg
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[metadata]
name = matchcode-toolkit
version = 1.0.0
version = 1.1.1
license = Apache-2.0

# description must be on ONE line https://github.com/pypa/setuptools/issues/1390
Expand Down
58 changes: 39 additions & 19 deletions matchcode-toolkit/src/matchcode_toolkit/fingerprinting.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,30 +69,50 @@ def create_structure_fingerprint(directory, children):
return _create_directory_fingerprint(features)


def compute_directory_fingerprints(codebase):
def _compute_directory_fingerprints(directory, codebase):
"""
Compute fingerprints for a directory from `codebase`
Compute fingerprints for `directory` from `codebase`
"""
for resource in codebase.walk(topdown=False):
if resource.is_file or not resource.path:
continue
children = [r for r in resource.walk(codebase) if r.is_file]
if len(children) == 1:
continue
# We do not want to add empty files to our fingerprint
children = [r for r in directory.walk(codebase) if r.is_file and r.size]
if len(children) == 1:
return

directory_content_fingerprint = create_content_fingerprint(children)
if hasattr(resource, 'directory_content_fingerprint'):
resource.directory_content_fingerprint = directory_content_fingerprint
else:
resource.extra_data['directory_content'] = directory_content_fingerprint
directory_content_fingerprint = create_content_fingerprint(children)
if hasattr(directory, 'directory_content_fingerprint'):
directory.directory_content_fingerprint = directory_content_fingerprint
else:
directory.extra_data['directory_content'] = directory_content_fingerprint

directory_structure_fingerprint = create_structure_fingerprint(resource, children)
if hasattr(resource, 'directory_structure_fingerprint'):
resource.directory_structure_fingerprint = directory_structure_fingerprint
else:
resource.extra_data['directory_structure'] = create_structure_fingerprint(resource, children)
directory_structure_fingerprint = create_structure_fingerprint(directory, children)
if hasattr(directory, 'directory_structure_fingerprint'):
directory.directory_structure_fingerprint = directory_structure_fingerprint
else:
directory.extra_data['directory_structure'] = directory_structure_fingerprint

directory.save(codebase)
return directory


def compute_directory_fingerprints(directory, codebase):
    """
    Recursively compute fingerprints for the directories under `directory`
    from `codebase` and return `directory`.
    """
    # Walk bottom-up so child directories are fingerprinted before their
    # parents; skip files since only directories get fingerprints.
    for resource in directory.walk(codebase, topdown=False):
        if resource.is_file:
            continue
        _compute_directory_fingerprints(resource, codebase)
    return directory

resource.save(codebase)

def compute_codebase_directory_fingerprints(codebase):
    """
    Compute fingerprints for all directories of `codebase` and return the
    updated `codebase`.
    """
    # Walk bottom-up so children are processed before parents; skip files
    # and the synthetic root resource (which has an empty path).
    for resource in codebase.walk(topdown=False):
        if resource.is_file or not resource.path:
            continue
        _compute_directory_fingerprints(resource, codebase)
    return codebase


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
# Visit https://github.com/nexB/scancode.io for support and download.

from matchcode_toolkit.fingerprinting import compute_directory_fingerprints
from matchcode_toolkit.fingerprinting import compute_codebase_directory_fingerprints

from scanpipe.pipelines.scan_package import ScanPackage
from scanpipe.pipes.codebase import ProjectCodebase
Expand Down Expand Up @@ -63,4 +63,4 @@ def fingerprint_codebase(self):
Compute directory fingerprints for matching purposes
"""
project_codebase = ProjectCodebase(self.project)
compute_directory_fingerprints(project_codebase)
compute_codebase_directory_fingerprints(project_codebase)
4 changes: 2 additions & 2 deletions matchcode-toolkit/src/matchcode_toolkit/plugin_fingerprint.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

from commoncode.cliutils import PluggableCommandLineOption
from commoncode.cliutils import POST_SCAN_GROUP
from matchcode_toolkit.fingerprinting import compute_directory_fingerprints
from matchcode_toolkit.fingerprinting import compute_codebase_directory_fingerprints
from plugincode.post_scan import post_scan_impl
from plugincode.post_scan import PostScanPlugin

Expand Down Expand Up @@ -41,4 +41,4 @@ def is_enabled(self, fingerprint, **kwargs):
return fingerprint

def process_codebase(self, codebase, **kwargs):
codebase = compute_directory_fingerprints(codebase)
codebase = compute_codebase_directory_fingerprints(codebase)
6 changes: 3 additions & 3 deletions matchcode-toolkit/tests/test_fingerprinting.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

from matchcode_toolkit.fingerprinting import _create_directory_fingerprint
from matchcode_toolkit.fingerprinting import _get_resource_subpath
from matchcode_toolkit.fingerprinting import compute_directory_fingerprints
from matchcode_toolkit.fingerprinting import compute_codebase_directory_fingerprints
from matchcode_toolkit.fingerprinting import create_content_fingerprint
from matchcode_toolkit.fingerprinting import create_halohash_chunks
from matchcode_toolkit.fingerprinting import create_structure_fingerprint
Expand Down Expand Up @@ -95,10 +95,10 @@ def test_create_halohash_chunks(self):
self.assertEqual(chunk3, expected_chunk3)
self.assertEqual(chunk4, expected_chunk4)

def test_compute_directory_fingerprints(self):
def test_compute_codebase_directory_fingerprints(self):
scan_loc = self.get_test_loc('abbrev-1.0.3-i.json')
vc = VirtualCodebase(location=scan_loc)
vc = compute_directory_fingerprints(vc)
vc = compute_codebase_directory_fingerprints(vc)
directory_content = vc.root.extra_data['directory_content']
directory_structure = vc.root.extra_data['directory_structure']
expected_directory_content = '0000000346ce04751a3c98f00086f16a91d9790b'
Expand Down
22 changes: 22 additions & 0 deletions matchcode/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,28 @@ class MultipleCharFilter(MultipleChoiceFilter):
field_class = MultipleCharField


# TODO: Think of a better name for this filter
class MultipleCharInFilter(MultipleCharFilter):
    """
    A MultipleCharFilter variant that rewrites the filter predicate to use
    Django's ``__in`` lookup, matching records whose field value is any one
    of the submitted values.
    """

    def filter(self, qs, value):
        """
        Return `qs` filtered so the target field is in `value`.

        Return `qs` unchanged when `value` is empty or the filter is a noop.
        """
        if not value:
            # Even though not a noop, no point filtering if empty.
            return qs

        if self.is_noop(qs, value):
            return qs

        # Rewrite the single predicate key from `<field>` to `<field>__in`
        # so one Q object matches any of the submitted values.
        predicate = self.get_filter_predicate(value)
        old_field_name = next(iter(predicate))
        new_field_name = f'{old_field_name}__in'
        predicate[new_field_name] = predicate[old_field_name]
        predicate.pop(old_field_name)

        q = Q(**predicate)
        qs = self.get_method(qs)(q)

        return qs.distinct() if self.distinct else qs


class MultipleSHA1Filter(MultipleCharFilter):
"""
Overrides `MultipleCharFilter.filter()` to convert the SHA1
Expand Down
4 changes: 2 additions & 2 deletions matchcode/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

from commoncode.resource import VirtualCodebase

from matchcode_toolkit.fingerprinting import compute_directory_fingerprints
from matchcode_toolkit.fingerprinting import compute_codebase_directory_fingerprints
from matchcode.models import ApproximateDirectoryContentIndex
from matchcode.models import ApproximateDirectoryStructureIndex
from matchcode.models import ExactPackageArchiveIndex
Expand Down Expand Up @@ -150,5 +150,5 @@ def index_package_directories(package):
if not vc:
return 0, 0

vc = compute_directory_fingerprints(vc)
vc = compute_codebase_directory_fingerprints(vc)
return index_directory_fingerprints(vc, package)
4 changes: 2 additions & 2 deletions matchcode/tests/test_index_packages.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

from commoncode.resource import VirtualCodebase

from matchcode_toolkit.fingerprinting import compute_directory_fingerprints
from matchcode_toolkit.fingerprinting import compute_codebase_directory_fingerprints
from matchcode_toolkit.fingerprinting import hexstring_to_binarray
from matchcode.indexing import _create_virtual_codebase_from_package_resources
from matchcode.indexing import index_directory_fingerprints
Expand Down Expand Up @@ -155,7 +155,7 @@ def test__create_virtual_codebase_from_package_resources(self):

def test_index_directory_fingerprints(self):
vc = _create_virtual_codebase_from_package_resources(self.test_package1)
vc = compute_directory_fingerprints(vc)
vc = compute_codebase_directory_fingerprints(vc)

# Ensure tables are empty prior to indexing
self.assertFalse(ApproximateDirectoryContentIndex.objects.all())
Expand Down
4 changes: 2 additions & 2 deletions matchcode/tests/test_match.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from commoncode.resource import VirtualCodebase
from packagedb.models import Package

from matchcode_toolkit.fingerprinting import compute_directory_fingerprints
from matchcode_toolkit.fingerprinting import compute_codebase_directory_fingerprints
from matchcode.management.commands.index_packages import index_package_directories
from matchcode.match import EXACT_PACKAGE_ARCHIVE_MATCH
from matchcode.match import APPROXIMATE_DIRECTORY_STRUCTURE_MATCH
Expand All @@ -37,7 +37,7 @@ def run_do_match_from_scan(scan_file_location, match_type):
matched_to=attr.ib(default=attr.Factory(list))
)
)
vc = compute_directory_fingerprints(vc)
vc = compute_codebase_directory_fingerprints(vc)
do_match(vc, match_type)
return vc

Expand Down
6 changes: 3 additions & 3 deletions matchcode/tests/test_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from packagedb.models import Package
import attr

from matchcode_toolkit.fingerprinting import compute_directory_fingerprints
from matchcode_toolkit.fingerprinting import compute_codebase_directory_fingerprints
from matchcode_toolkit.fingerprinting import hexstring_to_binarray
from matchcode.management.commands.index_packages import index_package_directories
from matchcode.models import ApproximateDirectoryContentIndex
Expand Down Expand Up @@ -169,7 +169,7 @@ def test_ApproximateDirectoryStructureIndex_match_subdir(self):
location=scan_location,
resource_attributes=dict(packages=attr.ib(default=attr.Factory(list)))
)
codebase = compute_directory_fingerprints(vc)
codebase = compute_codebase_directory_fingerprints(vc)

# populate codebase with match results
for resource in codebase.walk(topdown=True):
Expand All @@ -192,7 +192,7 @@ def test_ApproximateDirectoryContentIndex_match_subdir(self):
location=scan_location,
resource_attributes=dict(packages=attr.ib(default=attr.Factory(list)))
)
codebase = compute_directory_fingerprints(vc)
codebase = compute_codebase_directory_fingerprints(vc)

# populate codebase with match results
for resource in codebase.walk(topdown=True):
Expand Down
109 changes: 60 additions & 49 deletions minecode/management/commands/process_scans.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,58 +61,69 @@ def process_scan(cls, scannable_uri, get_scan_info_save_loc='', get_scan_data_sa
if scannable_uri.scan_status in (ScannableURI.SCAN_SUBMITTED, ScannableURI.SCAN_IN_PROGRESS):
scannable_uri.scan_status = get_scan_status(scan_info)
elif scannable_uri.scan_status in (ScannableURI.SCAN_COMPLETED,):
logger.info('Indexing scanned files for URI: {}'.format(scannable_uri))

package = scannable_uri.package
scan_data = scanning.get_scan_data(
scannable_uri.scan_uuid,
api_url=cls.api_url,
api_auth_headers=cls.api_auth_headers,
get_scan_data_save_loc=get_scan_data_save_loc
)
scan_index_errors = index_package_files(package, scan_data)
scan_index_errors = []
try:
logger.info('Indexing scanned files for URI: {}'.format(scannable_uri))

package = scannable_uri.package
input_size = scan_info.size
if input_size:
computed_timeout = ((input_size / 1000000) / 2) * 60
timeout = max(computed_timeout, scanning.REQUEST_TIMEOUT)
else:
timeout = scanning.REQUEST_TIMEOUT
scan_data = scanning.get_scan_data(
scannable_uri.scan_uuid,
api_url=cls.api_url,
api_auth_headers=cls.api_auth_headers,
timeout=timeout,
get_scan_data_save_loc=get_scan_data_save_loc
)
scan_index_errors.extend(index_package_files(package, scan_data))

summary = scanning.get_scan_summary(
scannable_uri.scan_uuid,
api_url=cls.api_url,
api_auth_headers=cls.api_auth_headers,
get_scan_data_save_loc=get_scan_data_save_loc
)
summary = scanning.get_scan_summary(
scannable_uri.scan_uuid,
api_url=cls.api_url,
api_auth_headers=cls.api_auth_headers,
get_scan_data_save_loc=get_scan_data_save_loc
)

other_license_expressions = summary.get('other_license_expressions', [])
other_license_expressions = [l['value'] for l in other_license_expressions if l['value']]
other_license_expression = combine_expressions(other_license_expressions)

copyright = ''
declared_holder = summary.get('declared_holder')
if declared_holder:
copyright = f'Copyright (c) {declared_holder}'

values_by_updateable_fields = {
'sha1': scan_info.sha1,
'sha256': scan_info.sha256,
'sha512': scan_info.sha512,
'summary': summary,
'declared_license_expression': summary.get('declared_license_expression'),
'other_license_expression': other_license_expression,
'copyright': copyright,
}

for field, value in values_by_updateable_fields.items():
p_val = getattr(package, field)
if not p_val and value:
setattr(package, field, value)
package_updated = True

if package_updated:
package.save()

other_license_expressions = summary.get('other_license_expressions', [])
other_license_expressions = [l['value'] for l in other_license_expressions]
other_license_expression = combine_expressions(other_license_expressions)

copyright = ''
declared_holder = summary.get('declared_holder')
if declared_holder:
copyright = f'Copyright (c) {declared_holder}'

values_by_updateable_fields = {
'sha1': scan_info.sha1,
'sha256': scan_info.sha256,
'sha512': scan_info.sha512,
'summary': summary,
'declared_license_expression': summary.get('declared_license_expression'),
'other_license_expression': other_license_expression,
'copyright': copyright,
}

for field, value in values_by_updateable_fields.items():
p_val = getattr(package, field)
if not p_val and value:
setattr(package, field, value)
package_updated = True

if package_updated:
package.save()

# TODO: We should rerun the specific indexers that have failed
if scan_index_errors:
scannable_uri.index_error = '\n'.join(scan_index_errors)
scannable_uri.scan_status = ScannableURI.SCAN_INDEX_FAILED
else:
scannable_uri.scan_status = ScannableURI.SCAN_INDEXED
except Exception as e:
error_message = str(e) + '\n'
# TODO: We should rerun the specific indexers that have failed
if scan_index_errors:
error_message += '\n'.join(scan_index_errors)
scannable_uri.index_error = error_message
scannable_uri.scan_status = ScannableURI.SCAN_INDEX_FAILED

scannable_uri.wip_date = None
scannable_uri.save()
Expand Down
Loading