From f16d3097cc7f83c45902bddfbe724976e3918d15 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Tue, 13 Jun 2023 13:19:39 -0700 Subject: [PATCH 01/15] Update ResourceViewSet queryset * Defer most package fields when getting a Resource * Remove ordering on Resources * Prefetch dependencies and parties for Packages Signed-off-by: Jono Yang --- packagedb/api.py | 47 +++++++++++++++++++++++++++++++++++++++++++-- packagedb/models.py | 1 - purldb/urls.py | 22 ++++++++++----------- 3 files changed, 56 insertions(+), 14 deletions(-) diff --git a/packagedb/api.py b/packagedb/api.py index 626517c5..01292ea8 100644 --- a/packagedb/api.py +++ b/packagedb/api.py @@ -78,7 +78,50 @@ class ResourceFilter(FilterSet): class ResourceViewSet(viewsets.ReadOnlyModelViewSet): - queryset = Resource.objects.prefetch_related('package') + queryset = Resource.objects.select_related('package').defer( + 'package__history', + 'package__md5', + 'package__sha1', + 'package__sha256', + 'package__sha512', + 'package__extra_data', + 'package__filename', + 'package__primary_language', + 'package__description', + 'package__release_date', + 'package__homepage_url', + 'package__download_url', + 'package__size', + 'package__bug_tracking_url', + 'package__code_view_url', + 'package__vcs_url', + 'package__repository_homepage_url', + 'package__repository_download_url', + 'package__api_data_url', + 'package__copyright', + 'package__holder', + 'package__declared_license_expression', + 'package__declared_license_expression_spdx', + 'package__license_detections', + 'package__other_license_expression', + 'package__other_license_expression_spdx', + 'package__other_license_detections', + 'package__extracted_license_statement', + 'package__notice_text', + 'package__datasource_id', + 'package__file_references', + 'package__last_modified_date', + 'package__mining_level', + 'package__keywords', + 'package__root_path', + 'package__source_packages', + 'package__last_indexed_date', + 'package__index_error', + 
'package__package_set', + 'package__package_content', + 'package__summary', + 'package__search_vector', + ) serializer_class = ResourceAPISerializer filterset_class = ResourceFilter lookup_field = 'sha1' @@ -182,7 +225,7 @@ class Meta: class PackageViewSet(viewsets.ReadOnlyModelViewSet): - queryset = Package.objects.all() + queryset = Package.objects.prefetch_related('dependencies', 'parties') serializer_class = PackageAPISerializer lookup_field = 'uuid' filterset_class = PackageFilter diff --git a/packagedb/models.py b/packagedb/models.py index 16ff482e..16fb6c75 100644 --- a/packagedb/models.py +++ b/packagedb/models.py @@ -909,7 +909,6 @@ class Meta: unique_together = ( ('package', 'path'), ) - ordering = ('package', 'path') indexes = [ models.Index(fields=['md5']), models.Index(fields=['sha1']), diff --git a/purldb/urls.py b/purldb/urls.py index da126ed3..590be7f1 100644 --- a/purldb/urls.py +++ b/purldb/urls.py @@ -8,7 +8,7 @@ # from django.conf.urls import include -from django.urls import re_path +from django.urls import path from django.views.generic import RedirectView from rest_framework import routers @@ -23,16 +23,16 @@ api_router = routers.DefaultRouter() -api_router.register(r'packages', PackageViewSet) -api_router.register(r'resources', ResourceViewSet) -api_router.register(r'approximate_directory_content_index', ApproximateDirectoryContentIndexViewSet) -api_router.register(r'approximate_directory_structure_index', ApproximateDirectoryStructureIndexViewSet) -api_router.register(r'exact_file_index', ExactFileIndexViewSet) -api_router.register(r'exact_package_archive_index', ExactPackageArchiveIndexViewSet) -api_router.register(r'cditems', CDitemViewSet, 'cditems') -api_router.register(r'on_demand_queue', PriorityResourceURIViewSet) +api_router.register('packages', PackageViewSet) +api_router.register('resources', ResourceViewSet) +api_router.register('approximate_directory_content_index', ApproximateDirectoryContentIndexViewSet) 
+api_router.register('approximate_directory_structure_index', ApproximateDirectoryStructureIndexViewSet) +api_router.register('exact_file_index', ExactFileIndexViewSet) +api_router.register('exact_package_archive_index', ExactPackageArchiveIndexViewSet) +api_router.register('cditems', CDitemViewSet, 'cditems') +api_router.register('on_demand_queue', PriorityResourceURIViewSet) urlpatterns = [ - re_path(r'^api/', include((api_router.urls, 'api'))), - re_path("", RedirectView.as_view(url="api/")), + path('api/', include((api_router.urls, 'api'))), + path("", RedirectView.as_view(url="api/")), ] From ebc8d4f8aedd8286420194d39bb480cf806e40aa Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Tue, 13 Jun 2023 13:28:40 -0700 Subject: [PATCH 02/15] Avoid adding None to license expression list Signed-off-by: Jono Yang --- minecode/management/commands/process_scans.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/minecode/management/commands/process_scans.py b/minecode/management/commands/process_scans.py index 0764b63a..28a7f206 100644 --- a/minecode/management/commands/process_scans.py +++ b/minecode/management/commands/process_scans.py @@ -80,7 +80,7 @@ def process_scan(cls, scannable_uri, get_scan_info_save_loc='', get_scan_data_sa ) other_license_expressions = summary.get('other_license_expressions', []) - other_license_expressions = [l['value'] for l in other_license_expressions] + other_license_expressions = [l['value'] for l in other_license_expressions if l['value']] other_license_expression = combine_expressions(other_license_expressions) copyright = '' From 6473e28f72c6729a96648268a1f76b532823c3e6 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Tue, 13 Jun 2023 18:26:53 -0700 Subject: [PATCH 03/15] Create MultipleCharInFilter Signed-off-by: Jono Yang --- matchcode/api.py | 22 ++++++++++++++++++++++ packagedb/api.py | 10 +++++----- 2 files changed, 27 insertions(+), 5 deletions(-) diff --git a/matchcode/api.py b/matchcode/api.py index 
f20dd166..092933e3 100644 --- a/matchcode/api.py +++ b/matchcode/api.py @@ -129,6 +129,28 @@ class MultipleCharFilter(MultipleChoiceFilter): field_class = MultipleCharField +# TODO: Think of a better name for this filter +class MultipleCharInFilter(MultipleCharFilter): + def filter(self, qs, value): + if not value: + # Even though not a noop, no point filtering if empty. + return qs + + if self.is_noop(qs, value): + return qs + + predicate = self.get_filter_predicate(value) + old_field_name = next(iter(predicate)) + new_field_name = f'{old_field_name}__in' + predicate[new_field_name] = predicate[old_field_name] + predicate.pop(old_field_name) + + q = Q(**predicate) + qs = self.get_method(qs)(q) + + return qs.distinct() if self.distinct else qs + + class MultipleSHA1Filter(MultipleCharFilter): """ Overrides `MultipleCharFilter.filter()` to convert the SHA1 diff --git a/packagedb/api.py b/packagedb/api.py index 01292ea8..48a0c278 100644 --- a/packagedb/api.py +++ b/packagedb/api.py @@ -14,7 +14,6 @@ from django_filters.filters import OrderingFilter import django_filters -from packagedcode.models import PackageData from packageurl import PackageURL from packageurl.contrib.django.utils import purl_to_lookups from rest_framework import status @@ -23,6 +22,7 @@ from rest_framework.response import Response from matchcode.api import MultipleCharFilter +from matchcode.api import MultipleCharInFilter # UnusedImport here! # But importing the mappers and visitors module triggers routes registration from minecode import visitors # NOQA @@ -69,10 +69,10 @@ def filter(self, qs, value): class ResourceFilter(FilterSet): package = PackageResourceUUIDFilter(label='Package UUID') purl = PackageResourcePurlFilter(label='Package pURL') - md5 = MultipleCharFilter( + md5 = MultipleCharInFilter( help_text="Exact MD5. Multi-value supported.", ) - sha1 = MultipleCharFilter( + sha1 = MultipleCharInFilter( help_text="Exact SHA1. 
Multi-value supported.", ) @@ -183,10 +183,10 @@ class PackageFilter(FilterSet): version = MultipleCharFilter( help_text="Exact version. Multi-value supported.", ) - md5 = MultipleCharFilter( + md5 = MultipleCharInFilter( help_text="Exact MD5. Multi-value supported.", ) - sha1 = MultipleCharFilter( + sha1 = MultipleCharInFilter( help_text="Exact SHA1. Multi-value supported.", ) purl = MultiplePackageURLFilter(label='Package URL') From 0c7f801a13e4d4157a0124bed73b5652988d7bdd Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Thu, 15 Jun 2023 18:06:16 -0700 Subject: [PATCH 04/15] Handle indexing exceptions * We want to make sure that ScannableURIs statuses are properly updated when indexing fails Signed-off-by: Jono Yang --- minecode/management/commands/process_scans.py | 101 +++++++++--------- 1 file changed, 52 insertions(+), 49 deletions(-) diff --git a/minecode/management/commands/process_scans.py b/minecode/management/commands/process_scans.py index 28a7f206..95d25e51 100644 --- a/minecode/management/commands/process_scans.py +++ b/minecode/management/commands/process_scans.py @@ -61,58 +61,61 @@ def process_scan(cls, scannable_uri, get_scan_info_save_loc='', get_scan_data_sa if scannable_uri.scan_status in (ScannableURI.SCAN_SUBMITTED, ScannableURI.SCAN_IN_PROGRESS): scannable_uri.scan_status = get_scan_status(scan_info) elif scannable_uri.scan_status in (ScannableURI.SCAN_COMPLETED,): - logger.info('Indexing scanned files for URI: {}'.format(scannable_uri)) - - package = scannable_uri.package - scan_data = scanning.get_scan_data( - scannable_uri.scan_uuid, - api_url=cls.api_url, - api_auth_headers=cls.api_auth_headers, - get_scan_data_save_loc=get_scan_data_save_loc - ) - scan_index_errors = index_package_files(package, scan_data) + try: + logger.info('Indexing scanned files for URI: {}'.format(scannable_uri)) + + package = scannable_uri.package + scan_data = scanning.get_scan_data( + scannable_uri.scan_uuid, + api_url=cls.api_url, +
api_auth_headers=cls.api_auth_headers, + get_scan_data_save_loc=get_scan_data_save_loc + ) + scan_index_errors = index_package_files(package, scan_data) - summary = scanning.get_scan_summary( - scannable_uri.scan_uuid, - api_url=cls.api_url, - api_auth_headers=cls.api_auth_headers, - get_scan_data_save_loc=get_scan_data_save_loc - ) + summary = scanning.get_scan_summary( + scannable_uri.scan_uuid, + api_url=cls.api_url, + api_auth_headers=cls.api_auth_headers, + get_scan_data_save_loc=get_scan_data_save_loc + ) + + other_license_expressions = summary.get('other_license_expressions', []) + other_license_expressions = [l['value'] for l in other_license_expressions if l['value']] + other_license_expression = combine_expressions(other_license_expressions) + + copyright = '' + declared_holder = summary.get('declared_holder') + if declared_holder: + copyright = f'Copyright (c) {declared_holder}' + + values_by_updateable_fields = { + 'sha1': scan_info.sha1, + 'sha256': scan_info.sha256, + 'sha512': scan_info.sha512, + 'summary': summary, + 'declared_license_expression': summary.get('declared_license_expression'), + 'other_license_expression': other_license_expression, + 'copyright': copyright, + } + + for field, value in values_by_updateable_fields.items(): + p_val = getattr(package, field) + if not p_val and value: + setattr(package, field, value) + package_updated = True + + if package_updated: + package.save() - other_license_expressions = summary.get('other_license_expressions', []) - other_license_expressions = [l['value'] for l in other_license_expressions if l['value']] - other_license_expression = combine_expressions(other_license_expressions) - - copyright = '' - declared_holder = summary.get('declared_holder') - if declared_holder: - copyright = f'Copyright (c) {declared_holder}' - - values_by_updateable_fields = { - 'sha1': scan_info.sha1, - 'sha256': scan_info.sha256, - 'sha512': scan_info.sha512, - 'summary': summary, - 'declared_license_expression': 
summary.get('declared_license_expression'), - 'other_license_expression': other_license_expression, - 'copyright': copyright, - } - - for field, value in values_by_updateable_fields.items(): - p_val = getattr(package, field) - if not p_val and value: - setattr(package, field, value) - package_updated = True - - if package_updated: - package.save() - - # TODO: We should rerun the specific indexers that have failed - if scan_index_errors: - scannable_uri.index_error = '\n'.join(scan_index_errors) - scannable_uri.scan_status = ScannableURI.SCAN_INDEX_FAILED - else: scannable_uri.scan_status = ScannableURI.SCAN_INDEXED + except Exception as e: + error_message = str(e) + '\n' + # TODO: We should rerun the specific indexers that have failed + if scan_index_errors: + error_message += '\n'.join(scan_index_errors) + scannable_uri.index_error + scannable_uri.scan_status = ScannableURI.SCAN_INDEX_FAILED scannable_uri.wip_date = None scannable_uri.save() From 0de3b24b9d1baee7b5785f170267eee6768d99ed Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Fri, 16 Jun 2023 18:31:14 -0700 Subject: [PATCH 05/15] Set API serializers to have read_only fields Signed-off-by: Jono Yang --- packagedb/serializers.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/packagedb/serializers.py b/packagedb/serializers.py index e1eda130..374dd7eb 100644 --- a/packagedb/serializers.py +++ b/packagedb/serializers.py @@ -61,6 +61,7 @@ class Meta: 'urls', 'extra_data', ) + read_only_fields = fields class ResourceMetadataSerializer(HyperlinkedModelSerializer): @@ -186,6 +187,7 @@ class Meta: 'dependencies', 'resources', ) + read_only_fields = fields def get_package_content(self, obj): return obj.get_package_content_display() From f6c7fbe393687973950b7d534193f3ef00fc2572 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Wed, 21 Jun 2023 14:52:25 -0700 Subject: [PATCH 06/15] Increase REQUEST_TIMEOUT to 15 Signed-off-by: Jono Yang --- minecode/management/scanning.py | 4 ++-- 1 file changed, 2 insertions(+), 2 
deletions(-) diff --git a/minecode/management/scanning.py b/minecode/management/scanning.py index f75f7a63..34d1979c 100644 --- a/minecode/management/scanning.py +++ b/minecode/management/scanning.py @@ -26,7 +26,7 @@ SLEEP_WHEN_EMPTY = 1 # in seconds -REQUEST_TIMEOUT = 3 +REQUEST_TIMEOUT = 15 # Only SCANCODEIO_URL can be provided through setting SCANCODEIO_URL = settings.SCANCODEIO_URL @@ -383,7 +383,7 @@ def process_scans(cls, exit_on_empty=False, max_uris=0, **kwargs): while True: # Wait before processing anything - time.sleep(REQUEST_TIMEOUT) + time.sleep(3) if cls.MUST_STOP: cls.logger.info('Graceful exit of the scan processing loop.') From f8d79b005628acd1ece4d0f30d35d037395a2a12 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Thu, 22 Jun 2023 12:31:09 -0700 Subject: [PATCH 07/15] Break out directory fingerprinting logic #69 Signed-off-by: Jono Yang --- .../src/matchcode_toolkit/fingerprinting.py | 46 +++++++++++-------- .../pipelines/scan_and_fingerprint_package.py | 4 +- .../matchcode_toolkit/plugin_fingerprint.py | 4 +- .../tests/test_fingerprinting.py | 6 +-- matchcode/indexing.py | 4 +- matchcode/tests/test_index_packages.py | 4 +- matchcode/tests/test_match.py | 4 +- matchcode/tests/test_models.py | 6 +-- 8 files changed, 43 insertions(+), 35 deletions(-) diff --git a/matchcode-toolkit/src/matchcode_toolkit/fingerprinting.py b/matchcode-toolkit/src/matchcode_toolkit/fingerprinting.py index 9c72ba5e..19e6a03b 100644 --- a/matchcode-toolkit/src/matchcode_toolkit/fingerprinting.py +++ b/matchcode-toolkit/src/matchcode_toolkit/fingerprinting.py @@ -69,30 +69,38 @@ def create_structure_fingerprint(directory, children): return _create_directory_fingerprint(features) -def compute_directory_fingerprints(codebase): +def compute_directory_fingerprints(directory, codebase): """ - Compute fingerprints for a directory from `codebase` + Compute fingerprints for `directory` from `codebase` """ - for resource in codebase.walk(topdown=False): - if resource.is_file or not 
resource.path: - continue - children = [r for r in resource.walk(codebase) if r.is_file] - if len(children) == 1: - continue + children = [r for r in directory.walk(codebase) if r.is_file] + if len(children) == 1: + return - directory_content_fingerprint = create_content_fingerprint(children) - if hasattr(resource, 'directory_content_fingerprint'): - resource.directory_content_fingerprint = directory_content_fingerprint - else: - resource.extra_data['directory_content'] = directory_content_fingerprint + directory_content_fingerprint = create_content_fingerprint(children) + if hasattr(directory, 'directory_content_fingerprint'): + directory.directory_content_fingerprint = directory_content_fingerprint + else: + directory.extra_data['directory_content'] = directory_content_fingerprint - directory_structure_fingerprint = create_structure_fingerprint(resource, children) - if hasattr(resource, 'directory_structure_fingerprint'): - resource.directory_structure_fingerprint = directory_structure_fingerprint - else: - resource.extra_data['directory_structure'] = create_structure_fingerprint(resource, children) + directory_structure_fingerprint = create_structure_fingerprint(directory, children) + if hasattr(directory, 'directory_structure_fingerprint'): + directory.directory_structure_fingerprint = directory_structure_fingerprint + else: + directory.extra_data['directory_structure'] = create_structure_fingerprint(directory, children) + + directory.save(codebase) + return directory - resource.save(codebase) + +def compute_codebase_directory_fingerprints(codebase): + """ + Compute fingerprints for directories from `codebase` + """ + for resource in codebase.walk(topdown=False): + if resource.is_file or not resource.path: + continue + _ = compute_directory_fingerprints(resource, codebase) return codebase diff --git a/matchcode-toolkit/src/matchcode_toolkit/pipelines/scan_and_fingerprint_package.py 
b/matchcode-toolkit/src/matchcode_toolkit/pipelines/scan_and_fingerprint_package.py index 842e5a95..7bc5e13b 100644 --- a/matchcode-toolkit/src/matchcode_toolkit/pipelines/scan_and_fingerprint_package.py +++ b/matchcode-toolkit/src/matchcode_toolkit/pipelines/scan_and_fingerprint_package.py @@ -20,7 +20,7 @@ # ScanCode.io is a free software code scanning tool from nexB Inc. and others. # Visit https://github.com/nexB/scancode.io for support and download. -from matchcode_toolkit.fingerprinting import compute_directory_fingerprints +from matchcode_toolkit.fingerprinting import compute_codebase_directory_fingerprints from scanpipe.pipelines.scan_package import ScanPackage from scanpipe.pipes.codebase import ProjectCodebase @@ -63,4 +63,4 @@ def fingerprint_codebase(self): Compute directory fingerprints for matching purposes """ project_codebase = ProjectCodebase(self.project) - compute_directory_fingerprints(project_codebase) + compute_codebase_directory_fingerprints(project_codebase) diff --git a/matchcode-toolkit/src/matchcode_toolkit/plugin_fingerprint.py b/matchcode-toolkit/src/matchcode_toolkit/plugin_fingerprint.py index fba3e7d1..d55e4bb7 100644 --- a/matchcode-toolkit/src/matchcode_toolkit/plugin_fingerprint.py +++ b/matchcode-toolkit/src/matchcode_toolkit/plugin_fingerprint.py @@ -11,7 +11,7 @@ from commoncode.cliutils import PluggableCommandLineOption from commoncode.cliutils import POST_SCAN_GROUP -from matchcode_toolkit.fingerprinting import compute_directory_fingerprints +from matchcode_toolkit.fingerprinting import compute_codebase_directory_fingerprints from plugincode.post_scan import post_scan_impl from plugincode.post_scan import PostScanPlugin @@ -41,4 +41,4 @@ def is_enabled(self, fingerprint, **kwargs): return fingerprint def process_codebase(self, codebase, **kwargs): - codebase = compute_directory_fingerprints(codebase) + codebase = compute_codebase_directory_fingerprints(codebase) diff --git a/matchcode-toolkit/tests/test_fingerprinting.py 
b/matchcode-toolkit/tests/test_fingerprinting.py index 761691f5..5b37de8f 100644 --- a/matchcode-toolkit/tests/test_fingerprinting.py +++ b/matchcode-toolkit/tests/test_fingerprinting.py @@ -14,7 +14,7 @@ from matchcode_toolkit.fingerprinting import _create_directory_fingerprint from matchcode_toolkit.fingerprinting import _get_resource_subpath -from matchcode_toolkit.fingerprinting import compute_directory_fingerprints +from matchcode_toolkit.fingerprinting import compute_codebase_directory_fingerprints from matchcode_toolkit.fingerprinting import create_content_fingerprint from matchcode_toolkit.fingerprinting import create_halohash_chunks from matchcode_toolkit.fingerprinting import create_structure_fingerprint @@ -95,10 +95,10 @@ def test_create_halohash_chunks(self): self.assertEqual(chunk3, expected_chunk3) self.assertEqual(chunk4, expected_chunk4) - def test_compute_directory_fingerprints(self): + def test_compute_codebase_directory_fingerprints(self): scan_loc = self.get_test_loc('abbrev-1.0.3-i.json') vc = VirtualCodebase(location=scan_loc) - vc = compute_directory_fingerprints(vc) + vc = compute_codebase_directory_fingerprints(vc) directory_content = vc.root.extra_data['directory_content'] directory_structure = vc.root.extra_data['directory_structure'] expected_directory_content = '0000000346ce04751a3c98f00086f16a91d9790b' diff --git a/matchcode/indexing.py b/matchcode/indexing.py index e4289031..107dd698 100644 --- a/matchcode/indexing.py +++ b/matchcode/indexing.py @@ -13,7 +13,7 @@ from commoncode.resource import VirtualCodebase -from matchcode_toolkit.fingerprinting import compute_directory_fingerprints +from matchcode_toolkit.fingerprinting import compute_codebase_directory_fingerprints from matchcode.models import ApproximateDirectoryContentIndex from matchcode.models import ApproximateDirectoryStructureIndex from matchcode.models import ExactPackageArchiveIndex @@ -150,5 +150,5 @@ def index_package_directories(package): if not vc: return 0, 0 - vc 
= compute_directory_fingerprints(vc) + vc = compute_codebase_directory_fingerprints(vc) return index_directory_fingerprints(vc, package) diff --git a/matchcode/tests/test_index_packages.py b/matchcode/tests/test_index_packages.py index f6850d33..c1db948c 100644 --- a/matchcode/tests/test_index_packages.py +++ b/matchcode/tests/test_index_packages.py @@ -11,7 +11,7 @@ from commoncode.resource import VirtualCodebase -from matchcode_toolkit.fingerprinting import compute_directory_fingerprints +from matchcode_toolkit.fingerprinting import compute_codebase_directory_fingerprints from matchcode_toolkit.fingerprinting import hexstring_to_binarray from matchcode.indexing import _create_virtual_codebase_from_package_resources from matchcode.indexing import index_directory_fingerprints @@ -155,7 +155,7 @@ def test__create_virtual_codebase_from_package_resources(self): def test_index_directory_fingerprints(self): vc = _create_virtual_codebase_from_package_resources(self.test_package1) - vc = compute_directory_fingerprints(vc) + vc = compute_codebase_directory_fingerprints(vc) # Ensure tables are empty prior to indexing self.assertFalse(ApproximateDirectoryContentIndex.objects.all()) diff --git a/matchcode/tests/test_match.py b/matchcode/tests/test_match.py index 0488edcb..3b12e7d2 100644 --- a/matchcode/tests/test_match.py +++ b/matchcode/tests/test_match.py @@ -13,7 +13,7 @@ from commoncode.resource import VirtualCodebase from packagedb.models import Package -from matchcode_toolkit.fingerprinting import compute_directory_fingerprints +from matchcode_toolkit.fingerprinting import compute_codebase_directory_fingerprints from matchcode.management.commands.index_packages import index_package_directories from matchcode.match import EXACT_PACKAGE_ARCHIVE_MATCH from matchcode.match import APPROXIMATE_DIRECTORY_STRUCTURE_MATCH @@ -37,7 +37,7 @@ def run_do_match_from_scan(scan_file_location, match_type): matched_to=attr.ib(default=attr.Factory(list)) ) ) - vc = 
compute_directory_fingerprints(vc) + vc = compute_codebase_directory_fingerprints(vc) do_match(vc, match_type) return vc diff --git a/matchcode/tests/test_models.py b/matchcode/tests/test_models.py index ae6b9e06..5e5744f7 100644 --- a/matchcode/tests/test_models.py +++ b/matchcode/tests/test_models.py @@ -13,7 +13,7 @@ from packagedb.models import Package import attr -from matchcode_toolkit.fingerprinting import compute_directory_fingerprints +from matchcode_toolkit.fingerprinting import compute_codebase_directory_fingerprints from matchcode_toolkit.fingerprinting import hexstring_to_binarray from matchcode.management.commands.index_packages import index_package_directories from matchcode.models import ApproximateDirectoryContentIndex @@ -169,7 +169,7 @@ def test_ApproximateDirectoryStructureIndex_match_subdir(self): location=scan_location, resource_attributes=dict(packages=attr.ib(default=attr.Factory(list))) ) - codebase = compute_directory_fingerprints(vc) + codebase = compute_codebase_directory_fingerprints(vc) # populate codebase with match results for resource in codebase.walk(topdown=True): @@ -192,7 +192,7 @@ def test_ApproximateDirectoryContentIndex_match_subdir(self): location=scan_location, resource_attributes=dict(packages=attr.ib(default=attr.Factory(list))) ) - codebase = compute_directory_fingerprints(vc) + codebase = compute_codebase_directory_fingerprints(vc) # populate codebase with match results for resource in codebase.walk(topdown=True): From 533bdb2a5b3be557e128772ff0e176f8e316ae1e Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Thu, 22 Jun 2023 18:00:16 -0700 Subject: [PATCH 08/15] Update compute_directory_fingerprints #118 Signed-off-by: Jono Yang --- matchcode-toolkit/CHANGELOG.rst | 5 +++++ matchcode-toolkit/pyproject.toml | 2 +- matchcode-toolkit/setup.cfg | 2 +- .../src/matchcode_toolkit/fingerprinting.py | 17 ++++++++++++++--- 4 files changed, 21 insertions(+), 5 deletions(-) diff --git a/matchcode-toolkit/CHANGELOG.rst 
b/matchcode-toolkit/CHANGELOG.rst index 41cf3e53..123c5fd8 100644 --- a/matchcode-toolkit/CHANGELOG.rst +++ b/matchcode-toolkit/CHANGELOG.rst @@ -1,6 +1,11 @@ Changelog ========= +v1.1.0 +------ + +*2023-06-22* -- Rename ``compute_directory_fingerprints`` to ``compute_codebase_directory_fingerprints`` and create a new version of ``compute_directory_fingerprints`` that works on Resource objects instead of codebases. + v1.0.0 ------ diff --git a/matchcode-toolkit/pyproject.toml b/matchcode-toolkit/pyproject.toml index 8c075d6c..6c493f2e 100644 --- a/matchcode-toolkit/pyproject.toml +++ b/matchcode-toolkit/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "matchcode-toolkit" -version = "1.0.0" +version = "1.1.0" [build-system] requires = ["setuptools >= 50", "wheel", "setuptools_scm[toml] >= 6"] diff --git a/matchcode-toolkit/setup.cfg b/matchcode-toolkit/setup.cfg index eb9c6162..7058803d 100644 --- a/matchcode-toolkit/setup.cfg +++ b/matchcode-toolkit/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = matchcode-toolkit -version = 1.0.0 +version = 1.1.0 license = Apache-2.0 # description must be on ONE line https://github.com/pypa/setuptools/issues/1390 diff --git a/matchcode-toolkit/src/matchcode_toolkit/fingerprinting.py b/matchcode-toolkit/src/matchcode_toolkit/fingerprinting.py index 19e6a03b..1d9df4a6 100644 --- a/matchcode-toolkit/src/matchcode_toolkit/fingerprinting.py +++ b/matchcode-toolkit/src/matchcode_toolkit/fingerprinting.py @@ -69,7 +69,7 @@ def create_structure_fingerprint(directory, children): return _create_directory_fingerprint(features) -def compute_directory_fingerprints(directory, codebase): +def _compute_directory_fingerprints(directory, codebase): """ Compute fingerprints for `directory` from `codebase` """ @@ -87,12 +87,23 @@ def compute_directory_fingerprints(directory, codebase): if hasattr(directory, 'directory_structure_fingerprint'): directory.directory_structure_fingerprint = directory_structure_fingerprint else: - 
directory.extra_data['directory_structure'] = create_structure_fingerprint(directory, children) + directory.extra_data['directory_structure'] = directory_structure_fingerprint directory.save(codebase) return directory +def compute_directory_fingerprints(directory, codebase): + """ + Recursivly compute fingerprints for `directory` from `codebase` + """ + for resource in directory.walk(codebase, topdown=False): + if resource.is_file: + continue + _ = _compute_directory_fingerprints(resource, codebase) + return directory + + def compute_codebase_directory_fingerprints(codebase): """ Compute fingerprints for directories from `codebase` @@ -100,7 +111,7 @@ def compute_codebase_directory_fingerprints(codebase): for resource in codebase.walk(topdown=False): if resource.is_file or not resource.path: continue - _ = compute_directory_fingerprints(resource, codebase) + _ = _compute_directory_fingerprints(resource, codebase) return codebase From 6ddfb0f4cd6e283ea5e70a6b5d829ad71a50414a Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Mon, 26 Jun 2023 15:12:02 -0700 Subject: [PATCH 09/15] Bump matchcode-toolkit dep version Signed-off-by: Jono Yang --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 76e5bc30..5fdcf8a2 100644 --- a/setup.cfg +++ b/setup.cfg @@ -55,7 +55,7 @@ install_requires = rubymarshal == 1.0.3 scancode-toolkit[full] == 32.0.1 urlpy == 0.5 - matchcode-toolkit == 1.0.0 + matchcode-toolkit == 1.1.0 setup_requires = setuptools_scm[toml] >= 4 python_requires = >=3.8 From 8768c0dcc4877e427cda48d3b639c306f9d566cd Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Mon, 26 Jun 2023 15:25:04 -0700 Subject: [PATCH 10/15] Add expectedFailure tag to test Signed-off-by: Jono Yang --- packagedb/tests/test_api.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/packagedb/tests/test_api.py b/packagedb/tests/test_api.py index 7fecd9ba..e39d0542 100644 --- a/packagedb/tests/test_api.py +++ b/packagedb/tests/test_api.py @@ 
-7,6 +7,7 @@ # See https://aboutcode.org for more information about nexB OSS projects. # +from unittest.case import expectedFailure from uuid import uuid4 import json import os @@ -336,6 +337,9 @@ def test_api_package_latest_version_action(self): response = self.client.get(reverse('api:package-latest-version', args=[p3.uuid])) self.assertEqual('3.0', response.data['version']) + # We removed the constraint on (package, path) ordering for possible + # performance increase, but may add it back in + @expectedFailure def test_api_package_resources_action(self): # create 10 resources for i in range(0, 10): From 69e34b448028fda026476393bbd19028d77df1ad Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Wed, 28 Jun 2023 12:05:41 -0700 Subject: [PATCH 11/15] Declare scan_index_errors outside of try block Signed-off-by: Jono Yang --- minecode/management/commands/process_scans.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/minecode/management/commands/process_scans.py b/minecode/management/commands/process_scans.py index 95d25e51..5a55c1a7 100644 --- a/minecode/management/commands/process_scans.py +++ b/minecode/management/commands/process_scans.py @@ -61,6 +61,7 @@ def process_scan(cls, scannable_uri, get_scan_info_save_loc='', get_scan_data_sa if scannable_uri.scan_status in (ScannableURI.SCAN_SUBMITTED, ScannableURI.SCAN_IN_PROGRESS): scannable_uri.scan_status = get_scan_status(scan_info) elif scannable_uri.scan_status in (ScannableURI.SCAN_COMPLETED,): + scan_index_errors = [] try: logger.info('Indexing scanned files for URI: {}'.format(scannable_uri)) @@ -71,7 +72,7 @@ def process_scan(cls, scannable_uri, get_scan_info_save_loc='', get_scan_data_sa api_auth_headers=cls.api_auth_headers, get_scan_data_save_loc=get_scan_data_save_loc ) - scan_index_errors = index_package_files(package, scan_data) + scan_index_errors.extend(index_package_files(package, scan_data)) summary = scanning.get_scan_summary( scannable_uri.scan_uuid, From 
1a1c123780b79fc88e21e42b6b3a588c7330684c Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Wed, 28 Jun 2023 13:06:17 -0700 Subject: [PATCH 12/15] Increase timeout when getting large scans * Increase default timeout to 2 minutes Signed-off-by: Jono Yang --- minecode/management/commands/process_scans.py | 7 +++ minecode/management/scanning.py | 46 +++++++++++++++---- 2 files changed, 45 insertions(+), 8 deletions(-) diff --git a/minecode/management/commands/process_scans.py b/minecode/management/commands/process_scans.py index 5a55c1a7..87235bac 100644 --- a/minecode/management/commands/process_scans.py +++ b/minecode/management/commands/process_scans.py @@ -66,10 +66,17 @@ def process_scan(cls, scannable_uri, get_scan_info_save_loc='', get_scan_data_sa logger.info('Indexing scanned files for URI: {}'.format(scannable_uri)) package = scannable_uri.package + input_size = scan_info.size + if input_size: + computed_timeout = ((input_size / 1000000) / 2) * 60 + timeout = max(computed_timeout, scanning.REQUEST_TIMEOUT) + else: + timeout = scanning.REQUEST_TIMEOUT scan_data = scanning.get_scan_data( scannable_uri.scan_uuid, api_url=cls.api_url, api_auth_headers=cls.api_auth_headers, + timeout=timeout, get_scan_data_save_loc=get_scan_data_save_loc ) scan_index_errors.extend(index_package_files(package, scan_data)) diff --git a/minecode/management/scanning.py b/minecode/management/scanning.py index 34d1979c..e7a6e4a2 100644 --- a/minecode/management/scanning.py +++ b/minecode/management/scanning.py @@ -26,7 +26,7 @@ SLEEP_WHEN_EMPTY = 1 # in seconds -REQUEST_TIMEOUT = 15 +REQUEST_TIMEOUT = 120 # Only SCANCODEIO_URL can be provided through setting SCANCODEIO_URL = settings.SCANCODEIO_URL @@ -80,6 +80,7 @@ class Scan(object): sha512 = attr.ib(default=None) sha1_git = attr.ib(default=None) filename = attr.ib(default=None) + size = attr.ib(default=None) @classmethod def from_response(cls, url, uuid, runs, input_sources, extra_data={}, **kwargs): @@ -107,6 +108,7 @@ def 
from_response(cls, url, uuid, runs, input_sources, extra_data={}, **kwargs): sha512 = extra_data.get('sha512') sha1_git = extra_data.get('sha1_git') filename = extra_data.get('filename') + size = extra_data.get('size') return Scan( url=url, uuid=uuid, run_uuid=run_uuid, uri=uri, @@ -114,7 +116,7 @@ def from_response(cls, url, uuid, runs, input_sources, extra_data={}, **kwargs): task_end_date=task_end_date, task_exitcode=task_exitcode, status=status, execution_time=execution_time, md5=md5, sha1=sha1, sha256=sha256, sha512=sha512, - sha1_git=sha1_git, filename=filename + sha1_git=sha1_git, filename=filename, size=size ) @property @@ -243,8 +245,13 @@ def get_scan_url(scan_uuid, api_url=SCANCODEIO_API_URL_PROJECTS, suffix=''): return url -def _call_scan_get_api(scan_uuid, endpoint='', - api_url=SCANCODEIO_API_URL_PROJECTS, api_auth_headers=SCANCODEIO_AUTH_HEADERS): +def _call_scan_get_api( + scan_uuid, + endpoint='', + api_url=SCANCODEIO_API_URL_PROJECTS, + api_auth_headers=SCANCODEIO_AUTH_HEADERS, + timeout=REQUEST_TIMEOUT, +): """ Send a get request to the scan API for `scan_uuid` and return response mapping from a JSON response. Call either the plain scan enpoint or the data @@ -252,7 +259,7 @@ def _call_scan_get_api(scan_uuid, endpoint='', exception on error. """ scan_url = get_scan_url(scan_uuid, api_url=api_url, suffix=endpoint) - response = requests.get(url=scan_url, timeout=REQUEST_TIMEOUT, headers=api_auth_headers) + response = requests.get(url=scan_url, timeout=timeout, headers=api_auth_headers) if not response.ok: response.raise_for_status() return response.json() @@ -262,13 +269,20 @@ def _get_scan_info( scan_uuid, api_url=SCANCODEIO_API_URL_PROJECTS, api_auth_headers=SCANCODEIO_AUTH_HEADERS, + timeout=REQUEST_TIMEOUT, get_scan_info_save_loc='' ): """ Return a mapping of project info for `scan_uuid` fetched from ScanCode.io or None. Raise an exception on error. 
""" - results = _call_scan_get_api(scan_uuid, endpoint='', api_url=api_url, api_auth_headers=api_auth_headers) + results = _call_scan_get_api( + scan_uuid, + endpoint='', + api_url=api_url, + api_auth_headers=api_auth_headers, + timeout=timeout + ) if get_scan_info_save_loc: with open(get_scan_info_save_loc, 'w') as f: json.dump(results, f) @@ -279,6 +293,7 @@ def get_scan_info( scan_uuid, api_url=SCANCODEIO_API_URL_PROJECTS, api_auth_headers=SCANCODEIO_AUTH_HEADERS, + timeout=REQUEST_TIMEOUT, get_scan_info_save_loc='' ): """ @@ -289,6 +304,7 @@ def get_scan_info( scan_uuid=scan_uuid, api_url=api_url, api_auth_headers=api_auth_headers, + timeout=timeout, get_scan_info_save_loc=get_scan_info_save_loc, ) return Scan.from_response(**results) @@ -298,6 +314,7 @@ def get_scan_data( scan_uuid, api_url=SCANCODEIO_API_URL_PROJECTS, api_auth_headers=SCANCODEIO_AUTH_HEADERS, + timeout=REQUEST_TIMEOUT, get_scan_data_save_loc='' ): """ @@ -305,7 +322,13 @@ def get_scan_data( ScanCode.io or None. Raise an exception on error. """ # FIXME: we should return a temp location instead - results = _call_scan_get_api(scan_uuid, endpoint='results', api_url=api_url, api_auth_headers=api_auth_headers) + results = _call_scan_get_api( + scan_uuid, + endpoint='results', + api_url=api_url, + api_auth_headers=api_auth_headers, + timeout=timeout + ) if get_scan_data_save_loc: with open(get_scan_data_save_loc, 'w') as f: json.dump(results, f) @@ -316,6 +339,7 @@ def get_scan_summary( scan_uuid, api_url=SCANCODEIO_API_URL_PROJECTS, api_auth_headers=SCANCODEIO_AUTH_HEADERS, + timeout=REQUEST_TIMEOUT, get_scan_data_save_loc='' ): """ @@ -323,7 +347,13 @@ def get_scan_summary( ScanCode.io or None. Raise an exception on error. 
""" # FIXME: we should return a temp location instead - results = _call_scan_get_api(scan_uuid, endpoint='summary', api_url=api_url, api_auth_headers=api_auth_headers) + results = _call_scan_get_api( + scan_uuid, + endpoint='summary', + api_url=api_url, + api_auth_headers=api_auth_headers, + timeout=timeout + ) if get_scan_data_save_loc: with open(get_scan_data_save_loc, 'w') as f: json.dump(results, f) From f126ab7a1aa84adbd2e81ef1f449581d1a012c4e Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Thu, 29 Jun 2023 11:57:36 -0700 Subject: [PATCH 13/15] Update matchcode-toolkit #95 * Don't add empty files to directory fingerprints * Bump matchcode-toolkit version Signed-off-by: Jono Yang --- matchcode-toolkit/CHANGELOG.rst | 5 +++++ matchcode-toolkit/pyproject.toml | 2 +- matchcode-toolkit/setup.cfg | 2 +- matchcode-toolkit/src/matchcode_toolkit/fingerprinting.py | 3 ++- minecode/tests/test_scanning.py | 1 + setup.cfg | 2 +- 6 files changed, 11 insertions(+), 4 deletions(-) diff --git a/matchcode-toolkit/CHANGELOG.rst b/matchcode-toolkit/CHANGELOG.rst index 123c5fd8..ee235eea 100644 --- a/matchcode-toolkit/CHANGELOG.rst +++ b/matchcode-toolkit/CHANGELOG.rst @@ -1,6 +1,11 @@ Changelog ========= +v1.1.1 +------ + +*2023-06-29* -- Do not include empty files when computing directory fingerprints. 
+ v1.1.0 ------ diff --git a/matchcode-toolkit/pyproject.toml b/matchcode-toolkit/pyproject.toml index 6c493f2e..b64839f2 100644 --- a/matchcode-toolkit/pyproject.toml +++ b/matchcode-toolkit/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "matchcode-toolkit" -version = "1.1.0" +version = "1.1.1" [build-system] requires = ["setuptools >= 50", "wheel", "setuptools_scm[toml] >= 6"] diff --git a/matchcode-toolkit/setup.cfg b/matchcode-toolkit/setup.cfg index 7058803d..9d0f6c84 100644 --- a/matchcode-toolkit/setup.cfg +++ b/matchcode-toolkit/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = matchcode-toolkit -version = 1.1.0 +version = 1.1.1 license = Apache-2.0 # description must be on ONE line https://github.com/pypa/setuptools/issues/1390 diff --git a/matchcode-toolkit/src/matchcode_toolkit/fingerprinting.py b/matchcode-toolkit/src/matchcode_toolkit/fingerprinting.py index 1d9df4a6..943e6f97 100644 --- a/matchcode-toolkit/src/matchcode_toolkit/fingerprinting.py +++ b/matchcode-toolkit/src/matchcode_toolkit/fingerprinting.py @@ -73,7 +73,8 @@ def _compute_directory_fingerprints(directory, codebase): """ Compute fingerprints for `directory` from `codebase` """ - children = [r for r in directory.walk(codebase) if r.is_file] + # We do not want to add empty files to our fingerprint + children = [r for r in directory.walk(codebase) if r.is_file and r.size] if len(children) == 1: return diff --git a/minecode/tests/test_scanning.py b/minecode/tests/test_scanning.py index 1099f4d4..a5203740 100644 --- a/minecode/tests/test_scanning.py +++ b/minecode/tests/test_scanning.py @@ -150,6 +150,7 @@ def testscanning_get_scan_info(self, mock_get): sha512='4431f237bcdfee5d2b86b1b3f01c8abaa160d5b7007c63e6281845a3f920d89fdb2e4044f97694ddef91e174d9dd30e5016bbad46eec2d68af200a47e9cedd85', sha1_git='ad18d88bdae8449e7c170f8e7db1bfe336dbb4e0', filename='wagon-api-20040705.181715.jar', + size=47069, ) expected = attr.asdict(expected) result = attr.asdict(result) diff --git a/setup.cfg b/setup.cfg 
index 5fdcf8a2..681b6fe7 100644 --- a/setup.cfg +++ b/setup.cfg @@ -55,7 +55,7 @@ install_requires = rubymarshal == 1.0.3 scancode-toolkit[full] == 32.0.1 urlpy == 0.5 - matchcode-toolkit == 1.1.0 + matchcode-toolkit == 1.1.1 setup_requires = setuptools_scm[toml] >= 4 python_requires = >=3.8 From a0ba04fa5e5028526b0e565087d4efedc5522ee2 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Thu, 29 Jun 2023 12:34:12 -0700 Subject: [PATCH 14/15] Order Resources by id Signed-off-by: Jono Yang --- packagedb/models.py | 1 + packagedb/tests/test_api.py | 3 --- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/packagedb/models.py b/packagedb/models.py index 16fb6c75..e940e543 100644 --- a/packagedb/models.py +++ b/packagedb/models.py @@ -909,6 +909,7 @@ class Meta: unique_together = ( ('package', 'path'), ) + ordering = ('id',) indexes = [ models.Index(fields=['md5']), models.Index(fields=['sha1']), diff --git a/packagedb/tests/test_api.py b/packagedb/tests/test_api.py index e39d0542..1bb36ee4 100644 --- a/packagedb/tests/test_api.py +++ b/packagedb/tests/test_api.py @@ -337,9 +337,6 @@ def test_api_package_latest_version_action(self): response = self.client.get(reverse('api:package-latest-version', args=[p3.uuid])) self.assertEqual('3.0', response.data['version']) - # We removed the constraint on (package, path) ordering for possible - # performance increase, but may add it back in - @expectedFailure def test_api_package_resources_action(self): # create 10 resources for i in range(0, 10): From 798dd8738dfbdcbbdc5edc9f6a8ebdde57fc1174 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Thu, 29 Jun 2023 14:50:19 -0700 Subject: [PATCH 15/15] Order results by URI Signed-off-by: Jono Yang --- minecode/tests/test_migrations.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/minecode/tests/test_migrations.py b/minecode/tests/test_migrations.py index ac9961a3..2f930856 100644 --- a/minecode/tests/test_migrations.py +++ 
b/minecode/tests/test_migrations.py @@ -90,16 +90,9 @@ def test_populate_has_error_fields(self): "map_error", "has_visit_error", "visit_error", - ).all() + ).order_by('uri') ) expected = [ - { - 'has_map_error': False, - 'has_visit_error': False, - 'map_error': None, - 'uri': 'http://example.com/4', - 'visit_error': None - }, { 'has_map_error': True, 'has_visit_error': True, @@ -120,7 +113,14 @@ def test_populate_has_error_fields(self): 'map_error': 'error', 'uri': 'http://example.com/3', 'visit_error': None - } + }, + { + 'has_map_error': False, + 'has_visit_error': False, + 'map_error': None, + 'uri': 'http://example.com/4', + 'visit_error': None + }, ] self.assertEquals(results, expected)