diff --git a/matchcode-toolkit/CHANGELOG.rst b/matchcode-toolkit/CHANGELOG.rst index ee235eea..541ddfa8 100644 --- a/matchcode-toolkit/CHANGELOG.rst +++ b/matchcode-toolkit/CHANGELOG.rst @@ -1,6 +1,11 @@ Changelog ========= +v1.1.2 +------ + +*2023-08-02* -- Update ``scan_and_fingerprint_package`` pipeline to use new directory fingerprinting functions from scancode.io. + v1.1.1 ------ diff --git a/matchcode-toolkit/setup.cfg b/matchcode-toolkit/setup.cfg index 9d0f6c84..4cfec167 100644 --- a/matchcode-toolkit/setup.cfg +++ b/matchcode-toolkit/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = matchcode-toolkit -version = 1.1.1 +version = 1.1.2 license = Apache-2.0 # description must be on ONE line https://github.com/pypa/setuptools/issues/1390 diff --git a/matchcode-toolkit/src/matchcode_toolkit/pipelines/scan_and_fingerprint_package.py b/matchcode-toolkit/src/matchcode_toolkit/pipelines/scan_and_fingerprint_package.py index 7bc5e13b..0f15d13a 100644 --- a/matchcode-toolkit/src/matchcode_toolkit/pipelines/scan_and_fingerprint_package.py +++ b/matchcode-toolkit/src/matchcode_toolkit/pipelines/scan_and_fingerprint_package.py @@ -20,10 +20,8 @@ # ScanCode.io is a free software code scanning tool from nexB Inc. and others. # Visit https://github.com/nexB/scancode.io for support and download. 
-from matchcode_toolkit.fingerprinting import compute_codebase_directory_fingerprints - from scanpipe.pipelines.scan_package import ScanPackage -from scanpipe.pipes.codebase import ProjectCodebase +from scanpipe.pipes import matchcode class ScanAndFingerprintPackage(ScanPackage): @@ -62,5 +60,4 @@ def fingerprint_codebase(self): """ Compute directory fingerprints for matching purposes """ - project_codebase = ProjectCodebase(self.project) - compute_codebase_directory_fingerprints(project_codebase) + matchcode.fingerprint_codebase_directories(self.project) diff --git a/minecode/utils_test.py b/minecode/utils_test.py index 86731b4c..9da25fd1 100644 --- a/minecode/utils_test.py +++ b/minecode/utils_test.py @@ -25,6 +25,7 @@ from django.db.migrations.executor import MigrationExecutor from django.test import TestCase as DjangoTestCase from rest_framework.utils.serializer_helpers import ReturnDict +from rest_framework.utils.serializer_helpers import ReturnList from commoncode.testcase import FileBasedTesting from scancode.cli_test_utils import purl_with_fake_uuid @@ -206,13 +207,13 @@ def _normalize_results(self, data, fields_to_remove=[]): with `purl_with_fake_uuid()` and fields from `fields_to_remove` have been removed from `data`. 
""" - if type(data) == list: + if type(data) in (list, ReturnList): return [self._normalize_results(entry, fields_to_remove) for entry in data] if type(data) in (dict, OrderedDict, ReturnDict): normalized_data = {} for key, value in data.items(): - if type(value) in [list, dict, OrderedDict, ReturnDict]: + if type(value) in (list, ReturnList, dict, OrderedDict, ReturnDict): value = self._normalize_results(value, fields_to_remove) if ( key in ("package_uid", "dependency_uid", "for_package_uid") @@ -229,7 +230,7 @@ def _normalize_results(self, data, fields_to_remove=[]): return data def _remove_fields_from_results(self, data, fields_to_remove): - if type(data) == list: + if type(data) in (list, ReturnList): return [self._remove_fields_from_results(entry, fields_to_remove) for entry in data] if type(data) in (dict, OrderedDict, ReturnDict): diff --git a/packagedb/api.py b/packagedb/api.py index 330eb723..43df7aa3 100644 --- a/packagedb/api.py +++ b/packagedb/api.py @@ -87,6 +87,63 @@ class ResourceViewSet(viewsets.ReadOnlyModelViewSet): filterset_class = ResourceFilter lookup_field = 'sha1' + @action(detail=False, methods=['post']) + def filter_by_checksums(self, request, *args, **kwargs): + """ + Take a mapping, where the keys are the names of the checksum algorithms + and the values are lists of checksum values and query those values + against the packagedb. + + Supported checksum fields are: + - md5 + - sha1 + + Example: + { + "sha1": [ + "b55fd82f80cc1bd0bdabf9c6e3153788d35d7911", + "27afff2610b5a94274a2311f8b15e514446b0e76" + ] + } + + Multiple checksum algorithms can be passed together: + { + "sha1": [ + "b55fd82f80cc1bd0bdabf9c6e3153788d35d7911", + "27afff2610b5a94274a2311f8b15e514446b0e76" + ], + "md5": [ + "e927df60b093456d4e611ae235c1aa5b" + ] + } + + This will return Resources whose sha1 or md5 matches those values. 
+ """ + data = dict(request.data) + unsupported_fields = [] + for field, value in data.items(): + if field not in ('md5', 'sha1'): + unsupported_fields.append(field) + + if unsupported_fields: + unsupported_fields_str = ', '.join(unsupported_fields) + response_data = { + 'status': f'Unsupported field(s) given: {unsupported_fields_str}' + } + return Response(response_data) + + q = Q() + for field, value in data.items(): + # We create this intermediate dictionary so we can modify the field + # name to have __in at the end + d = {f'{field}__in': value} + q |= Q(**d) + + qs = Resource.objects.filter(q) + paginated_qs = self.paginate_queryset(qs) + serializer = ResourceAPISerializer(paginated_qs, many=True, context={'request': request}) + return self.get_paginated_response(serializer.data) + class MultiplePackageURLFilter(Filter): def filter(self, qs, value): @@ -218,7 +275,7 @@ def resources(self, request, *args, **kwargs): paginated_qs = self.paginate_queryset(qs) serializer = ResourceAPISerializer(paginated_qs, many=True, context={'request': request}) - return Response(serializer.data) + return self.get_paginated_response(serializer.data) @action(detail=False) def get_package(self, request, *args, **kwargs): @@ -409,6 +466,65 @@ def _reindex_package(package, reindexed_packages): } return Response(response_data) + @action(detail=False, methods=['post']) + def filter_by_checksums(self, request, *args, **kwargs): + """ + Take a mapping, where the keys are the names of the checksum algorithms + and the values are lists of checksum values and query those values + against the packagedb. 
+ + Supported checksum fields are: + - md5 + - sha1 + - sha256 + - sha512 + + Example: + { + "sha1": [ + "b55fd82f80cc1bd0bdabf9c6e3153788d35d7911", + "27afff2610b5a94274a2311f8b15e514446b0e76" + ] + } + + Multiple checksum algorithms can be passed together: + { + "sha1": [ + "b55fd82f80cc1bd0bdabf9c6e3153788d35d7911", + "27afff2610b5a94274a2311f8b15e514446b0e76" + ], + "md5": [ + "e927df60b093456d4e611ae235c1aa5b" + ] + } + + This will return Packages whose sha1 or md5 matches those values. + """ + data = dict(request.data) + unsupported_fields = [] + for field, value in data.items(): + if field not in ('md5', 'sha1', 'sha256', 'sha512'): + unsupported_fields.append(field) + + if unsupported_fields: + unsupported_fields_str = ', '.join(unsupported_fields) + response_data = { + 'status': f'Unsupported field(s) given: {unsupported_fields_str}' + } + return Response(response_data) + + q = Q() + for field, value in data.items(): + # We create this intermediate dictionary so we can modify the field + # name to have __in at the end + d = {f'{field}__in': value} + q |= Q(**d) + + qs = Package.objects.filter(q) + paginated_qs = self.paginate_queryset(qs) + serializer = PackageAPISerializer(paginated_qs, many=True, context={'request': request}) + return self.get_paginated_response(serializer.data) + UPDATEABLE_FIELDS = [ 'primary_language', diff --git a/packagedb/api_custom.py b/packagedb/api_custom.py index 5686311f..05d73da2 100644 --- a/packagedb/api_custom.py +++ b/packagedb/api_custom.py @@ -17,6 +17,6 @@ class PageSizePagination(PageNumberPagination): For example: http://api.example.org/accounts/?page=4&page_size=100 """ - page_size = 10 + page_size = 100 max_page_size = 100 page_size_query_param = 'page_size' diff --git a/packagedb/tests/test_api.py b/packagedb/tests/test_api.py index ec1a3e43..8311e9e4 100644 --- a/packagedb/tests/test_api.py +++ b/packagedb/tests/test_api.py @@ -27,7 +27,8 @@ from packagedb.models import Resource -class 
ResourceAPITestCase(TestCase): +class ResourceAPITestCase(JsonBasedTesting, TestCase): + test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') def setUp(self): self.package1 = Package.objects.create( @@ -194,8 +195,21 @@ def test_api_resource_list_endpoint_filters_by_package2_purl(self): self.assertEqual(test_resource.get('extra_data'), self.resource2.extra_data) self.assertEqual(test_resource.get('type'), self.resource2.type) + def test_api_resource_filter_by_checksums(self): + sha1s = [ + 'testsha11', + 'testsha12', + ] + data = { + 'sha1': sha1s + } + response = self.client.post('/api/resources/filter_by_checksums/', data=data) + self.assertEqual(2, response.data['count']) + expected = self.get_test_loc('api/resource-filter_by_checksums-expected.json') + self.check_expected_results(response.data['results'], expected, fields_to_remove=["url", "uuid", "package"], regen=False) -class PackageApiTestCase(TestCase): +class PackageApiTestCase(JsonBasedTesting, TestCase): + test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') def setUp(self): @@ -348,9 +362,9 @@ def test_api_package_resources_action(self): response = self.client.get(reverse('api:package-resources', args=[self.package.uuid])) self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertEqual(10, len(response.data)) + self.assertEqual(10, response.data['count']) - for result, i in zip(response.data, range(0, 10)): + for result, i in zip(response.data['results'], range(0, 10)): self.assertEqual(result.get('path'), 'path{}/'.format(i)) def test_api_package_list_endpoint_multiple_char_filters(self): @@ -429,6 +443,20 @@ def test_package_api_index_packages_endpoint(self): ] self.assertEqual(expected_unsupported_packages, response.data['unsupported_packages']) + def test_package_api_filter_by_checksums(self): + sha1s = [ + 'testsha1', + 'testsha1-2', + 'testsha1-3', + ] + data = { + 'sha1': sha1s + } + response = self.client.post('/api/packages/filter_by_checksums/', 
data=data) + self.assertEqual(3, response.data['count']) + expected = self.get_test_loc('api/package-filter_by_checksums-expected.json') + self.check_expected_results(response.data['results'], expected, fields_to_remove=["url", "uuid", "resources"], regen=False) + class PackageApiReindexingTestCase(JsonBasedTesting, TestCase): test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') diff --git a/packagedb/tests/testfiles/api/package-filter_by_checksums-expected.json b/packagedb/tests/testfiles/api/package-filter_by_checksums-expected.json new file mode 100644 index 00000000..a01c743b --- /dev/null +++ b/packagedb/tests/testfiles/api/package-filter_by_checksums-expected.json @@ -0,0 +1,140 @@ +[ + { + "filename":"Foo.zip", + "package_sets":[], + "package_content":null, + "purl":"pkg:generic/generic/foo@12.34?test_qual=qual#test_subpath", + "type":"generic", + "namespace":"generic", + "name":"foo", + "version":"12.34", + "qualifiers":"test_qual=qual", + "subpath":"test_subpath", + "primary_language":null, + "description":null, + "release_date":null, + "parties":[], + "keywords":[], + "homepage_url":null, + "download_url":"http://example.com", + "bug_tracking_url":null, + "code_view_url":null, + "vcs_url":null, + "repository_homepage_url":null, + "repository_download_url":null, + "api_data_url":null, + "size":101, + "md5":"testmd5", + "sha1":"testsha1", + "sha256":null, + "sha512":null, + "copyright":null, + "holder":null, + "declared_license_expression":null, + "declared_license_expression_spdx":null, + "license_detections":[], + "other_license_expression":null, + "other_license_expression_spdx":null, + "other_license_detections":[], + "extracted_license_statement":null, + "notice_text":null, + "source_packages":[], + "extra_data":{}, + "package_uid":"pkg:generic/generic/foo@12.34?test_qual=qual&uuid=fixed-uid-done-for-testing-5642512d1758#test_subpath", + "datasource_id":null, + "file_references":[], + "dependencies":[] + }, + { + "filename":"Bar.zip", + 
"package_sets":[], + "package_content":null, + "purl":"pkg:npm/example/bar@56.78", + "type":"npm", + "namespace":"example", + "name":"bar", + "version":"56.78", + "qualifiers":"", + "subpath":"", + "primary_language":null, + "description":null, + "release_date":null, + "parties":[], + "keywords":[], + "homepage_url":null, + "download_url":"http://somethingelse.org", + "bug_tracking_url":null, + "code_view_url":null, + "vcs_url":null, + "repository_homepage_url":null, + "repository_download_url":null, + "api_data_url":null, + "size":100, + "md5":"testmd5-2", + "sha1":"testsha1-2", + "sha256":null, + "sha512":null, + "copyright":null, + "holder":null, + "declared_license_expression":null, + "declared_license_expression_spdx":null, + "license_detections":[], + "other_license_expression":null, + "other_license_expression_spdx":null, + "other_license_detections":[], + "extracted_license_statement":null, + "notice_text":null, + "source_packages":[], + "extra_data":{}, + "package_uid":"pkg:npm/example/bar@56.78?uuid=fixed-uid-done-for-testing-5642512d1758", + "datasource_id":null, + "file_references":[], + "dependencies":[] + }, + { + "filename":"Baz.zip", + "package_sets":[], + "package_content":null, + "purl":"pkg:jar/sample/baz@90.12", + "type":"jar", + "namespace":"sample", + "name":"baz", + "version":"90.12", + "qualifiers":"", + "subpath":"", + "primary_language":null, + "description":null, + "release_date":null, + "parties":[], + "keywords":[], + "homepage_url":null, + "download_url":"http://anotherexample.com", + "bug_tracking_url":null, + "code_view_url":null, + "vcs_url":null, + "repository_homepage_url":null, + "repository_download_url":null, + "api_data_url":null, + "size":100, + "md5":"testmd5-3", + "sha1":"testsha1-3", + "sha256":null, + "sha512":null, + "copyright":null, + "holder":null, + "declared_license_expression":null, + "declared_license_expression_spdx":null, + "license_detections":[], + "other_license_expression":null, + 
"other_license_expression_spdx":null, + "other_license_detections":[], + "extracted_license_statement":null, + "notice_text":null, + "source_packages":[], + "extra_data":{}, + "package_uid":"pkg:jar/sample/baz@90.12?uuid=fixed-uid-done-for-testing-5642512d1758", + "datasource_id":null, + "file_references":[], + "dependencies":[] + } +] \ No newline at end of file diff --git a/packagedb/tests/testfiles/api/resource-filter_by_checksums-expected.json b/packagedb/tests/testfiles/api/resource-filter_by_checksums-expected.json new file mode 100644 index 00000000..325beb1c --- /dev/null +++ b/packagedb/tests/testfiles/api/resource-filter_by_checksums-expected.json @@ -0,0 +1,68 @@ +[ + { + "purl":"pkg:type1/name1", + "path":"package1/contents1.txt", + "type":"file", + "name":"", + "extension":"", + "size":101, + "md5":"testmd51", + "sha1":"testsha11", + "sha256":"testsha2561", + "sha512":"testsha5121", + "git_sha1":"testgit_sha11", + "mime_type":"", + "file_type":"", + "programming_language":"", + "is_binary":false, + "is_text":false, + "is_archive":false, + "is_media":false, + "is_key_file":false, + "detected_license_expression":"", + "detected_license_expression_spdx":"", + "license_detections":[], + "license_clues":[], + "percentage_of_license_text":null, + "copyrights":[], + "holders":[], + "authors":[], + "package_data":[], + "emails":[], + "urls":[], + "extra_data":"{\"test1\": \"data1\"}" + }, + { + "purl":"pkg:type2/name2", + "path":"package2/contents2.txt", + "type":"file", + "name":"", + "extension":"", + "size":102, + "md5":"testmd52", + "sha1":"testsha12", + "sha256":"testsha2562", + "sha512":"testsha5122", + "git_sha1":"testgit_sha12", + "mime_type":"", + "file_type":"", + "programming_language":"", + "is_binary":false, + "is_text":false, + "is_archive":false, + "is_media":false, + "is_key_file":false, + "detected_license_expression":"", + "detected_license_expression_spdx":"", + "license_detections":[], + "license_clues":[], + 
"percentage_of_license_text":null, + "copyrights":[], + "holders":[], + "authors":[], + "package_data":[], + "emails":[], + "urls":[], + "extra_data":"{\"test2\": \"data2\"}" + } +] \ No newline at end of file diff --git a/purldb/settings.py b/purldb/settings.py index 648ed3da..e12b5dde 100644 --- a/purldb/settings.py +++ b/purldb/settings.py @@ -89,6 +89,10 @@ WSGI_APPLICATION = "purldb.wsgi.application" +SECURE_PROXY_SSL_HEADER = env.tuple( + "SECURE_PROXY_SSL_HEADER", default=("HTTP_X_FORWARDED_PROTO", "https") +) + # Database DATABASES = { @@ -238,7 +242,7 @@ ), 'DEFAULT_PAGINATION_CLASS': 'packagedb.api_custom.PageSizePagination', # Limit the load on the Database returning a small number of records by default. https://github.com/nexB/vulnerablecode/issues/819 - "PAGE_SIZE": 10, + "PAGE_SIZE": 100, } if not PURLDB_REQUIRE_AUTHENTICATION: