Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions matchcode-toolkit/CHANGELOG.rst
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
Changelog
=========

v1.1.2
------

*2023-08-02* -- Update ``scan_and_fingerprint_package`` pipeline to use new directory fingerprinting functions from scancode.io.

v1.1.1
------

Expand Down
2 changes: 1 addition & 1 deletion matchcode-toolkit/setup.cfg
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[metadata]
name = matchcode-toolkit
version = 1.1.1
version = 1.1.2
license = Apache-2.0

# description must be on ONE line https://github.com/pypa/setuptools/issues/1390
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,8 @@
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
# Visit https://github.com/nexB/scancode.io for support and download.

from matchcode_toolkit.fingerprinting import compute_codebase_directory_fingerprints

from scanpipe.pipelines.scan_package import ScanPackage
from scanpipe.pipes.codebase import ProjectCodebase
from scanpipe.pipes import matchcode


class ScanAndFingerprintPackage(ScanPackage):
Expand Down Expand Up @@ -62,5 +60,4 @@ def fingerprint_codebase(self):
"""
Compute directory fingerprints for matching purposes
"""
project_codebase = ProjectCodebase(self.project)
compute_codebase_directory_fingerprints(project_codebase)
matchcode.fingerprint_codebase_directories(self.project)
7 changes: 4 additions & 3 deletions minecode/utils_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
from django.db.migrations.executor import MigrationExecutor
from django.test import TestCase as DjangoTestCase
from rest_framework.utils.serializer_helpers import ReturnDict
from rest_framework.utils.serializer_helpers import ReturnList

from commoncode.testcase import FileBasedTesting
from scancode.cli_test_utils import purl_with_fake_uuid
Expand Down Expand Up @@ -206,13 +207,13 @@ def _normalize_results(self, data, fields_to_remove=[]):
with `purl_with_fake_uuid()` and fields from `fields_to_remove` have
been removed from `data`.
"""
if type(data) == list:
if type(data) in (list, ReturnList):
return [self._normalize_results(entry, fields_to_remove) for entry in data]

if type(data) in (dict, OrderedDict, ReturnDict):
normalized_data = {}
for key, value in data.items():
if type(value) in [list, dict, OrderedDict, ReturnDict]:
if type(value) in (list, ReturnList, dict, OrderedDict, ReturnDict):
value = self._normalize_results(value, fields_to_remove)
if (
key in ("package_uid", "dependency_uid", "for_package_uid")
Expand All @@ -229,7 +230,7 @@ def _normalize_results(self, data, fields_to_remove=[]):
return data

def _remove_fields_from_results(self, data, fields_to_remove):
if type(data) == list:
if type(data) in (list, ReturnList):
return [self._remove_fields_from_results(entry, fields_to_remove) for entry in data]

if type(data) in (dict, OrderedDict, ReturnDict):
Expand Down
118 changes: 117 additions & 1 deletion packagedb/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,63 @@ class ResourceViewSet(viewsets.ReadOnlyModelViewSet):
filterset_class = ResourceFilter
lookup_field = 'sha1'

@action(detail=False, methods=['post'])
def filter_by_checksums(self, request, *args, **kwargs):
    """
    Take a mapping, where the keys are the names of the checksum algorithm
    and the values are lists of checksum values, and query those values
    against the packagedb.

    Supported checksum fields are:
    - md5
    - sha1

    Example:
    {
        "sha1": [
            "b55fd82f80cc1bd0bdabf9c6e3153788d35d7911",
            "27afff2610b5a94274a2311f8b15e514446b0e76"
        ]
    }

    Multiple checksums algorithms can be passed together:
    {
        "sha1": [
            "b55fd82f80cc1bd0bdabf9c6e3153788d35d7911",
            "27afff2610b5a94274a2311f8b15e514446b0e76"
        ],
        "md5": [
            "e927df60b093456d4e611ae235c1aa5b"
        ]
    }

    This will return Resources whose sha1 or md5 matches those values.

    A request that contains an unsupported field, or no field at all, is
    answered with HTTP 400 and a `status` message instead of results.
    """
    # Function-scope import keeps this method self-contained.
    from rest_framework import status

    supported_fields = ('md5', 'sha1')
    data = dict(request.data)

    unsupported_fields = [
        field for field in data if field not in supported_fields
    ]
    if unsupported_fields:
        unsupported_fields_str = ', '.join(unsupported_fields)
        response_data = {
            'status': f'Unsupported field(s) given: {unsupported_fields_str}'
        }
        return Response(response_data, status=status.HTTP_400_BAD_REQUEST)

    # An empty Q() adds no condition to the query, so without this guard
    # an empty payload would list every Resource.
    if not data:
        response_data = {
            'status': 'No values provided'
        }
        return Response(response_data, status=status.HTTP_400_BAD_REQUEST)

    q = Q()
    for field, value in data.items():
        # A bare string would be iterated character by character by the
        # `__in` lookup, so treat it as a one-element list.
        if isinstance(value, str):
            value = [value]
        # We create this intermediate dictionary so we can modify the field
        # name to have __in at the end
        d = {f'{field}__in': value}
        q |= Q(**d)

    qs = Resource.objects.filter(q)
    paginated_qs = self.paginate_queryset(qs)
    serializer = ResourceAPISerializer(paginated_qs, many=True, context={'request': request})
    return self.get_paginated_response(serializer.data)


class MultiplePackageURLFilter(Filter):
def filter(self, qs, value):
Expand Down Expand Up @@ -218,7 +275,7 @@ def resources(self, request, *args, **kwargs):
paginated_qs = self.paginate_queryset(qs)

serializer = ResourceAPISerializer(paginated_qs, many=True, context={'request': request})
return Response(serializer.data)
return self.get_paginated_response(serializer.data)

@action(detail=False)
def get_package(self, request, *args, **kwargs):
Expand Down Expand Up @@ -409,6 +466,65 @@ def _reindex_package(package, reindexed_packages):
}
return Response(response_data)

@action(detail=False, methods=['post'])
def filter_by_checksums(self, request, *args, **kwargs):
    """
    Take a mapping, where the keys are the names of the checksum algorithm
    and the values are lists of checksum values, and query those values
    against the packagedb.

    Supported checksum fields are:
    - md5
    - sha1
    - sha256
    - sha512

    Example:
    {
        "sha1": [
            "b55fd82f80cc1bd0bdabf9c6e3153788d35d7911",
            "27afff2610b5a94274a2311f8b15e514446b0e76"
        ]
    }

    Multiple checksums algorithms can be passed together:
    {
        "sha1": [
            "b55fd82f80cc1bd0bdabf9c6e3153788d35d7911",
            "27afff2610b5a94274a2311f8b15e514446b0e76"
        ],
        "md5": [
            "e927df60b093456d4e611ae235c1aa5b"
        ]
    }

    This will return Packages whose sha1 or md5 matches those values.

    A request that contains an unsupported field, or no field at all, is
    answered with HTTP 400 and a `status` message instead of results.
    """
    # Function-scope import keeps this method self-contained.
    from rest_framework import status

    supported_fields = ('md5', 'sha1', 'sha256', 'sha512')
    data = dict(request.data)

    unsupported_fields = [
        field for field in data if field not in supported_fields
    ]
    if unsupported_fields:
        unsupported_fields_str = ', '.join(unsupported_fields)
        response_data = {
            'status': f'Unsupported field(s) given: {unsupported_fields_str}'
        }
        return Response(response_data, status=status.HTTP_400_BAD_REQUEST)

    # An empty Q() adds no condition to the query, so without this guard
    # an empty payload would list every Package.
    if not data:
        response_data = {
            'status': 'No values provided'
        }
        return Response(response_data, status=status.HTTP_400_BAD_REQUEST)

    q = Q()
    for field, value in data.items():
        # A bare string would be iterated character by character by the
        # `__in` lookup, so treat it as a one-element list.
        if isinstance(value, str):
            value = [value]
        # We create this intermediate dictionary so we can modify the field
        # name to have __in at the end
        d = {f'{field}__in': value}
        q |= Q(**d)

    qs = Package.objects.filter(q)
    paginated_qs = self.paginate_queryset(qs)
    serializer = PackageAPISerializer(paginated_qs, many=True, context={'request': request})
    return self.get_paginated_response(serializer.data)


UPDATEABLE_FIELDS = [
'primary_language',
Expand Down
2 changes: 1 addition & 1 deletion packagedb/api_custom.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,6 @@ class PageSizePagination(PageNumberPagination):
For example:
http://api.example.org/accounts/?page=4&page_size=100
"""
page_size = 10
page_size = 100
max_page_size = 100
page_size_query_param = 'page_size'
36 changes: 32 additions & 4 deletions packagedb/tests/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,8 @@
from packagedb.models import Resource


class ResourceAPITestCase(TestCase):
class ResourceAPITestCase(JsonBasedTesting, TestCase):
test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles')

def setUp(self):
self.package1 = Package.objects.create(
Expand Down Expand Up @@ -194,8 +195,21 @@ def test_api_resource_list_endpoint_filters_by_package2_purl(self):
self.assertEqual(test_resource.get('extra_data'), self.resource2.extra_data)
self.assertEqual(test_resource.get('type'), self.resource2.type)

def test_api_resource_filter_by_checksums(self):
    """
    POST two sha1 values to the resources `filter_by_checksums` endpoint and
    check the reported count and the returned results against the expected
    JSON file.
    """
    payload = {'sha1': ['testsha11', 'testsha12']}
    response = self.client.post('/api/resources/filter_by_checksums/', data=payload)

    self.assertEqual(2, response.data['count'])

    expected_loc = self.get_test_loc('api/resource-filter_by_checksums-expected.json')
    # These fields are dropped from the results before the comparison.
    ignored_fields = ["url", "uuid", "package"]
    self.check_expected_results(
        response.data['results'],
        expected_loc,
        fields_to_remove=ignored_fields,
        regen=False,
    )

class PackageApiTestCase(TestCase):
class PackageApiTestCase(JsonBasedTesting, TestCase):
test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles')

def setUp(self):

Expand Down Expand Up @@ -348,9 +362,9 @@ def test_api_package_resources_action(self):
response = self.client.get(reverse('api:package-resources', args=[self.package.uuid]))

self.assertEqual(response.status_code, status.HTTP_200_OK)
self.assertEqual(10, len(response.data))
self.assertEqual(10, response.data['count'])

for result, i in zip(response.data, range(0, 10)):
for result, i in zip(response.data['results'], range(0, 10)):
self.assertEqual(result.get('path'), 'path{}/'.format(i))

def test_api_package_list_endpoint_multiple_char_filters(self):
Expand Down Expand Up @@ -429,6 +443,20 @@ def test_package_api_index_packages_endpoint(self):
]
self.assertEqual(expected_unsupported_packages, response.data['unsupported_packages'])

def test_package_api_filter_by_checksums(self):
    """
    POST three sha1 values to the packages `filter_by_checksums` endpoint and
    check the reported count and the returned results against the expected
    JSON file.
    """
    payload = {'sha1': ['testsha1', 'testsha1-2', 'testsha1-3']}
    response = self.client.post('/api/packages/filter_by_checksums/', data=payload)

    self.assertEqual(3, response.data['count'])

    expected_loc = self.get_test_loc('api/package-filter_by_checksums-expected.json')
    # These fields are dropped from the results before the comparison.
    ignored_fields = ["url", "uuid", "resources"]
    self.check_expected_results(
        response.data['results'],
        expected_loc,
        fields_to_remove=ignored_fields,
        regen=False,
    )


class PackageApiReindexingTestCase(JsonBasedTesting, TestCase):
test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles')
Expand Down
Loading