diff --git a/packagedb/api.py b/packagedb/api.py index a55a2c2a..cc47c291 100644 --- a/packagedb/api.py +++ b/packagedb/api.py @@ -8,49 +8,37 @@ # import logging + +import django_filters from django.core.exceptions import ValidationError -from django.db.models import OuterRef -from django.db.models import Q -from django.db.models import Subquery +from django.db.models import OuterRef, Q, Subquery +from django_filters.filters import Filter, OrderingFilter from django_filters.rest_framework import FilterSet -from django_filters.filters import Filter -from django_filters.filters import OrderingFilter -import django_filters - from packageurl import PackageURL from packageurl.contrib.django.utils import purl_to_lookups -from rest_framework import status -from rest_framework import viewsets +from rest_framework import status, viewsets from rest_framework.decorators import action from rest_framework.response import Response +from univers.version_constraint import InvalidConstraintsError +from univers.version_range import RANGE_CLASS_BY_SCHEMES, VersionRange +from univers.versions import InvalidVersion -from matchcode.api import MultipleCharFilter -from matchcode.api import MultipleCharInFilter +from matchcode.api import MultipleCharFilter, MultipleCharInFilter # UnusedImport here! 
# But importing the mappers and visitors module triggers routes registration from minecode import visitors # NOQA from minecode import priority_router -from minecode.models import PriorityResourceURI -from minecode.models import ScannableURI +from minecode.models import PriorityResourceURI, ScannableURI from minecode.route import NoRouteAvailable -from packagedb.models import Package -from packagedb.models import PackageContentType -from packagedb.models import PackageSet -from packagedb.models import Resource -from packagedb.serializers import DependentPackageSerializer -from packagedb.serializers import ResourceAPISerializer -from packagedb.serializers import PackageAPISerializer -from packagedb.serializers import PackageSetAPISerializer -from packagedb.serializers import PartySerializer -from packagedb.package_managers import get_api_package_name -from packagedb.package_managers import get_version_fetcher -from packagedb.package_managers import VERSION_API_CLASSES_BY_PACKAGE_TYPE - -from univers import versions -from univers.version_range import RANGE_CLASS_BY_SCHEMES -from univers.versions import InvalidVersion -from univers.version_range import VersionRange -from univers.version_constraint import InvalidConstraintsError +from packagedb.filters import PackageSearchFilter +from packagedb.models import Package, PackageContentType, PackageSet, Resource +from packagedb.package_managers import (VERSION_API_CLASSES_BY_PACKAGE_TYPE, + get_api_package_name, + get_version_fetcher) +from packagedb.serializers import (DependentPackageSerializer, + PackageAPISerializer, + PackageSetAPISerializer, PartySerializer, + ResourceAPISerializer) logger = logging.getLogger(__name__) @@ -84,21 +72,21 @@ def filter(self, qs, value): return qs.filter(package=package) -class ResourceFilter(FilterSet): +class ResourceFilterSet(FilterSet): package = PackageResourceUUIDFilter(label='Package UUID') purl = PackageResourcePurlFilter(label='Package pURL') md5 = MultipleCharInFilter( - 
help_text="Exact MD5. Multi-value supported.", + help_text='Exact MD5. Multi-value supported.', ) sha1 = MultipleCharInFilter( - help_text="Exact SHA1. Multi-value supported.", + help_text='Exact SHA1. Multi-value supported.', ) class ResourceViewSet(viewsets.ReadOnlyModelViewSet): queryset = Resource.objects.select_related('package') serializer_class = ResourceAPISerializer - filterset_class = ResourceFilter + filterset_class = ResourceFilterSet lookup_field = 'sha1' @action(detail=False, methods=['post']) @@ -169,70 +157,63 @@ def filter_by_checksums(self, request, *args, **kwargs): return self.get_paginated_response(serializer.data) -class MultiplePackageURLFilter(Filter): +class MultiplePackageURLFilter(MultipleCharFilter): def filter(self, qs, value): - try: - request = self.parent.request - except AttributeError: - return None + if not value: + # Even though not a noop, no point filtering if empty. + return qs - values = request.GET.getlist(self.field_name) - if all(v == '' for v in values): + if self.is_noop(qs, value): return qs - values = {item for item in values} + if all(v == '' for v in value): + return qs q = Q() - for val in values: + for val in value: lookups = purl_to_lookups(val) if not lookups: continue - q.add(Q(**lookups), Q.OR) - if not q: - return qs.none() - - return qs.filter(q) - - -class PackageSearchFilter(Filter): - def filter(self, qs, value): - try: - request = self.parent.request - except AttributeError: - return None - - if not value: - return qs + if q: + qs = self.get_method(qs)(q) + else: + qs = qs.none() - return Package.objects.filter(search_vector=value) + return qs.distinct() if self.distinct else qs -class PackageFilter(FilterSet): +class PackageFilterSet(FilterSet): type = django_filters.CharFilter( - lookup_expr="iexact", - help_text="Exact type. (case-insensitive)", + lookup_expr='iexact', + help_text='Exact type. 
(case-insensitive)', ) namespace = django_filters.CharFilter( - lookup_expr="iexact", - help_text="Exact namespace. (case-insensitive)", + lookup_expr='iexact', + help_text='Exact namespace. (case-insensitive)', ) name = MultipleCharFilter( - lookup_expr="iexact", - help_text="Exact name. Multi-value supported. (case-insensitive)", + lookup_expr='iexact', + help_text='Exact name. Multi-value supported. (case-insensitive)', ) version = MultipleCharFilter( - help_text="Exact version. Multi-value supported.", + help_text='Exact version. Multi-value supported.', ) md5 = MultipleCharInFilter( - help_text="Exact MD5. Multi-value supported.", + help_text='Exact MD5. Multi-value supported.', ) sha1 = MultipleCharInFilter( - help_text="Exact SHA1. Multi-value supported.", + help_text='Exact SHA1. Multi-value supported.', + ) + purl = MultiplePackageURLFilter( + label='Package URL', + ) + search = PackageSearchFilter( + label='Search', + field_name='name', + lookup_expr='icontains', ) - purl = MultiplePackageURLFilter(label='Package URL') - search = PackageSearchFilter(label='Search') sort = OrderingFilter(fields=[ 'type', @@ -250,6 +231,7 @@ class PackageFilter(FilterSet): class Meta: model = Package fields = ( + 'search', 'type', 'namespace', 'name', @@ -270,7 +252,7 @@ class PackageViewSet(viewsets.ReadOnlyModelViewSet): queryset = Package.objects.prefetch_related('dependencies', 'parties') serializer_class = PackageAPISerializer lookup_field = 'uuid' - filterset_class = PackageFilter + filterset_class = PackageFilterSet @action(detail=True, methods=['get']) def latest_version(self, request, *args, **kwargs): @@ -429,7 +411,7 @@ def index_packages(self, request, *args, **kwargs): packages = request.data.get('packages') or [] queued_packages = [] unqueued_packages = [] - supported_ecosystems = ["maven", "npm"] + supported_ecosystems = ['maven', 'npm'] unique_purls, unsupported_packages, unsupported_vers = get_resolved_purls(packages, supported_ecosystems) diff --git 
a/packagedb/filters.py b/packagedb/filters.py
new file mode 100644
index 00000000..9c38a0f9
--- /dev/null
+++ b/packagedb/filters.py
@@ -0,0 +1,128 @@
+#
+# Copyright (c) nexB Inc. and others. All rights reserved.
+# purldb is a trademark of nexB Inc.
+# SPDX-License-Identifier: Apache-2.0
+# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
+# See https://github.com/nexB/purldb for support or download.
+# See https://aboutcode.org for more information about nexB OSS projects.
+#
+
+import shlex
+
+import django_filters
+from django.core.exceptions import FieldError, ValidationError
+from django.db.models import Q
+
+# The function and classes in this file are adapted from
+# https://github.com/nexB/scancode.io/blob/main/scanpipe/filters.py
+
+
+def parse_query_string_to_lookups(query_string, default_lookup_expr, default_field):
+    """
+    Parse a query string and convert it into queryset lookups using Q objects.
+
+    The query string is split into terms following shell-like quoting rules, so
+    a quoted value may contain spaces. Each term is either a bare value, matched
+    on `default_field` with `default_lookup_expr`, or a `field:value` pair.
+    In a pair, the field name may end with a lookup symbol (`=`, `^`, `$`, `~`,
+    `>`, `<`) that selects the lookup expression, and may start with `-` to
+    negate the term. All the terms are combined with AND.
+
+    Raise a ValueError when the query string cannot be split into terms, for
+    instance when a quote is left open.
+    """
+    lookup_types = {
+        "=": "iexact",
+        "^": "istartswith",
+        "$": "iendswith",
+        "~": "icontains",
+        ">": "gt",
+        "<": "lt",
+    }
+    # Built once, outside of the loop: `str.endswith` accepts a tuple of suffixes.
+    lookup_symbols = tuple(lookup_types)
+
+    lookups = Q()
+    for term in shlex.split(query_string):
+        lookup_expr = default_lookup_expr
+        negated = False
+
+        if ":" in term:
+            field_name, search_value = term.split(":", maxsplit=1)
+            if field_name.endswith(lookup_symbols):
+                lookup_expr = lookup_types[field_name[-1]]
+                field_name = field_name[:-1]
+
+            if field_name.startswith("-"):
+                field_name = field_name[1:]
+                negated = True
+
+        else:
+            search_value = term
+            field_name = default_field
+
+        lookups &= Q(**{f"{field_name}__{lookup_expr}": search_value}, _negated=negated)
+
+    return lookups
+
+
+class QuerySearchFilter(django_filters.CharFilter):
+    """Add support for complex query syntax in search filter."""
+
+    def filter(self, qs, value):
+        """
+        Return `qs` filtered by the lookups parsed from the `value` query string.
+
+        An empty `value` returns `qs` unchanged. A query string that cannot be
+        parsed, or whose lookups cannot be applied to `qs`, returns an empty
+        queryset instead of raising.
+        """
+        if not value:
+            return qs
+
+        try:
+            lookups = parse_query_string_to_lookups(
+                query_string=value,
+                default_lookup_expr=self.lookup_expr,
+                default_field=self.field_name,
+            )
+        except ValueError:
+            # Raised by `shlex.split` on malformed input, such as an open quote.
+            return qs.none()
+
+        try:
+            return qs.filter(lookups)
+        except (FieldError, ValueError, ValidationError):
+            # Unknown field or lookup, or a value that cannot be converted to
+            # the type of the targeted field.
+            return qs.none()
+
+
+class PackageSearchFilter(QuerySearchFilter):
+    """Search packages by Package URL, by `field:value` query, or by free text."""
+
+    def filter(self, qs, value):
+        """
+        Return `qs` filtered by the `value` search string.
+
+        - A value starting with `pkg:` is looked up as a Package URL.
+        - A value containing `:` but no `://` is handled as a `field:value`
+          query string by `QuerySearchFilter.filter`.
+        - Any other value, URLs included, is matched using `lookup_expr` on
+          each of the search fields, combined with OR.
+        """
+        if not value:
+            return qs
+
+        if value.startswith("pkg:"):
+            return qs.for_package_url(value)
+
+        if "://" not in value and ":" in value:
+            return super().filter(qs, value)
+
+        search_fields = ["type", "namespace", "name", "version", "download_url"]
+        lookups = Q()
+        for field_name in search_fields:
+            lookups |= Q(**{f"{field_name}__{self.lookup_expr}": value})
+
+        return qs.filter(lookups)
diff --git a/packagedb/migrations/0080_remove_package_packagedb_p_search__8d33bb_gin_and_more.py b/packagedb/migrations/0080_remove_package_packagedb_p_search__8d33bb_gin_and_more.py
new file mode 100644
index 00000000..e4d99250
--- /dev/null
+++ b/packagedb/migrations/0080_remove_package_packagedb_p_search__8d33bb_gin_and_more.py
@@ -0,0 +1,20 @@
+# Generated by Django 4.1.2 on 2023-11-07 00:32
+
+from django.db import migrations
+
+
+class Migration(migrations.Migration):
+    dependencies = [
+        ("packagedb", "0079_alter_package_name_alter_package_namespace_and_more"),
+    ]
+
+    operations = [
+        migrations.RemoveIndex(
+            model_name="package",
+            name="packagedb_p_search__8d33bb_gin",
+        ),
+        migrations.RemoveField(
+            model_name="package",
+            name="search_vector",
+        ),
+    ]
diff --git a/packagedb/models.py b/packagedb/models.py
index 4de36475..3e9a8b1c 100644
--- a/packagedb/models.py
+++ b/packagedb/models.py
@@ -531,8 +531,6 @@ class Package(
         ),
     )
 
-    search_vector = SearchVectorField(null=True)
-
     objects = PackageQuerySet.as_manager()
 
     # TODO: Think about ordering, unique together, indexes, etc.
@@ -550,8 +548,6 @@ class Meta: ) ] indexes = [ - # GIN index for search performance increase - GinIndex(fields=['search_vector']), # multicolumn index for search on a whole `purl` models.Index(fields=[ 'type', 'namespace', 'name', 'version', 'qualifiers', 'subpath' diff --git a/packagedb/signals.py b/packagedb/signals.py deleted file mode 100644 index b8f375f2..00000000 --- a/packagedb/signals.py +++ /dev/null @@ -1,21 +0,0 @@ -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# purldb is a trademark of nexB Inc. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/nexB/purldb for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. -# - -from django.contrib.postgres.search import SearchVector -from django.db.models.signals import post_save -from django.dispatch import receiver - -from packagedb.models import Package - - -@receiver(post_save, sender=Package) -def update_search_vector(sender, instance, **kwargs): - Package.objects.filter(pk=instance.pk).update( - search_vector=SearchVector('namespace', 'name', 'version', 'download_url') - ) diff --git a/packagedb/tests/test_api.py b/packagedb/tests/test_api.py index 357e2ab4..b2ad0b3b 100644 --- a/packagedb/tests/test_api.py +++ b/packagedb/tests/test_api.py @@ -375,12 +375,6 @@ def test_package_api_list_endpoint_filter_by_purl_fields_ignores_case(self): self.assertEqual(1, response.data.get('count')) def test_package_api_list_endpoint_search(self): - # Populate the search vector field. This is done via Django signals - # outside of the tests. - Package.objects.filter(uuid=self.package.uuid).update( - search_vector=SearchVector('namespace', 'name', 'version', 'download_url') - ) - # Create a dummy package to verify search filter works. 
Package.objects.create( type='generic',
@@ -390,14 +384,16 @@ def test_package_api_list_endpoint_search(self):
             download_url='https://dummy.com/dummy'
         )
 
-        for key, value in self.package_data.items():
-            # Skip since we only search on one field
-            if key not in ['namespace', 'name', 'version', 'download_url']:
-                continue
-
-            response = self.client.get('/api/packages/?search={}'.format(value))
-            assert response.status_code == status.HTTP_200_OK
-            assert response.data.get('count') == 1
+        response = self.client.get('/api/packages/?search={}'.format('generic'))
+        assert response.data.get('count') == 2
+        response = self.client.get('/api/packages/?search={}'.format('dummy'))
+        assert response.data.get('count') == 1
+        response = self.client.get('/api/packages/?search={}'.format('DUMMY'))
+        assert response.data.get('count') == 1
+        response = self.client.get('/api/packages/?search={}'.format('12.35'))
+        assert response.data.get('count') == 1
+        response = self.client.get('/api/packages/?search={}'.format('https://dummy.com/dummy'))
+        assert response.data.get('count') == 1
 
     def test_package_api_retrieve_endpoint(self):
         response = self.client.get('/api/packages/{}/'.format(self.package.uuid))
diff --git a/packagedb/tests/test_filters.py b/packagedb/tests/test_filters.py
new file mode 100644
index 00000000..358ead31
--- /dev/null
+++ b/packagedb/tests/test_filters.py
@@ -0,0 +1,99 @@
+#
+# Copyright (c) nexB Inc. and others. All rights reserved.
+# purldb is a trademark of nexB Inc.
+# SPDX-License-Identifier: Apache-2.0
+# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
+# See https://github.com/nexB/purldb for support or download.
+# See https://aboutcode.org for more information about nexB OSS projects.
+#
+
+from django.test import TestCase
+
+from packagedb.api import PackageFilterSet
+from packagedb.filters import parse_query_string_to_lookups
+from packagedb.models import Package
+
+
+class PackageDBFilterTest(TestCase):
+    """Tests for the package search filter and its query string parser."""
+
+    def test_packagedb_filters_package_filterset_search(self):
+        """Search the `PackageFilterSet` by Package URL and by free text."""
+        p1 = Package.objects.create(
+            type='maven',
+            namespace='org.example',
+            name='foo',
+            version='1.0.0',
+            download_url='https://example.com/foo-1.0.0.jar',
+        )
+        # Second package sharing the type and name of `p1` but nothing else.
+        Package.objects.create(
+            type='maven',
+            namespace='org.somethingelse',
+            name='foo',
+            version='0.35.7',
+            download_url='https://somethingelse.net/foo-0.35.7.jar',
+        )
+
+        filterset = PackageFilterSet(data={})
+        self.assertEqual(2, len(filterset.qs))
+
+        filterset = PackageFilterSet(data={"search": p1.purl})
+        self.assertEqual([p1], list(filterset.qs))
+
+        filterset = PackageFilterSet(data={"search": p1.version})
+        self.assertEqual([p1], list(filterset.qs))
+
+        filterset = PackageFilterSet(data={"search": p1.name})
+        self.assertEqual(2, len(filterset.qs))
+
+        filterset = PackageFilterSet(data={"search": p1.type})
+        self.assertEqual(2, len(filterset.qs))
+
+    def test_packagedb_filters_parse_query_string_to_lookups(self):
+        """Check the `Q` built for each supported query string syntax."""
+        inputs = {
+            "LICENSE": "(AND: ('name__icontains', 'LICENSE'))",
+            "two words": (
+                "(AND: ('name__icontains', 'two'), ('name__icontains', 'words'))"
+            ),
+            "'two words'": "(AND: ('name__icontains', 'two words'))",
+            "na me:LICENSE": (
+                "(AND: ('name__icontains', 'na'), ('me__icontains', 'LICENSE'))"
+            ),
+            "name:LICENSE": "(AND: ('name__icontains', 'LICENSE'))",
+            "default_value name:LICENSE": (
+                "(AND: ('name__icontains', 'default_value'), "
+                "('name__icontains', 'LICENSE'))"
+            ),
+            'name:"name with spaces"': "(AND: ('name__icontains', 'name with spaces'))",
+            "name:'name with spaces'": "(AND: ('name__icontains', 'name with spaces'))",
+            "-name:LICENSE -name:NOTICE": (
+                "(AND: (NOT (AND: ('name__icontains', 'LICENSE'))), "
+                "(NOT (AND: ('name__icontains', 'NOTICE'))))"
+            ),
+            "name:LICENSE status:scanned": (
+                "(AND: ('name__icontains', 'LICENSE'), "
+                "('status__icontains', 'scanned'))"
+            ),
+            'name^:"file"': "(AND: ('name__istartswith', 'file'))",
+            'name$:".zip"': "(AND: ('name__iendswith', '.zip'))",
+            'name=:"LICENSE"': "(AND: ('name__iexact', 'LICENSE'))",
+            'name~:"LIC"': "(AND: ('name__icontains', 'LIC'))",
+            'count<:"100"': "(AND: ('count__lt', '100'))",
+            'count>:"10"': "(AND: ('count__gt', '10'))",
+        }
+
+        for query_string, expected in inputs.items():
+            # One sub-test per query string, so that a failure names the input
+            # and does not hide the results of the remaining ones.
+            with self.subTest(query_string=query_string):
+                lookups = parse_query_string_to_lookups(
+                    query_string, "icontains", "name"
+                )
+                self.assertEqual(expected, str(lookups))
+
+    def test_packagedb_filters_parse_query_string_to_lookups_open_quote(self):
+        """A query string with a quote left open cannot be split into terms."""
+        with self.assertRaises(ValueError):
+            parse_query_string_to_lookups("name:'LICENSE", "icontains", "name")