From 13d02121a0ad308fd696e8f1d88dc2833978616a Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Mon, 5 Feb 2024 17:37:32 -0800 Subject: [PATCH 01/31] Move maven priority queue related code to minecode.collectors #283 Signed-off-by: Jono Yang --- minecode/collectors/__init__.py | 0 minecode/collectors/maven.py | 694 +++++++++++++++++ .../commands/get_maven_release_dates.py | 4 +- minecode/management/commands/import_queue.py | 13 +- minecode/management/commands/maven_crawler.py | 2 +- minecode/tests/test_maven.py | 89 +-- minecode/visitors/maven.py | 697 ------------------ packagedb/find_source_repo.py | 4 +- .../commands/create_source_repo_packages.py | 7 +- 9 files changed, 753 insertions(+), 757 deletions(-) create mode 100644 minecode/collectors/__init__.py create mode 100644 minecode/collectors/maven.py diff --git a/minecode/collectors/__init__.py b/minecode/collectors/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/minecode/collectors/maven.py b/minecode/collectors/maven.py new file mode 100644 index 00000000..ceb0c17a --- /dev/null +++ b/minecode/collectors/maven.py @@ -0,0 +1,694 @@ +import hashlib +import logging +import re +from typing import Dict +from urllib.parse import urlparse + +import requests +from packagedcode.maven import _parse, get_maven_pom, get_urls +from packageurl import PackageURL + +from minecode import priority_router +from minecode.visitors.maven import MAVEN_BASE_URL +from packagedb.models import PackageContentType, PackageRelation, make_relationship + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + +TRACE = False +TRACE_DEEP = False + +if TRACE: + logger.setLevel(logging.DEBUG) + + +def get_pom_text(namespace, name, version, qualifiers={}, base_url=MAVEN_BASE_URL): + """ + Return the contents of the POM file of the package described by the purl + field arguments in a string. + """ + # Create URLs using purl fields + if qualifiers and not isinstance(qualifiers, Dict): + return + urls = get_urls( + namespace=namespace, + name=name, + version=version, + qualifiers=qualifiers, + base_url=base_url, + ) + # Get and parse POM info + pom_url = urls["api_data_url"] + # TODO: manage different types of errors (404, etc.) + response = requests.get(pom_url) + if not response: + return + return response.text + + +def get_package_sha1(package): + """ + Return the sha1 value for `package` by checking if the sha1 file exists for + `package` on maven and returning the contents if it does. + + If the sha1 is invalid, we download the package's JAR and calculate the sha1 + from that. + """ + download_url = package.repository_download_url + sha1_download_url = f"{download_url}.sha1" + response = requests.get(sha1_download_url) + if response.ok: + sha1_contents = response.text.strip().split() + sha1 = sha1_contents[0] + sha1 = validate_sha1(sha1) + if not sha1: + # Download JAR and calculate sha1 if we cannot get it from the repo + response = requests.get(download_url) + if response: + sha1_hash = hashlib.new("sha1", response.content) + sha1 = sha1_hash.hexdigest() + return sha1 + + +def fetch_parent(pom_text, base_url=MAVEN_BASE_URL): + """ + Return the parent pom text of `pom_text`, or None if `pom_text` has no parent. + """ + if not pom_text: + return + pom = get_maven_pom(text=pom_text) + if ( + pom.parent + and pom.parent.group_id + and pom.parent.artifact_id + and pom.parent.version.version + ): + parent_namespace = pom.parent.group_id + parent_name = pom.parent.artifact_id + parent_version = str(pom.parent.version.version) + parent_pom_text = get_pom_text( + namespace=parent_namespace, + name=parent_name, + version=parent_version, + qualifiers={}, + base_url=base_url, + ) + return parent_pom_text + + +def get_ancestry(pom_text, base_url=MAVEN_BASE_URL): + """ + Return a list of pom text of the ancestors of `pom`. The list is ordered + from oldest ancestor to newest. The list is empty is there is no parent pom. + """ + ancestors = [] + has_parent = True + while has_parent: + parent_pom_text = fetch_parent(pom_text=pom_text, base_url=base_url) + if not parent_pom_text: + has_parent = False + else: + ancestors.append(parent_pom_text) + pom_text = parent_pom_text + return reversed(ancestors) + + +def get_merged_ancestor_package_from_maven_package(package, base_url=MAVEN_BASE_URL): + """ + Merge package details of a package with its ancestor pom + and return the merged package. + """ + if not package: + return + pom_text = get_pom_text( + name=package.name, + namespace=package.namespace, + version=package.version, + qualifiers=package.qualifiers, + base_url=base_url, + ) + merged_package = merge_ancestors( + ancestor_pom_texts=get_ancestry(pom_text), + package=package, + ) + return merged_package + + +def merge_parent(package, parent_package): + """ + Merge `parent_package` data into `package` and return `package. + """ + mergeable_fields = ( + "declared_license_expression", + "homepage_url", + "parties", + ) + for field in mergeable_fields: + # If `field` is empty on the package we're looking at, populate + # those fields with values from the parent package. + if not getattr(package, field): + value = getattr(parent_package, field) + setattr(package, field, value) + + msg = f"Field `{field}` has been updated using values obtained from the parent POM {parent_package.purl}" + history = package.extra_data.get("history") + if history: + package.extra_data["history"].append(msg) + else: + package.extra_data["history"] = [msg] + + return package + + +def merge_ancestors(ancestor_pom_texts, package): + """ + Merge metadata from `ancestor_pom_text` into `package`. + + The order of POM content in `ancestor_pom_texts` is expected to be in the + order of oldest ancestor to newest. + """ + for ancestor_pom_text in ancestor_pom_texts: + ancestor_package = _parse( + datasource_id="maven_pom", + package_type="maven", + primary_language="Java", + text=ancestor_pom_text, + ) + package = merge_parent(package, ancestor_package) + return package + + +def map_maven_package(package_url, package_content): + """ + Add a maven `package_url` to the PackageDB. + + Return an error string if errors have occured in the process. + """ + from minecode.model_utils import add_package_to_scan_queue, merge_or_create_package + + db_package = None + error = "" + + if "repository_url" in package_url.qualifiers: + base_url = package_url.qualifiers["repository_url"] + else: + base_url = MAVEN_BASE_URL + + pom_text = get_pom_text( + namespace=package_url.namespace, + name=package_url.name, + version=package_url.version, + qualifiers=package_url.qualifiers, + base_url=base_url, + ) + if not pom_text: + msg = f"Package does not exist on maven: {package_url}" + error += msg + "\n" + logger.error(msg) + return db_package, error + + package = _parse( + "maven_pom", + "maven", + "Java", + text=pom_text, + base_url=base_url, + ) + ancestor_pom_texts = get_ancestry(pom_text=pom_text, base_url=base_url) + package = merge_ancestors(ancestor_pom_texts=ancestor_pom_texts, package=package) + + urls = get_urls( + namespace=package_url.namespace, + name=package_url.name, + version=package_url.version, + qualifiers=package_url.qualifiers, + base_url=base_url, + ) + # In the case of looking up a maven package with qualifiers of + # `classifiers=sources`, the purl of the package created from the pom does + # not have the qualifiers, so we need to set them. Additionally, the download + # url is not properly generated since it would be missing the sources bit + # from the filename. + package.qualifiers = package_url.qualifiers + package.download_url = urls["repository_download_url"] + package.repository_download_url = urls["repository_download_url"] + + # Set package_content value + package.extra_data["package_content"] = package_content + + # If sha1 exists for a jar, we know we can create the package + # Use pom info as base and create packages for binary and source package + + # Check to see if binary is available + sha1 = get_package_sha1(package) + if sha1: + package.sha1 = sha1 + db_package, _, _, _ = merge_or_create_package(package, visit_level=50) + else: + msg = f"Failed to retrieve JAR: {package_url}" + error += msg + "\n" + logger.error(msg) + + # Submit package for scanning + if db_package: + add_package_to_scan_queue(db_package) + + return db_package, error + + +def validate_sha1(sha1): + """ + Validate a `sha1` string. + + Return `sha1` if it is valid, None otherwise. + """ + if sha1 and len(sha1) != 40: + logger.warning(f'Invalid SHA1 length ({len(sha1)}): "{sha1}": SHA1 ignored!') + sha1 = None + return sha1 + + +def map_maven_binary_and_source(package_url): + """ + Get metadata for the binary and source release of the Maven package + `package_url` and save it to the PackageDB. + + Return an error string for errors that occur, or empty string if there is no error. + """ + error = "" + package, emsg = map_maven_package(package_url, PackageContentType.BINARY) + if emsg: + error += emsg + + source_package_url = package_url + source_package_url.qualifiers["classifier"] = "sources" + source_package, emsg = map_maven_package( + source_package_url, PackageContentType.SOURCE_ARCHIVE + ) + if emsg: + error += emsg + + if package and source_package: + make_relationship( + from_package=source_package, + to_package=package, + relationship=PackageRelation.Relationship.SOURCE_PACKAGE, + ) + + return error + + +def map_maven_packages(package_url): + """ + Given a valid `package_url` with no version, get metadata for the binary and + source release for each version of the Maven package `package_url` and save + it to the PackageDB. + + Return an error string for errors that occur, or empty string if there is no error. + """ + error = "" + namespace = package_url.namespace + name = package_url.name + # Find all versions of this package + query_params = f"g:{namespace}+AND+a:{name}" + url = f"https://search.maven.org/solrsearch/select?q={query_params}&core=gav" + response = requests.get(url) + if response: + package_listings = response.json().get("response", {}).get("docs", []) + for listing in package_listings: + purl = PackageURL( + type="maven", + namespace=listing.get("g"), + name=listing.get("a"), + version=listing.get("v"), + ) + emsg = map_maven_binary_and_source(purl) + if emsg: + error += emsg + return error + + +@priority_router.route("pkg:maven/.*") +def process_request(purl_str): + """ + Process `priority_resource_uri` containing a maven Package URL (PURL) as a + URI. + + This involves obtaining Package information for the PURL from maven and + using it to create a new PackageDB entry. The package is then added to the + scan queue afterwards. We also get the Package information for the + accompanying source package and add it to the PackageDB and scan queue, if + available. + + Return an error string for errors that occur, or empty string if there is no error. + """ + try: + package_url = PackageURL.from_string(purl_str) + except ValueError as e: + error = f"error occured when parsing {purl_str}: {e}" + return error + + has_version = bool(package_url.version) + if has_version: + error = map_maven_binary_and_source(package_url) + else: + error = map_maven_packages(package_url) + + return error + + +collect_links = re.compile(r'href="([^"]+)"').findall +collect_links_and_artifact_timestamps = re.compile( + r'\s+(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}|-)' +).findall + + +def check_if_file_name_is_linked_on_page(file_name, links, **kwargs): + """ + Return True if `file_name` is in `links` + """ + return any(l.endswith(file_name) for l in links) + + +def check_if_page_has_pom_files(links, **kwargs): + """ + Return True of any entry in `links` ends with .pom. + """ + return any(l.endswith(".pom") for l in links) + + +def check_if_page_has_directories(links, **kwargs): + """ + Return True if any entry, excluding "../", ends with /. + """ + return any(l.endswith("/") for l in links if l != "../") + + +def check_if_package_version_page(links, **kwargs): + """ + Return True if `links` contains pom files and has no directories + """ + return check_if_page_has_pom_files( + links=links + ) and not check_if_page_has_directories(links=links) + + +def check_if_package_page(links, **kwargs): + return check_if_file_name_is_linked_on_page( + file_name="maven-metadata.xml", links=links + ) and not check_if_page_has_pom_files(links=links) + + +def check_if_maven_root(links, **kwargs): + """ + Return True if "archetype-catalog.xml" is in `links`, as the root of a Maven + repo contains "archetype-catalog.xml". + """ + return check_if_file_name_is_linked_on_page( + file_name="archetype-catalog.xml", links=links + ) + + +def check_on_page(url, checker): + """ + Return True if there is a link on `url` that is the same as `file_name`, + False otherwise. + """ + response = requests.get(url) + if response: + links = collect_links(response.text) + return checker(links=links) + return False + + +def is_maven_root(url): + """ + Return True if `url` is the root of a Maven repo, False otherwise. + """ + return check_on_page(url, check_if_maven_root) + + +def is_package_page(url): + """ + Return True if `url` is a package page on a Maven repo, False otherwise. + """ + return check_on_page(url, check_if_package_page) + + +def is_package_version_page(url): + """ + Return True if `url` is a package version page on a Maven repo, False otherwise. + """ + return check_on_page(url, check_if_package_version_page) + + +def url_parts(url): + parsed_url = urlparse(url) + scheme = parsed_url.scheme + netloc = parsed_url.netloc + path_segments = [p for p in parsed_url.path.split("/") if p] + return scheme, netloc, path_segments + + +def create_url(scheme, netloc, path_segments): + url_template = f"{scheme}://{netloc}" + path = "/".join(path_segments) + return f"{url_template}/{path}" + + +def get_maven_root(url): + """ + Given `url`, that is a URL to namespace, package, or artifact in a Maven + repo, return the URL to the root of that repo. If a Maven root cannot be + determined, return None. + + >>> get_maven_root('https://repo1.maven.org/maven2/net/shibboleth/parent/7.11.0/') + 'https://repo1.maven.org/maven2' + """ + scheme, netloc, path_segments = url_parts(url) + for i in range(len(path_segments)): + segments = path_segments[: i + 1] + url_segment = create_url(scheme, netloc, segments) + if is_maven_root(url_segment): + return url_segment + return None + + +def determine_namespace_name_version_from_url(url, root_url=None): + """ + Return a 3-tuple containing strings of a Package namespace, name, and + version, determined from `url`, where `url` points to namespace, package, + specific package version, or artifact on a Maven repo. + + Return None if a Maven root cannot be determined from `url`. + + >>> determine_namespace_name_version_from_url('https://repo1.maven.org/maven2/net/shibboleth/parent/7.11.0/') + ('net.shibboleth', 'parent', '7.11.0') + """ + if not root_url: + root_url = get_maven_root(url) + if not root_url: + raise Exception(f"Error: not a Maven repository: {url}") + + _, remaining_path_segments = url.split(root_url) + remaining_path_segments = remaining_path_segments.split("/") + remaining_path_segments = [p for p in remaining_path_segments if p] + + namespace_segments = [] + package_name = "" + package_version = "" + for i in range(len(remaining_path_segments)): + segment = remaining_path_segments[i] + segments = remaining_path_segments[: i + 1] + path = "/".join(segments) + url_segment = f"{root_url}/{path}" + if is_package_page(url_segment): + package_name = segment + elif is_package_version_page(url_segment): + package_version = segment + else: + namespace_segments.append(segment) + namespace = ".".join(namespace_segments) + return namespace, package_name, package_version + + +def add_to_import_queue(url, root_url): + """ + Create ImportableURI for the Maven repo package page at `url`. + """ + from minecode.models import ImportableURI + + data = None + response = requests.get(url) + if response: + data = response.text + namespace, name, _ = determine_namespace_name_version_from_url(url, root_url) + purl = PackageURL( + type="maven", + namespace=namespace, + name=name, + ) + importable_uri = ImportableURI.objects.insert(url, data, purl) + if importable_uri: + logger.info(f"Inserted {url} into ImportableURI queue") + + +def filter_only_directories(timestamps_by_links): + """ + Given a mapping of `timestamps_by_links`, where the links are directory names (which end with `/`), + """ + timestamps_by_links_filtered = {} + for link, timestamp in timestamps_by_links.items(): + if link != "../" and link.endswith("/"): + timestamps_by_links_filtered[link] = timestamp + return timestamps_by_links_filtered + + +valid_artifact_extensions = [ + "ejb3", + "ear", + "aar", + "apk", + "gem", + "jar", + "nar", + # 'pom', + "so", + "swc", + "tar", + "tar.gz", + "war", + "xar", + "zip", +] + + +def filter_for_artifacts(timestamps_by_links): + """ + Given a mapping of `timestamps_by_links`, where the links are the filenames + of Maven artifacts, return a mapping of filenames whose extension is in + `valid_artifact_extensions` and their timestamps. + """ + timestamps_by_links_filtered = {} + for link, timestamp in timestamps_by_links.items(): + for ext in valid_artifact_extensions: + if link.endswith(ext): + timestamps_by_links_filtered[link] = timestamp + return timestamps_by_links_filtered + + +def collect_links_from_text(text, filter): + """ + Return a mapping of link locations and their timestamps, given HTML `text` + content, that is filtered using `filter`. + """ + links_and_timestamps = collect_links_and_artifact_timestamps(text) + timestamps_by_links = {} + for link, timestamp in links_and_timestamps: + if timestamp == "-": + timestamp = "" + timestamps_by_links[link] = timestamp + + timestamps_by_links = filter(timestamps_by_links=timestamps_by_links) + return timestamps_by_links + + +def create_absolute_urls_for_links(text, url, filter): + """ + Given the `text` contents from `url`, return a mapping of absolute URLs to + links from `url` and their timestamps, that is then filtered by `filter`. + """ + timestamps_by_absolute_links = {} + url = url.rstrip("/") + timestamps_by_links = collect_links_from_text(text, filter) + for link, timestamp in timestamps_by_links.items(): + if not link.startswith(url): + link = f"{url}/{link}" + timestamps_by_absolute_links[link] = timestamp + return timestamps_by_absolute_links + + +def get_directory_links(url): + """ + Return a list of absolute directory URLs of the hyperlinks from `url` + """ + timestamps_by_directory_links = {} + response = requests.get(url) + if response: + timestamps_by_directory_links = create_absolute_urls_for_links( + response.text, url=url, filter=filter_only_directories + ) + return timestamps_by_directory_links + + +def get_artifact_links(url): + """ + Return a list of absolute directory URLs of the hyperlinks from `url` + """ + timestamps_by_artifact_links = [] + response = requests.get(url) + if response: + timestamps_by_artifact_links = create_absolute_urls_for_links( + response.text, url=url, filter=filter_for_artifacts + ) + return timestamps_by_artifact_links + + +def crawl_to_package(url, root_url): + """ + Given a maven repo `url`, + """ + if is_package_page(url): + add_to_import_queue(url, root_url) + return + + for link in get_directory_links(url): + crawl_to_package(link, root_url) + + +def crawl_maven_repo_from_root(root_url): + """ + Given the `url` to a maven root, traverse the repo depth-first and add + packages to the import queue. + """ + crawl_to_package(root_url, root_url) + + +def get_artifact_sha1(artifact_url): + """ + Return the SHA1 value of the Maven artifact located at `artifact_url`. + """ + sha1 = None + artifact_sha1_url = f"{artifact_url}.sha1" + response = requests.get(artifact_sha1_url) + if response: + sha1_contents = response.text.strip().split() + sha1 = sha1_contents[0] + sha1 = validate_sha1(sha1) + return sha1 + + +def get_classifier_from_artifact_url( + artifact_url, package_version_page_url, package_name, package_version +): + """ + Return the classifier from a Maven artifact URL `artifact_url`, otherwise + return None if a classifier cannot be determined from `artifact_url` + """ + classifier = None + # https://repo1.maven.org/maven2/net/alchim31/livereload-jvm/0.2.0 + package_version_page_url = package_version_page_url.rstrip("/") + # https://repo1.maven.org/maven2/net/alchim31/livereload-jvm/0.2.0/livereload-jvm-0.2.0 + leading_url_portion = f"{package_version_page_url}/{package_name}-{package_version}" + # artifact_url = 'https://repo1.maven.org/maven2/net/alchim31/livereload-jvm/0.2.0/livereload-jvm-0.2.0-onejar.jar' + # ['', '-onejar.jar'] + _, remaining_url_portion = artifact_url.split(leading_url_portion) + # ['-onejar', 'jar'] + remaining_url_portions = remaining_url_portion.split(".") + if remaining_url_portions and remaining_url_portions[0]: + # '-onejar' + classifier = remaining_url_portions[0] + if classifier.startswith("-"): + # 'onejar' + classifier = classifier[1:] + return classifier diff --git a/minecode/management/commands/get_maven_release_dates.py b/minecode/management/commands/get_maven_release_dates.py index c120b67e..dda3d868 100644 --- a/minecode/management/commands/get_maven_release_dates.py +++ b/minecode/management/commands/get_maven_release_dates.py @@ -15,8 +15,8 @@ import requests from minecode.management.commands import VerboseCommand -from minecode.visitors.maven import collect_links_from_text -from minecode.visitors.maven import filter_for_artifacts +from minecode.collectors.maven import collect_links_from_text +from minecode.collectors.maven import filter_for_artifacts from packagedb.models import Package diff --git a/minecode/management/commands/import_queue.py b/minecode/management/commands/import_queue.py index 55862921..178eaa22 100644 --- a/minecode/management/commands/import_queue.py +++ b/minecode/management/commands/import_queue.py @@ -22,15 +22,14 @@ from minecode.management.commands import get_error_message from minecode.management.commands import VerboseCommand from minecode.models import ImportableURI -from minecode.visitors.maven import get_artifact_links -from minecode.visitors.maven import get_classifier_from_artifact_url -from minecode.visitors.maven import collect_links_from_text -from minecode.visitors.maven import filter_only_directories -from minecode.visitors.maven import get_artifact_sha1 +from minecode.collectors.maven import get_artifact_links +from minecode.collectors.maven import get_classifier_from_artifact_url +from minecode.collectors.maven import collect_links_from_text +from minecode.collectors.maven import filter_only_directories +from minecode.collectors.maven import get_artifact_sha1 from minecode.model_utils import merge_or_create_package from packagedcode.models import PackageData -from packagedb.models import Package -from minecode.visitors.maven import determine_namespace_name_version_from_url +from minecode.collectors.maven import determine_namespace_name_version_from_url logger = logging.getLogger(__name__) diff --git a/minecode/management/commands/maven_crawler.py b/minecode/management/commands/maven_crawler.py index 30c8f360..ee68b163 100644 --- a/minecode/management/commands/maven_crawler.py +++ b/minecode/management/commands/maven_crawler.py @@ -10,8 +10,8 @@ import logging import sys +from minecode.collectors.maven import crawl_maven_repo_from_root from minecode.management.commands import VerboseCommand -from minecode.visitors.maven import crawl_maven_repo_from_root logger = logging.getLogger(__name__) diff --git a/minecode/tests/test_maven.py b/minecode/tests/test_maven.py index 2375f20b..5a93a601 100644 --- a/minecode/tests/test_maven.py +++ b/minecode/tests/test_maven.py @@ -17,6 +17,7 @@ from django.test import TestCase as DjangoTestCase +from minecode.collectors import maven as maven_collector from minecode.management.commands.run_map import map_uri from minecode.management.commands.run_visit import visit_uri from minecode.mappers import maven as maven_mapper @@ -700,7 +701,7 @@ def setUp(self): ) def test_get_pom_text(self, regen=False): - pom_contents = maven_visitor.get_pom_text( + pom_contents = maven_collector.get_pom_text( namespace=self.scan_package.namespace, name=self.scan_package.name, version=self.scan_package.version @@ -711,7 +712,7 @@ def test_get_pom_text(self, regen=False): self.assertEqual(self.expected_pom_contents, pom_contents) def test_get_package_sha1(self): - sha1 = maven_visitor.get_package_sha1(self.scan_package) + sha1 = maven_collector.get_package_sha1(self.scan_package) expected_sha1 = '60c708f55deeb7c5dfce8a7886ef09cbc1388eca' self.assertEqual(expected_sha1, sha1) @@ -719,7 +720,7 @@ def test_map_maven_package(self): package_count = packagedb.models.Package.objects.all().count() self.assertEqual(0, package_count) package_url = PackageURL.from_string(self.scan_package.purl) - maven_visitor.map_maven_package(package_url, packagedb.models.PackageContentType.BINARY) + maven_collector.map_maven_package(package_url, packagedb.models.PackageContentType.BINARY) package_count = packagedb.models.Package.objects.all().count() self.assertEqual(1, package_count) package = packagedb.models.Package.objects.all().first() @@ -731,7 +732,7 @@ def test_map_maven_package_custom_repo_url(self): self.assertEqual(0, package_count) custom_repo_purl = "pkg:maven/org.eclipse.core/runtime@20070801?repository_url=https://packages.atlassian.com/mvn/maven-atlassian-external/" package_url = PackageURL.from_string(custom_repo_purl) - maven_visitor.map_maven_package(package_url, packagedb.models.PackageContentType.BINARY) + maven_collector.map_maven_package(package_url, packagedb.models.PackageContentType.BINARY) package_count = packagedb.models.Package.objects.all().count() self.assertEqual(1, package_count) package = packagedb.models.Package.objects.all().first() @@ -746,7 +747,7 @@ def test_process_request(self): sources_download_url = 'https://repo1.maven.org/maven2/org/apache/twill/twill-core/0.12.0/twill-core-0.12.0-sources.jar' package_count = packagedb.models.Package.objects.all().count() self.assertEqual(0, package_count) - maven_visitor.process_request(purl_str) + maven_collector.process_request(purl_str) package_count = packagedb.models.Package.objects.all().count() self.assertEqual(2, package_count) purls = [ @@ -764,7 +765,7 @@ def test_fetch_parent(self, regen=False): pom_loc = self.get_test_loc('maven/pom/ant-antlr-1.10.1.pom') with open(pom_loc) as f: pom_text = f.read() - parent_pom_text = maven_visitor.fetch_parent(pom_text) + parent_pom_text = maven_collector.fetch_parent(pom_text) expected_loc = self.get_test_loc('maven/pom/ant-parent-1.10.1.pom') if regen: @@ -779,7 +780,7 @@ def test_get_ancestry(self): pom_loc = self.get_test_loc('maven/pom/pulsar-client-1x-2.5.1.pom') with open(pom_loc) as f: pom_text = f.read() - ancestor_pom_texts = list(maven_visitor.get_ancestry(pom_text)) + ancestor_pom_texts = list(maven_collector.get_ancestry(pom_text)) expected_ancestor_pom_texts = [] for expected_loc in [ self.get_test_loc('maven/pom/apache-18.pom'), @@ -813,7 +814,7 @@ def test_merge_parent(self, regen=False): 'Java', text=parent_pom_text ) - package = maven_visitor.merge_parent(package, parent_package) + package = maven_collector.merge_parent(package, parent_package) expected_after_loc = self.get_test_loc('maven/pom/ant-antlr-1.10.1-package_after.json') self.check_expected_results(package.to_dict(), expected_after_loc, regen=regen) @@ -840,15 +841,15 @@ def test_merge_ancestors(self, regen=False): pom_text = f.read() ancestor_pom_texts.append(pom_text) - maven_visitor.merge_ancestors(ancestor_pom_texts, package) + maven_collector.merge_ancestors(ancestor_pom_texts, package) expected_after_loc = self.get_test_loc('maven/pom/pulsar-client-1x-2.5.1-package_after.json') self.check_expected_results(package.to_dict(), expected_after_loc, regen=regen) - @mock.patch("minecode.visitors.maven.get_pom_text") + @mock.patch("minecode.collectors.maven.get_pom_text") def test_get_merged_ancestor_package_from_maven_package(self, get_pom_text_mock, regen=False): get_pom_text_mock.return_value = "" ancestor_pom_texts = [] - with patch("minecode.visitors.maven.get_ancestry") as mock_get_ancestry: + with patch("minecode.collectors.maven.get_ancestry") as mock_get_ancestry: for loc in [ self.get_test_loc('maven/pom/apache-18.pom'), self.get_test_loc('maven/pom/pulsar-2.5.1.pom'), @@ -865,7 +866,7 @@ def test_get_merged_ancestor_package_from_maven_package(self, get_pom_text_mock, type="maven", download_url="https://repo1.maven.org/maven2/org/apache/pulsar/pulsar-client/2.5.1/pulsar-client-2.5.1.jar", ) - merged_package = maven_visitor.get_merged_ancestor_package_from_maven_package(package=db_package) + merged_package = maven_collector.get_merged_ancestor_package_from_maven_package(package=db_package) expected_loc = self.get_test_loc('maven/pom/pulsar-client-merged-ancestor-package.json') self.check_expected_results(merged_package.to_dict(), expected_loc, regen=regen) @@ -876,60 +877,60 @@ class MavenCrawlerFunctionsTest(JsonBasedTesting, DjangoTestCase): def test_check_if_file_name_is_linked_on_page(self): links = ['foo/', 'bar/', 'baz/'] self.assertTrue( - maven_visitor.check_if_file_name_is_linked_on_page('foo/', links) + maven_collector.check_if_file_name_is_linked_on_page('foo/', links) ) self.assertFalse( - maven_visitor.check_if_file_name_is_linked_on_page('qux/', links) + maven_collector.check_if_file_name_is_linked_on_page('qux/', links) ) def test_check_if_page_has_pom_files(self): links1 = ['foo/', 'bar.jar', 'bar.pom'] links2 = ['foo/', 'bar.jar'] - self.assertTrue(maven_visitor.check_if_page_has_pom_files(links1)) - self.assertFalse(maven_visitor.check_if_page_has_pom_files(links2)) + self.assertTrue(maven_collector.check_if_page_has_pom_files(links1)) + self.assertFalse(maven_collector.check_if_page_has_pom_files(links2)) def test_check_if_page_has_directories(self): links1 = ['foo/', 'bar/', 'baz/'] links2 = ['../', 'bar.pom', 'bar.jar'] - self.assertTrue(maven_visitor.check_if_page_has_directories(links1)) - self.assertFalse(maven_visitor.check_if_page_has_directories(links2)) + self.assertTrue(maven_collector.check_if_page_has_directories(links1)) + self.assertFalse(maven_collector.check_if_page_has_directories(links2)) def test_check_if_package_version_page(self): links1 = ['../', 'bar.pom', 'bar.jar'] links2 = ['../', 'foo/', 'bar/', 'baz/'] - self.assertTrue(maven_visitor.check_if_package_version_page(links1)) - self.assertFalse(maven_visitor.check_if_package_version_page(links2)) + self.assertTrue(maven_collector.check_if_package_version_page(links1)) + self.assertFalse(maven_collector.check_if_package_version_page(links2)) def test_check_if_package_page(self): links1 = ['../', 'maven-metadata.xml'] links2 = ['../', 'bar.pom', 'bar.jar'] - self.assertTrue(maven_visitor.check_if_package_page(links1)) - self.assertFalse(maven_visitor.check_if_package_page(links2)) + self.assertTrue(maven_collector.check_if_package_page(links1)) + self.assertFalse(maven_collector.check_if_package_page(links2)) def test_check_if_maven_root(self): links1 = ['../', 'archetype-catalog.xml'] links2 = ['../', 'bar.pom', 'bar.jar'] - self.assertTrue(maven_visitor.check_if_maven_root(links1)) - self.assertFalse(maven_visitor.check_if_maven_root(links2)) + self.assertTrue(maven_collector.check_if_maven_root(links1)) + self.assertFalse(maven_collector.check_if_maven_root(links2)) @mock.patch('requests.get') def test_check_on_page(self, mock_request_get): - checker = maven_visitor.check_if_page_has_pom_files + checker = maven_collector.check_if_page_has_pom_files mock_request_get.return_value.ok = True mock_request_get.return_value.text = 'parent-7.11.0.pom' - self.assertTrue(maven_visitor.check_on_page('https://repo1.maven.org/maven2/net/shibboleth/parent/7.11.0/', checker)) + self.assertTrue(maven_collector.check_on_page('https://repo1.maven.org/maven2/net/shibboleth/parent/7.11.0/', checker)) @mock.patch('requests.get') def test_is_maven_root(self, mock_request_get): mock_request_get.return_value.ok = True mock_request_get.return_value.text = 'archetype-catalog.xml' - self.assertTrue(maven_visitor.is_maven_root('https://repo1.maven.org/maven2/')) + self.assertTrue(maven_collector.is_maven_root('https://repo1.maven.org/maven2/')) @mock.patch('requests.get') def test_is_package_page(self, mock_request_get): mock_request_get.return_value.ok = True mock_request_get.return_value.text = 'maven-metadata.xml' - self.assertTrue(maven_visitor.is_package_page('https://repo1.maven.org/maven2/xml-apis/xml-apis/')) + self.assertTrue(maven_collector.is_package_page('https://repo1.maven.org/maven2/xml-apis/xml-apis/')) @mock.patch('requests.get') def test_is_package_version_page(self, mock_request_get): @@ -938,11 +939,11 @@ def test_is_package_version_page(self, mock_request_get): ../ parent-7.11.0.pom ''' - self.assertTrue(maven_visitor.is_package_version_page('https://repo1.maven.org/maven2/xml-apis/xml-apis/1.0.b2/')) + self.assertTrue(maven_collector.is_package_version_page('https://repo1.maven.org/maven2/xml-apis/xml-apis/1.0.b2/')) def test_url_parts(self): url = 'https://example.com/foo/bar/baz.jar' - scheme, netloc, path_segments = maven_visitor.url_parts(url) + scheme, netloc, path_segments = maven_collector.url_parts(url) self.assertEqual('https', scheme) self.assertEqual('example.com', netloc) self.assertEquals(['foo', 'bar', 'baz.jar'], path_segments) @@ -954,7 +955,7 @@ def test_create_url(self): url = 'https://example.com/foo/bar/baz.jar' self.assertEqual( url, - maven_visitor.create_url(scheme, netloc, path_segments) + maven_collector.create_url(scheme, netloc, path_segments) ) @mock.patch('requests.get') @@ -963,7 +964,7 @@ def test_get_maven_root(self, mock_request_get): mock_request_get.return_value.text = 'archetype-catalog.xml' self.assertEqual( 'https://repo1.maven.org/maven2', - maven_visitor.get_maven_root('https://repo1.maven.org/maven2/net/shibboleth/parent/7.11.0/') + maven_collector.get_maven_root('https://repo1.maven.org/maven2/net/shibboleth/parent/7.11.0/') ) @mock.patch('requests.get') @@ -993,7 +994,7 @@ def test_determine_namespace_name_version_from_url(self, mock_request_get): package_version_page ] - namespace, package_name, package_version = maven_visitor.determine_namespace_name_version_from_url(url, root_url) + namespace, package_name, package_version = maven_collector.determine_namespace_name_version_from_url(url, root_url) self.assertEqual('xml-apis', namespace) self.assertEqual('xml-apis', package_name) self.assertEqual('1.0.b2', package_version) @@ -1029,7 +1030,7 @@ def test_add_to_import_queue(self, mock_request_get): ] self.assertEqual(0, ImportableURI.objects.all().count()) - maven_visitor.add_to_import_queue(url, root_url ) + maven_collector.add_to_import_queue(url, root_url ) self.assertEqual(1, ImportableURI.objects.all().count()) importable_uri = ImportableURI.objects.get(uri=url) self.assertEqual('pkg:maven/xml-apis/xml-apis', importable_uri.package_url) @@ -1045,7 +1046,7 @@ def test_filter_only_directories(self): } self.assertEqual( expected, - maven_visitor.filter_only_directories(timestamps_by_links) + maven_collector.filter_only_directories(timestamps_by_links) ) def test_filter_for_artifacts(self): @@ -1083,10 +1084,10 @@ def test_filter_for_artifacts(self): 'foo.xar': '2023-09-28', 'foo.zip': '2023-09-28', } - self.assertEqual(expected, maven_visitor.filter_for_artifacts(timestamps_by_links)) + self.assertEqual(expected, maven_collector.filter_for_artifacts(timestamps_by_links)) def test_collect_links_from_text(self): - filter = maven_visitor.filter_only_directories + filter = maven_collector.filter_only_directories text = ''' ../ 1.0.b2/ @@ -1100,11 +1101,11 @@ def test_collect_links_from_text(self): } self.assertEqual( expected, - maven_visitor.collect_links_from_text(text, filter=filter) + maven_collector.collect_links_from_text(text, filter=filter) ) def test_create_absolute_urls_for_links(self): - filter = maven_visitor.filter_only_directories + filter = maven_collector.filter_only_directories text = ''' ../ 1.0.b2/ @@ -1119,7 +1120,7 @@ def test_create_absolute_urls_for_links(self): } self.assertEqual( expected, - maven_visitor.create_absolute_urls_for_links(text, url, filter=filter) + maven_collector.create_absolute_urls_for_links(text, url, filter=filter) ) @mock.patch('requests.get') @@ -1137,7 +1138,7 @@ def test_get_directory_links(self, mock_request_get): 'https://repo1.maven.org/maven2/xml-apis/xml-apis/1.0.b2/': '2005-09-20 05:53', 'https://repo1.maven.org/maven2/xml-apis/xml-apis/1.2.01/': '2010-02-03 21:05' } - self.assertEqual(expected, maven_visitor.get_directory_links(url)) + self.assertEqual(expected, maven_collector.get_directory_links(url)) @mock.patch('requests.get') def test_get_artifact_links(self, mock_request_get): @@ -1153,7 +1154,7 @@ def test_get_artifact_links(self, mock_request_get): expected = { 'https://repo1.maven.org/maven2/xml-apis/xml-apis/1.0.b2/xml-apis-1.0.b2.jar': '2005-09-20 05:53', } - self.assertEqual(expected, maven_visitor.get_artifact_links(url)) + self.assertEqual(expected, maven_collector.get_artifact_links(url)) def test_crawl_to_package(self): pass @@ -1166,14 +1167,14 @@ def test_get_artifact_sha1(self, mock_request_get): sha1 = '3136ca936f64c9d68529f048c2618bd356bf85c9' mock_request_get.return_value.ok = True mock_request_get.return_value.text = sha1 - self.assertEqual(sha1, maven_visitor.get_artifact_sha1('https://repo1.maven.org/maven2/xml-apis/xml-apis/1.0.b2/xml-apis-1.0.b2.jar.sha1')) + self.assertEqual(sha1, maven_collector.get_artifact_sha1('https://repo1.maven.org/maven2/xml-apis/xml-apis/1.0.b2/xml-apis-1.0.b2.jar.sha1')) def test_get_classifier_from_artifact_url(self): artifact_url = 'https://repo1.maven.org/maven2/net/alchim31/livereload-jvm/0.2.0/livereload-jvm-0.2.0-onejar.jar' package_version_page_url = 'https://repo1.maven.org/maven2/net/alchim31/livereload-jvm/0.2.0/' package_name = 'livereload-jvm' package_version = '0.2.0' - classifier = maven_visitor.get_classifier_from_artifact_url( + classifier = maven_collector.get_classifier_from_artifact_url( artifact_url, package_version_page_url, package_name, diff --git a/minecode/visitors/maven.py b/minecode/visitors/maven.py index 55624772..0be85a70 100644 --- a/minecode/visitors/maven.py +++ b/minecode/visitors/maven.py @@ -9,16 +9,11 @@ from collections import namedtuple import gzip -import hashlib import io import json import logging -import re -from typing import Dict -from urllib.parse import urlparse import arrow -import requests from bs4 import BeautifulSoup from dateutil import tz @@ -28,21 +23,13 @@ from packageurl import PackageURL from packagedcode.maven import build_filename from packagedcode.maven import build_url -from packagedcode.maven import get_urls -from packagedcode.maven import _parse -from packagedcode.maven import get_maven_pom -from packageurl import PackageURL from minecode import seed -from minecode import priority_router from minecode import visit_router from minecode.visitors import java_stream from minecode.visitors import HttpVisitor from minecode.visitors import NonPersistentHttpVisitor from minecode.visitors import URI -from packagedb.models import make_relationship -from packagedb.models import PackageContentType -from packagedb.models import PackageRelation """ This module handles the Maven repositories such as central and other @@ -109,690 +96,6 @@ def get_seeds(self): # also has a npm mirrors: https://maven-eu.nuxeo.org/nexus/#view-repositories;npmjs~browsestorage -def get_pom_text(namespace, name, version, qualifiers={}, base_url=MAVEN_BASE_URL): - """ - Return the contents of the POM file of the package described by the purl - field arguments in a string. - """ - # Create URLs using purl fields - if qualifiers and not isinstance(qualifiers, Dict): - return - urls = get_urls( - namespace=namespace, - name=name, - version=version, - qualifiers=qualifiers, - base_url=base_url, - ) - # Get and parse POM info - pom_url = urls['api_data_url'] - # TODO: manage different types of errors (404, etc.) - response = requests.get(pom_url) - if not response: - return - return response.text - - -def get_package_sha1(package): - """ - Return the sha1 value for `package` by checking if the sha1 file exists for - `package` on maven and returning the contents if it does. - - If the sha1 is invalid, we download the package's JAR and calculate the sha1 - from that. - """ - download_url = package.repository_download_url - sha1_download_url = f'{download_url}.sha1' - response = requests.get(sha1_download_url) - if response.ok: - sha1_contents = response.text.strip().split() - sha1 = sha1_contents[0] - sha1 = validate_sha1(sha1) - if not sha1: - # Download JAR and calculate sha1 if we cannot get it from the repo - response = requests.get(download_url) - if response: - sha1_hash = hashlib.new('sha1', response.content) - sha1 = sha1_hash.hexdigest() - return sha1 - - -def fetch_parent(pom_text, base_url=MAVEN_BASE_URL): - """ - Return the parent pom text of `pom_text`, or None if `pom_text` has no parent. - """ - if not pom_text: - return - pom = get_maven_pom(text=pom_text) - if ( - pom.parent - and pom.parent.group_id - and pom.parent.artifact_id - and pom.parent.version.version - ): - parent_namespace = pom.parent.group_id - parent_name = pom.parent.artifact_id - parent_version = str(pom.parent.version.version) - parent_pom_text = get_pom_text( - namespace=parent_namespace, - name=parent_name, - version=parent_version, - qualifiers={}, - base_url=base_url, - ) - return parent_pom_text - - -def get_ancestry(pom_text, base_url=MAVEN_BASE_URL): - """ - Return a list of pom text of the ancestors of `pom`. The list is ordered - from oldest ancestor to newest. The list is empty is there is no parent pom. - """ - ancestors = [] - has_parent = True - while has_parent: - parent_pom_text = fetch_parent(pom_text=pom_text, base_url=base_url) - if not parent_pom_text: - has_parent = False - else: - ancestors.append(parent_pom_text) - pom_text = parent_pom_text - return reversed(ancestors) - - -def get_merged_ancestor_package_from_maven_package(package, base_url=MAVEN_BASE_URL): - """ - Merge package details of a package with its ancestor pom - and return the merged package. - """ - if not package: - return - pom_text = get_pom_text( - name=package.name, - namespace=package.namespace, - version=package.version, - qualifiers=package.qualifiers, - base_url=base_url, - ) - merged_package = merge_ancestors( - ancestor_pom_texts=get_ancestry(pom_text), - package=package, - ) - return merged_package - - -def merge_parent(package, parent_package): - """ - Merge `parent_package` data into `package` and return `package. - """ - mergeable_fields = ( - 'declared_license_expression', - 'homepage_url', - 'parties', - ) - for field in mergeable_fields: - # If `field` is empty on the package we're looking at, populate - # those fields with values from the parent package. - if not getattr(package, field): - value = getattr(parent_package, field) - setattr(package, field, value) - - msg = f'Field `{field}` has been updated using values obtained from the parent POM {parent_package.purl}' - history = package.extra_data.get('history') - if history: - package.extra_data['history'].append(msg) - else: - package.extra_data['history'] = [msg] - - return package - - -def merge_ancestors(ancestor_pom_texts, package): - """ - Merge metadata from `ancestor_pom_text` into `package`. - - The order of POM content in `ancestor_pom_texts` is expected to be in the - order of oldest ancestor to newest. - """ - for ancestor_pom_text in ancestor_pom_texts: - ancestor_package = _parse( - datasource_id='maven_pom', - package_type='maven', - primary_language='Java', - text=ancestor_pom_text - ) - package = merge_parent(package, ancestor_package) - return package - - -def map_maven_package(package_url, package_content): - """ - Add a maven `package_url` to the PackageDB. - - Return an error string if errors have occured in the process. - """ - from minecode.model_utils import add_package_to_scan_queue - from minecode.model_utils import merge_or_create_package - - db_package = None - error = '' - - if "repository_url" in package_url.qualifiers: - base_url = package_url.qualifiers["repository_url"] - else: - base_url = MAVEN_BASE_URL - - pom_text = get_pom_text( - namespace=package_url.namespace, - name=package_url.name, - version=package_url.version, - qualifiers=package_url.qualifiers, - base_url=base_url, - ) - if not pom_text: - msg = f'Package does not exist on maven: {package_url}' - error += msg + '\n' - logger.error(msg) - return db_package, error - - package = _parse( - 'maven_pom', - 'maven', - 'Java', - text=pom_text, - base_url=base_url, - ) - ancestor_pom_texts = get_ancestry(pom_text=pom_text, base_url=base_url) - package = merge_ancestors( - ancestor_pom_texts=ancestor_pom_texts, - package=package - ) - - - urls = get_urls( - namespace=package_url.namespace, - name=package_url.name, - version=package_url.version, - qualifiers=package_url.qualifiers, - base_url=base_url, - ) - # In the case of looking up a maven package with qualifiers of - # `classifiers=sources`, the purl of the package created from the pom does - # not have the qualifiers, so we need to set them. Additionally, the download - # url is not properly generated since it would be missing the sources bit - # from the filename. - package.qualifiers = package_url.qualifiers - package.download_url = urls['repository_download_url'] - package.repository_download_url = urls['repository_download_url'] - - # Set package_content value - package.extra_data['package_content'] = package_content - - # If sha1 exists for a jar, we know we can create the package - # Use pom info as base and create packages for binary and source package - - # Check to see if binary is available - sha1 = get_package_sha1(package) - if sha1: - package.sha1 = sha1 - db_package, _, _, _ = merge_or_create_package(package, visit_level=50) - else: - msg = f'Failed to retrieve JAR: {package_url}' - error += msg + '\n' - logger.error(msg) - - # Submit package for scanning - if db_package: - add_package_to_scan_queue(db_package) - - return db_package, error - - -def validate_sha1(sha1): - """ - Validate a `sha1` string. - - Return `sha1` if it is valid, None otherwise. - """ - if sha1 and len(sha1) != 40: - logger.warning( - f'Invalid SHA1 length ({len(sha1)}): "{sha1}": SHA1 ignored!' - ) - sha1 = None - return sha1 - - -def map_maven_binary_and_source(package_url): - """ - Get metadata for the binary and source release of the Maven package - `package_url` and save it to the PackageDB. - - Return an error string for errors that occur, or empty string if there is no error. - """ - error = '' - package, emsg = map_maven_package( - package_url, - PackageContentType.BINARY - ) - if emsg: - error += emsg - - source_package_url = package_url - source_package_url.qualifiers['classifier'] = 'sources' - source_package, emsg = map_maven_package( - source_package_url, - PackageContentType.SOURCE_ARCHIVE - ) - if emsg: - error += emsg - - if package and source_package: - make_relationship( - from_package=source_package, - to_package=package, - relationship=PackageRelation.Relationship.SOURCE_PACKAGE - ) - - return error - - -def map_maven_packages(package_url): - """ - Given a valid `package_url` with no version, get metadata for the binary and - source release for each version of the Maven package `package_url` and save - it to the PackageDB. - - Return an error string for errors that occur, or empty string if there is no error. - """ - error = '' - namespace = package_url.namespace - name = package_url.name - # Find all versions of this package - query_params = f'g:{namespace}+AND+a:{name}' - url = f'https://search.maven.org/solrsearch/select?q={query_params}&core=gav' - response = requests.get(url) - if response: - package_listings = response.json().get('response', {}).get('docs', []) - for listing in package_listings: - purl = PackageURL( - type='maven', - namespace=listing.get('g'), - name=listing.get('a'), - version=listing.get('v') - ) - emsg = map_maven_binary_and_source(purl) - if emsg: - error += emsg - return error - - -@priority_router.route('pkg:maven/.*') -def process_request(purl_str): - """ - Process `priority_resource_uri` containing a maven Package URL (PURL) as a - URI. - - This involves obtaining Package information for the PURL from maven and - using it to create a new PackageDB entry. The package is then added to the - scan queue afterwards. We also get the Package information for the - accompanying source package and add it to the PackageDB and scan queue, if - available. - - Return an error string for errors that occur, or empty string if there is no error. - """ - try: - package_url = PackageURL.from_string(purl_str) - except ValueError as e: - error = f'error occured when parsing {purl_str}: {e}' - return error - - has_version = bool(package_url.version) - if has_version: - error = map_maven_binary_and_source(package_url) - else: - error = map_maven_packages(package_url) - - return error - - -collect_links = re.compile(r'href="([^"]+)"').findall -collect_links_and_artifact_timestamps = re.compile( - r'\s+(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}|-)' -).findall - - -def check_if_file_name_is_linked_on_page(file_name, links, **kwargs): - """ - Return True if `file_name` is in `links` - """ - return any(l.endswith(file_name) for l in links) - - -def check_if_page_has_pom_files(links, **kwargs): - """ - Return True of any entry in `links` ends with .pom. - """ - return any(l.endswith('.pom') for l in links) - - -def check_if_page_has_directories(links, **kwargs): - """ - Return True if any entry, excluding "../", ends with /. - """ - return any(l.endswith('/') for l in links if l != '../') - - -def check_if_package_version_page(links, **kwargs): - """ - Return True if `links` contains pom files and has no directories - """ - return ( - check_if_page_has_pom_files(links=links) - and not check_if_page_has_directories(links=links) - ) - - -def check_if_package_page(links, **kwargs): - return ( - check_if_file_name_is_linked_on_page(file_name='maven-metadata.xml', links=links) - and not check_if_page_has_pom_files(links=links) - ) - - -def check_if_maven_root(links, **kwargs): - """ - Return True if "archetype-catalog.xml" is in `links`, as the root of a Maven - repo contains "archetype-catalog.xml". - """ - return check_if_file_name_is_linked_on_page(file_name='archetype-catalog.xml', links=links) - - -def check_on_page(url, checker): - """ - Return True if there is a link on `url` that is the same as `file_name`, - False otherwise. - """ - response = requests.get(url) - if response: - links = collect_links(response.text) - return checker(links=links) - return False - - -def is_maven_root(url): - """ - Return True if `url` is the root of a Maven repo, False otherwise. - """ - return check_on_page(url, check_if_maven_root) - - -def is_package_page(url): - """ - Return True if `url` is a package page on a Maven repo, False otherwise. - """ - return check_on_page(url, check_if_package_page) - - -def is_package_version_page(url): - """ - Return True if `url` is a package version page on a Maven repo, False otherwise. - """ - return check_on_page(url, check_if_package_version_page) - - -def url_parts(url): - parsed_url = urlparse(url) - scheme = parsed_url.scheme - netloc = parsed_url.netloc - path_segments = [p for p in parsed_url.path.split('/') if p] - return scheme, netloc, path_segments - - -def create_url(scheme, netloc, path_segments): - url_template = f'{scheme}://{netloc}' - path = '/'.join(path_segments) - return f'{url_template}/{path}' - - -def get_maven_root(url): - """ - Given `url`, that is a URL to namespace, package, or artifact in a Maven - repo, return the URL to the root of that repo. If a Maven root cannot be - determined, return None. - - >>> get_maven_root('https://repo1.maven.org/maven2/net/shibboleth/parent/7.11.0/') - 'https://repo1.maven.org/maven2' - """ - scheme, netloc, path_segments = url_parts(url) - for i in range(len(path_segments)): - segments = path_segments[:i+1] - url_segment = create_url(scheme, netloc, segments) - if is_maven_root(url_segment): - return url_segment - return None - - -def determine_namespace_name_version_from_url(url, root_url=None): - """ - Return a 3-tuple containing strings of a Package namespace, name, and - version, determined from `url`, where `url` points to namespace, package, - specific package version, or artifact on a Maven repo. - - Return None if a Maven root cannot be determined from `url`. - - >>> determine_namespace_name_version_from_url('https://repo1.maven.org/maven2/net/shibboleth/parent/7.11.0/') - ('net.shibboleth', 'parent', '7.11.0') - """ - if not root_url: - root_url = get_maven_root(url) - if not root_url: - raise Exception(f'Error: not a Maven repository: {url}') - - _, remaining_path_segments = url.split(root_url) - remaining_path_segments = remaining_path_segments.split('/') - remaining_path_segments = [p for p in remaining_path_segments if p] - - namespace_segments = [] - package_name = '' - package_version = '' - for i in range(len(remaining_path_segments)): - segment = remaining_path_segments[i] - segments = remaining_path_segments[:i+1] - path = '/'.join(segments) - url_segment = f'{root_url}/{path}' - if is_package_page(url_segment): - package_name = segment - elif is_package_version_page(url_segment): - package_version = segment - else: - namespace_segments.append(segment) - namespace = '.'.join(namespace_segments) - return namespace, package_name, package_version - - -def add_to_import_queue(url, root_url): - """ - Create ImportableURI for the Maven repo package page at `url`. - """ - from minecode.models import ImportableURI - data = None - response = requests.get(url) - if response: - data = response.text - namespace, name, _ = determine_namespace_name_version_from_url(url, root_url) - purl = PackageURL( - type='maven', - namespace=namespace, - name=name, - ) - importable_uri = ImportableURI.objects.insert(url, data, purl) - if importable_uri: - logger.info(f'Inserted {url} into ImportableURI queue') - - -def filter_only_directories(timestamps_by_links): - """ - Given a mapping of `timestamps_by_links`, where the links are directory names (which end with `/`), - """ - timestamps_by_links_filtered = {} - for link, timestamp in timestamps_by_links.items(): - if link != '../' and link.endswith('/'): - timestamps_by_links_filtered[link] = timestamp - return timestamps_by_links_filtered - - -valid_artifact_extensions = [ - 'ejb3', - 'ear', - 'aar', - 'apk', - 'gem', - 'jar', - 'nar', - # 'pom', - 'so', - 'swc', - 'tar', - 'tar.gz', - 'war', - 'xar', - 'zip', -] - - -def filter_for_artifacts(timestamps_by_links): - """ - Given a mapping of `timestamps_by_links`, where the links are the filenames - of Maven artifacts, return a mapping of filenames whose extension is in - `valid_artifact_extensions` and their timestamps. - """ - timestamps_by_links_filtered = {} - for link, timestamp in timestamps_by_links.items(): - for ext in valid_artifact_extensions: - if link.endswith(ext): - timestamps_by_links_filtered[link] = timestamp - return timestamps_by_links_filtered - - -def collect_links_from_text(text, filter): - """ - Return a mapping of link locations and their timestamps, given HTML `text` - content, that is filtered using `filter`. - """ - links_and_timestamps = collect_links_and_artifact_timestamps(text) - timestamps_by_links = {} - for link, timestamp in links_and_timestamps: - if timestamp == '-': - timestamp = '' - timestamps_by_links[link] = timestamp - - timestamps_by_links = filter(timestamps_by_links=timestamps_by_links) - return timestamps_by_links - - -def create_absolute_urls_for_links(text, url, filter): - """ - Given the `text` contents from `url`, return a mapping of absolute URLs to - links from `url` and their timestamps, that is then filtered by `filter`. - """ - timestamps_by_absolute_links = {} - url = url.rstrip('/') - timestamps_by_links = collect_links_from_text(text, filter) - for link, timestamp in timestamps_by_links.items(): - if not link.startswith(url): - link = f'{url}/{link}' - timestamps_by_absolute_links[link] = timestamp - return timestamps_by_absolute_links - - -def get_directory_links(url): - """ - Return a list of absolute directory URLs of the hyperlinks from `url` - """ - timestamps_by_directory_links = {} - response = requests.get(url) - if response: - timestamps_by_directory_links = create_absolute_urls_for_links( - response.text, - url=url, - filter=filter_only_directories - ) - return timestamps_by_directory_links - - -def get_artifact_links(url): - """ - Return a list of absolute directory URLs of the hyperlinks from `url` - """ - timestamps_by_artifact_links = [] - response = requests.get(url) - if response: - timestamps_by_artifact_links = create_absolute_urls_for_links( - response.text, - url=url, - filter=filter_for_artifacts - ) - return timestamps_by_artifact_links - - -def crawl_to_package(url, root_url): - """ - Given a maven repo `url`, - """ - if is_package_page(url): - add_to_import_queue(url, root_url) - return - - for link in get_directory_links(url): - crawl_to_package(link, root_url) - - -def crawl_maven_repo_from_root(root_url): - """ - Given the `url` to a maven root, traverse the repo depth-first and add - packages to the import queue. - """ - crawl_to_package(root_url, root_url) - - -def get_artifact_sha1(artifact_url): - """ - Return the SHA1 value of the Maven artifact located at `artifact_url`. - """ - sha1 = None - artifact_sha1_url = f'{artifact_url}.sha1' - response = requests.get(artifact_sha1_url) - if response: - sha1_contents = response.text.strip().split() - sha1 = sha1_contents[0] - sha1 = validate_sha1(sha1) - return sha1 - - -def get_classifier_from_artifact_url(artifact_url, package_version_page_url, package_name, package_version): - """ - Return the classifier from a Maven artifact URL `artifact_url`, otherwise - return None if a classifier cannot be determined from `artifact_url` - """ - classifier = None - # https://repo1.maven.org/maven2/net/alchim31/livereload-jvm/0.2.0 - package_version_page_url = package_version_page_url.rstrip('/') - # https://repo1.maven.org/maven2/net/alchim31/livereload-jvm/0.2.0/livereload-jvm-0.2.0 - leading_url_portion = f'{package_version_page_url}/{package_name}-{package_version}' - # artifact_url = 'https://repo1.maven.org/maven2/net/alchim31/livereload-jvm/0.2.0/livereload-jvm-0.2.0-onejar.jar' - # ['', '-onejar.jar'] - _, remaining_url_portion = artifact_url.split(leading_url_portion) - # ['-onejar', 'jar'] - remaining_url_portions = remaining_url_portion.split('.') - if remaining_url_portions and remaining_url_portions[0]: - # '-onejar' - classifier = remaining_url_portions[0] - if classifier.startswith('-'): - # 'onejar' - classifier = classifier[1:] - return classifier - - @visit_router.route('http://repo1\.maven\.org/maven2/\.index/nexus-maven-repository-index.properties') @visit_router.route('https://repo1\.maven\.org/maven2/\.index/nexus-maven-repository-index.properties') class MavenNexusPropertiesVisitor(NonPersistentHttpVisitor): diff --git a/packagedb/find_source_repo.py b/packagedb/find_source_repo.py index 83f9eba0..2ec7fe18 100644 --- a/packagedb/find_source_repo.py +++ b/packagedb/find_source_repo.py @@ -21,7 +21,7 @@ from scancode.api import get_urls as get_urls_from_location from minecode.model_utils import add_package_to_scan_queue -from minecode.visitors.maven import get_merged_ancestor_package_from_maven_package +from minecode.collectors.maven import get_merged_ancestor_package_from_maven_package from packagedb.models import Package from packagedb.models import PackageContentType from packagedb.models import PackageSet @@ -170,7 +170,7 @@ def add_source_repo_to_package_set( package_content=PackageContentType.SOURCE_REPO, ) package_sets = package.package_sets.all() - if not package_sets: + if not package_sets: # Create a Package set if we don't have one package_set = PackageSet.objects.create() package_set.add_to_package_set(package) diff --git a/packagedb/management/commands/create_source_repo_packages.py b/packagedb/management/commands/create_source_repo_packages.py index 612b7eb8..bdccbc33 100644 --- a/packagedb/management/commands/create_source_repo_packages.py +++ b/packagedb/management/commands/create_source_repo_packages.py @@ -9,7 +9,6 @@ import logging import sys -from uuid import uuid4 import openpyxl from packageurl.contrib.django.utils import purl_to_lookups @@ -94,10 +93,10 @@ def handle(self, *args, **options): # binary packages can only be part of one package set add_source_repo_to_package_set(source_repo_type = row['source_type'], source_repo_name = row['source_name'], - source_repo_namespace = row['source_namespace'], + source_repo_namespace = row['source_namespace'], source_repo_version = row['source_version'], download_url = row['source_download_url'], - purl=purl, - source_purl=source_purl, + purl=purl, + source_purl=source_purl, package=package, ) From ada55ea5f2d237747db2c5bbe0157122b48a1ad6 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Mon, 5 Feb 2024 18:08:59 -0800 Subject: [PATCH 02/31] Move npm code #283 Signed-off-by: Jono Yang --- minecode/collectors/npm.py | 84 ++++++++++++++++++++++++++++++++++++++ minecode/tests/test_npm.py | 5 ++- minecode/visitors/npm.py | 78 ----------------------------------- 3 files changed, 87 insertions(+), 80 deletions(-) create mode 100644 minecode/collectors/npm.py diff --git a/minecode/collectors/npm.py b/minecode/collectors/npm.py new file mode 100644 index 00000000..818e645f --- /dev/null +++ b/minecode/collectors/npm.py @@ -0,0 +1,84 @@ +import logging + +import requests +from packagedcode.npm import NpmPackageJsonHandler, npm_api_url +from packageurl import PackageURL + +from minecode import priority_router +from packagedb.models import PackageContentType + +logger = logging.getLogger(__name__) +handler = logging.StreamHandler() +logger.addHandler(handler) +logger.setLevel(logging.INFO) + + +def get_package_json(namespace, name, version): + """ + Return the contents of the package.json file of the package described by the purl + field arguments in a string. + """ + # Create URLs using purl fields + url = npm_api_url( + namespace=namespace, + name=name, + version=version, + ) + + try: + response = requests.get(url) + response.raise_for_status() + return response.json() + except requests.exceptions.HTTPError as err: + logger.error(f"HTTP error occurred: {err}") + + +def map_npm_package(package_url): + """ + Add a npm `package_url` to the PackageDB. + + Return an error string if any errors are encountered during the process + """ + from minecode.model_utils import add_package_to_scan_queue, merge_or_create_package + + package_json = get_package_json( + namespace=package_url.namespace, + name=package_url.name, + version=package_url.version, + ) + + if not package_json: + error = f"Package does not exist on npmjs: {package_url}" + logger.error(error) + return error + + package = NpmPackageJsonHandler._parse(json_data=package_json) + package.extra_data["package_content"] = PackageContentType.SOURCE_ARCHIVE + + db_package, _, _, error = merge_or_create_package(package, visit_level=0) + + # Submit package for scanning + if db_package: + add_package_to_scan_queue(db_package) + + return error + + +@priority_router.route("pkg:npm/.*") +def process_request(purl_str): + """ + Process `priority_resource_uri` containing a npm Package URL (PURL) as a + URI. + + This involves obtaining Package information for the PURL from npm and + using it to create a new PackageDB entry. The package is then added to the + scan queue afterwards. + """ + package_url = PackageURL.from_string(purl_str) + if not package_url.version: + return + + error_msg = map_npm_package(package_url) + + if error_msg: + return error_msg diff --git a/minecode/tests/test_npm.py b/minecode/tests/test_npm.py index 745aabe2..eb36ef1c 100644 --- a/minecode/tests/test_npm.py +++ b/minecode/tests/test_npm.py @@ -22,6 +22,7 @@ import packagedb from minecode import mappers from minecode import route +from minecode.collectors import npm as npm_collector from minecode.models import ResourceURI from minecode.utils_test import JsonBasedTesting from minecode.utils_test import mocked_requests_get @@ -184,7 +185,7 @@ def setUp(self): ) def test_get_package_json(self, regen=False): - json_contents = npm.get_package_json( + json_contents = npm_collector.get_package_json( namespace=self.scan_package.namespace, name=self.scan_package.name, version=self.scan_package.version @@ -198,7 +199,7 @@ def test_map_npm_package(self): package_count = packagedb.models.Package.objects.all().count() self.assertEqual(0, package_count) package_url = PackageURL.from_string(self.scan_package.purl) - npm.map_npm_package(package_url) + npm_collector.map_npm_package(package_url) package_count = packagedb.models.Package.objects.all().count() self.assertEqual(1, package_count) package = packagedb.models.Package.objects.all().first() diff --git a/minecode/visitors/npm.py b/minecode/visitors/npm.py index 26fca883..afcc342b 100644 --- a/minecode/visitors/npm.py +++ b/minecode/visitors/npm.py @@ -10,20 +10,16 @@ import logging import json -import requests from packageurl import PackageURL from packagedcode.npm import npm_api_url from packagedcode.npm import split_scoped_package_name -from packagedcode.npm import NpmPackageJsonHandler from minecode import seed from minecode import visit_router -from minecode import priority_router from minecode.visitors import NonPersistentHttpVisitor from minecode.visitors import URI -from packagedb.models import PackageContentType """ @@ -107,77 +103,3 @@ def get_uris(self, content): data=json.dumps(doc, separators=(',', ':'), ensure_ascii=False), # note: visited is True since there nothing more to visit visited=True) - - -def get_package_json(namespace, name, version): - """ - Return the contents of the package.json file of the package described by the purl - field arguments in a string. - """ - # Create URLs using purl fields - url = npm_api_url( - namespace=namespace, - name=name, - version=version, - ) - - try: - response = requests.get(url) - response.raise_for_status() - return response.json() - except requests.exceptions.HTTPError as err: - logger.error(f"HTTP error occurred: {err}") - - -def map_npm_package(package_url): - """ - Add a npm `package_url` to the PackageDB. - - Return an error string if any errors are encountered during the process - """ - from minecode.model_utils import add_package_to_scan_queue - from minecode.model_utils import merge_or_create_package - - package_json = get_package_json( - namespace = package_url.namespace, - name=package_url.name, - version=package_url.version, - ) - - if not package_json: - error = f'Package does not exist on npmjs: {package_url}' - logger.error(error) - return error - - package = NpmPackageJsonHandler._parse( - json_data=package_json - ) - package.extra_data['package_content'] = PackageContentType.SOURCE_ARCHIVE - - db_package, _, _, error = merge_or_create_package(package, visit_level=0) - - # Submit package for scanning - if db_package: - add_package_to_scan_queue(db_package) - - return error - - -@priority_router.route('pkg:npm/.*') -def process_request(purl_str): - """ - Process `priority_resource_uri` containing a npm Package URL (PURL) as a - URI. - - This involves obtaining Package information for the PURL from npm and - using it to create a new PackageDB entry. The package is then added to the - scan queue afterwards. - """ - package_url = PackageURL.from_string(purl_str) - if not package_url.version: - return - - error_msg = map_npm_package(package_url) - - if error_msg: - return error_msg From cb4abadf2d68ffb117bd49aed61cbbe7ad22d0a7 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Tue, 6 Feb 2024 19:22:42 -0800 Subject: [PATCH 03/31] Create viewset for ScannableURI #49 Signed-off-by: Jono Yang --- minecode/api.py | 73 ++++++++++++++++++++++++++++++++++++++---- purldb_project/urls.py | 3 +- 2 files changed, 69 insertions(+), 7 deletions(-) diff --git a/minecode/api.py b/minecode/api.py index b282ef56..eb7ca1ec 100644 --- a/minecode/api.py +++ b/minecode/api.py @@ -7,19 +7,19 @@ # See https://aboutcode.org for more information about nexB OSS projects. # -from rest_framework import serializers -from rest_framework import status -from rest_framework import viewsets +import json +from django.db import transaction +from packageurl import PackageURL +from rest_framework import serializers, status, viewsets from rest_framework.decorators import action from rest_framework.response import Response -from packageurl import PackageURL # UnusedImport here! # But importing the mappers and visitors module triggers routes registration from minecode import visitors # NOQA from minecode import priority_router -from minecode.models import ResourceURI -from minecode.models import PriorityResourceURI +from minecode.management.commands.process_scans import index_package_files +from minecode.models import PriorityResourceURI, ResourceURI, ScannableURI class ResourceURISerializer(serializers.ModelSerializer): @@ -83,3 +83,64 @@ def index_package(self, request, *args, **kwargs): } # TODO: revisiting a package should be handled on another level, dependent on data we store return Response(message) + + +class ScannableURISerializer(serializers.ModelSerializer): + class Meta: + model = ScannableURI + fields = '__all__' + + +# TODO: guard these API endpoints behind an API key +class ScannableURIViewSet(viewsets.ModelViewSet): + queryset = ScannableURI.objects.all() + serializer_class = ScannableURISerializer + + @action(detail=False, methods=["get"]) + def get_next_download_url(self, request, *args, **kwargs): + """ + Return download url for next Package on scan queue + """ + with transaction.atomic(): + scannable_uri = ScannableURI.objects.get_next_scannable() + if scannable_uri: + response = { + 'package_uuid': scannable_uri.package.uuid, + 'download_url': scannable_uri.uri, + } + scannable_uri.scan_status = ScannableURI.SCAN_SUBMITTED + scannable_uri.save() + else: + response = { + 'message': 'no more packages on scan queue' + } + return Response(response) + + @action(detail=False, methods=["post"]) + def submit_scan_results(self, request, *args, **kwargs): + """ + Receive and index completed scan + """ + from packagedb.models import Package + + package_uuid = request.data.get('package_uuid') + scan_file = request.data.get('scan_file') + + missing = [] + if not package_uuid: + missing.append('package_uuid') + if not scan_file: + missing.append('scan_file') + if missing: + msg = ', '.join(missing) + response = { + 'error': f'missing {msg}' + } + return Response(response) + + package = Package.objects.get(uuid=package_uuid) + scan_data= json.load(scan_file) + indexing_errors = index_package_files(package, scan_data, reindex=True) + if indexing_errors: + return Response({'error': f'indexing errors:\n\n{indexing_errors}'}) + return Response({'message': 'success'}) diff --git a/purldb_project/urls.py b/purldb_project/urls.py index 63a867ed..615a6353 100644 --- a/purldb_project/urls.py +++ b/purldb_project/urls.py @@ -24,6 +24,7 @@ from matchcode.api import ExactFileIndexViewSet from matchcode.api import ExactPackageArchiveIndexViewSet from minecode.api import PriorityResourceURIViewSet +from minecode.api import ScannableURIViewSet from packagedb.api import PurlValidateViewSet from packagedb.api import CollectViewSet from drf_spectacular.views import SpectacularAPIView @@ -43,7 +44,7 @@ api_router.register('validate', PurlValidateViewSet, 'validate') api_router.register('collect', CollectViewSet, 'collect') api_router.register('watch',PackageWatchViewSet) - +api_router.register('scan_queue', ScannableURIViewSet) urlpatterns = [ path( From fb9e6961a8715cab07082c2af5a68c9362130c26 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Fri, 9 Feb 2024 17:04:08 -0800 Subject: [PATCH 04/31] Return empty strings if nothing else on scan queue #49 Signed-off-by: Jono Yang --- minecode/api.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/minecode/api.py b/minecode/api.py index eb7ca1ec..b8eea89e 100644 --- a/minecode/api.py +++ b/minecode/api.py @@ -112,7 +112,8 @@ def get_next_download_url(self, request, *args, **kwargs): scannable_uri.save() else: response = { - 'message': 'no more packages on scan queue' + 'package_uuid': "", + 'download_url': "", } return Response(response) From 93ad167fd175d9f634229b7980c37951edd6cf8e Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Wed, 14 Feb 2024 13:05:47 -0800 Subject: [PATCH 05/31] Update ScannableURI API and model #285 Signed-off-by: Jono Yang --- minecode/api.py | 58 +++++++++++++++++++++++++++------------------- minecode/models.py | 26 ++++++++------------- 2 files changed, 44 insertions(+), 40 deletions(-) diff --git a/minecode/api.py b/minecode/api.py index b8eea89e..1636c9ed 100644 --- a/minecode/api.py +++ b/minecode/api.py @@ -7,6 +7,7 @@ # See https://aboutcode.org for more information about nexB OSS projects. # +from django.utils import timezone import json from django.db import transaction from packageurl import PackageURL @@ -105,10 +106,11 @@ def get_next_download_url(self, request, *args, **kwargs): scannable_uri = ScannableURI.objects.get_next_scannable() if scannable_uri: response = { - 'package_uuid': scannable_uri.package.uuid, + 'scannable_uri_uuid': scannable_uri.uuid, 'download_url': scannable_uri.uri, } scannable_uri.scan_status = ScannableURI.SCAN_SUBMITTED + scannable_uri.scan_date = timezone.now() scannable_uri.save() else: response = { @@ -118,30 +120,38 @@ def get_next_download_url(self, request, *args, **kwargs): return Response(response) @action(detail=False, methods=["post"]) - def submit_scan_results(self, request, *args, **kwargs): - """ - Receive and index completed scan - """ - from packagedb.models import Package - - package_uuid = request.data.get('package_uuid') - scan_file = request.data.get('scan_file') - - missing = [] - if not package_uuid: - missing.append('package_uuid') - if not scan_file: - missing.append('scan_file') - if missing: - msg = ', '.join(missing) + def update_status(self, request, *args, **kwargs): + scannable_uri_uuid = request.data.get('scannable_uri_uuid') + scan_status = request.data.get('scan_status') + if not scannable_uri_uuid: response = { - 'error': f'missing {msg}' + 'error': 'missing scannable_uri_uuid' } return Response(response) - package = Package.objects.get(uuid=package_uuid) - scan_data= json.load(scan_file) - indexing_errors = index_package_files(package, scan_data, reindex=True) - if indexing_errors: - return Response({'error': f'indexing errors:\n\n{indexing_errors}'}) - return Response({'message': 'success'}) + scannable_uri = ScannableURI.objects.get(uuid=scannable_uri_uuid) + + if scan_status == 'in progress': + scannable_uri.scan_status = ScannableURI.SCAN_IN_PROGRESS + scannable_uri.save() + + if scan_status == 'failed': + scan_log = request.data.get('scan_log') + scannable_uri.scan_error = scan_log + scannable_uri.scan_status = ScannableURI.SCAN_FAILED + scannable_uri.wip_date = None + scannable_uri.save() + + if scan_status == 'scanned': + scan_file = request.data.get('scan_file') + scannable_uri.scan_status = ScannableURI.SCAN_COMPLETED + package = scannable_uri.package + scan_data= json.load(scan_file) + indexing_errors = index_package_files(package, scan_data, reindex=True) + if indexing_errors: + scannable_uri.scan_status = ScannableURI.SCAN_INDEX_FAILED + scannable_uri.index_error = indexing_errors + else: + scannable_uri.scan_status = ScannableURI.SCAN_INDEXED + scannable_uri.wip_date = None + scannable_uri.save() diff --git a/minecode/models.py b/minecode/models.py index 3a6f046e..93c903a0 100644 --- a/minecode/models.py +++ b/minecode/models.py @@ -7,7 +7,7 @@ # See https://aboutcode.org for more information about nexB OSS projects. # - +import uuid from datetime import timedelta import logging import sys @@ -649,11 +649,15 @@ class ScannableURI(BaseURI): - update the matching index for the PackageDB as needed with fingerprints from the scan - set status and timestamps as needed """ - scan_request_date = models.DateTimeField( + uuid = models.UUIDField( + verbose_name=_("UUID"), default=uuid.uuid4, unique=True, editable=False + ) + + scan_date = models.DateTimeField( null=True, blank=True, db_index=True, - help_text='Timestamp set to the date when a scan was requested. Used to track scan status.', + help_text='Timestamp set to the date when a scan was taken by a worker', ) last_status_poll_date = models.DateTimeField( @@ -664,12 +668,10 @@ class ScannableURI(BaseURI): 'Used to track the scan polling.', ) - scan_uuid = models.CharField( - max_length=36, - blank=True, - null=True, + scan_project_url = models.CharField( + max_length=2048, db_index=True, - help_text='UUID of a scan for this URI in ScanCode.io.', + help_text='URL to scan project for this Package', ) SCAN_NEW = 0 @@ -708,14 +710,6 @@ class ScannableURI(BaseURI): help_text='Flag indicating whether or not this URI should be rescanned and reindexed.', ) - scan_uuid = models.CharField( - max_length=36, - blank=True, - null=True, - db_index=True, - help_text='UUID of a scan for this URI in ScanCode.io.', - ) - scan_error = models.TextField( null=True, blank=True, From 4739fb7aaed5173a0eb6c2bee09489cbf714f23a Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Wed, 14 Feb 2024 17:53:13 -0800 Subject: [PATCH 06/31] Update scan_project_url in update_status #49 #285 Signed-off-by: Jono Yang --- minecode/api.py | 2 ++ minecode/models.py | 10 +++++++--- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/minecode/api.py b/minecode/api.py index 1636c9ed..f6a8eca7 100644 --- a/minecode/api.py +++ b/minecode/api.py @@ -132,6 +132,8 @@ def update_status(self, request, *args, **kwargs): scannable_uri = ScannableURI.objects.get(uuid=scannable_uri_uuid) if scan_status == 'in progress': + scan_project_url = request.data.get('scan_project_url') + scannable_uri.scan_project_url = scan_project_url scannable_uri.scan_status = ScannableURI.SCAN_IN_PROGRESS scannable_uri.save() diff --git a/minecode/models.py b/minecode/models.py index 93c903a0..8ba5a96f 100644 --- a/minecode/models.py +++ b/minecode/models.py @@ -650,7 +650,9 @@ class ScannableURI(BaseURI): - set status and timestamps as needed """ uuid = models.UUIDField( - verbose_name=_("UUID"), default=uuid.uuid4, unique=True, editable=False + default=uuid.uuid4, + unique=True, + editable=False, ) scan_date = models.DateTimeField( @@ -671,6 +673,8 @@ class ScannableURI(BaseURI): scan_project_url = models.CharField( max_length=2048, db_index=True, + null=True, + blank=True, help_text='URL to scan project for this Package', ) @@ -733,12 +737,12 @@ class ScannableURI(BaseURI): class Meta: verbose_name = 'Scannable URI' - unique_together = ['canonical', 'scan_uuid'] + unique_together = ['canonical', 'scan_project_url'] indexes = [ # to get the scannables models.Index( - fields=['scan_status', 'scan_request_date', 'last_status_poll_date', ]), + fields=['scan_status', 'scan_date', 'last_status_poll_date', ]), # ordered by for the main queue query e.g. '-priority' models.Index( fields=['-priority']) From 20a33bc2c7c1a2130039fc78b002778e07066acf Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Thu, 15 Feb 2024 17:45:23 -0800 Subject: [PATCH 07/31] Modify ScannableURI fields #49 #285 * Create test for ScannableURI API Signed-off-by: Jono Yang --- minecode/api.py | 2 +- ...minecode_sc_scan_st_d6a459_idx_and_more.py | 64 +++++++++++++++ .../0033_scannableuri_populate_uuid.py | 43 ++++++++++ .../0034_scannableuri_alter_uuid_field.py | 19 +++++ minecode/models.py | 13 ++- minecode/tests/test_api.py | 80 +++++++++++++++++++ 6 files changed, 217 insertions(+), 4 deletions(-) create mode 100644 minecode/migrations/0032_remove_scannableuri_minecode_sc_scan_st_d6a459_idx_and_more.py create mode 100644 minecode/migrations/0033_scannableuri_populate_uuid.py create mode 100644 minecode/migrations/0034_scannableuri_alter_uuid_field.py create mode 100644 minecode/tests/test_api.py diff --git a/minecode/api.py b/minecode/api.py index f6a8eca7..d8bb4c0c 100644 --- a/minecode/api.py +++ b/minecode/api.py @@ -114,7 +114,7 @@ def get_next_download_url(self, request, *args, **kwargs): scannable_uri.save() else: response = { - 'package_uuid': "", + 'scannable_uri_uuid': "", 'download_url': "", } return Response(response) diff --git a/minecode/migrations/0032_remove_scannableuri_minecode_sc_scan_st_d6a459_idx_and_more.py b/minecode/migrations/0032_remove_scannableuri_minecode_sc_scan_st_d6a459_idx_and_more.py new file mode 100644 index 00000000..7c31cc30 --- /dev/null +++ b/minecode/migrations/0032_remove_scannableuri_minecode_sc_scan_st_d6a459_idx_and_more.py @@ -0,0 +1,64 @@ +# Generated by Django 5.0.1 on 2024-02-15 23:16 + +from django.db import migrations, models +import uuid + + +class Migration(migrations.Migration): + + dependencies = [ + ("minecode", "0031_importableuri"), + ("packagedb", "0083_delete_apiuser"), + ] + + operations = [ + migrations.RemoveIndex( + model_name="scannableuri", + name="minecode_sc_scan_st_d6a459_idx", + ), + migrations.AlterUniqueTogether( + name="scannableuri", + unique_together=set(), + ), + migrations.AddField( + model_name="scannableuri", + name="scan_date", + field=models.DateTimeField( + blank=True, + db_index=True, + help_text="Timestamp set to the date when a scan was taken by a worker", + null=True, + ), + ), + migrations.AddField( + model_name="scannableuri", + name="scan_project_url", + field=models.CharField( + blank=True, + db_index=True, + help_text="URL to scan project for this Package", + max_length=2048, + null=True, + ), + ), + migrations.AddField( + model_name="scannableuri", + name="uuid", + field=models.UUIDField(default=uuid.uuid4, null=True), + ), + migrations.AddIndex( + model_name="scannableuri", + index=models.Index( + fields=["scan_status", "scan_date", "last_status_poll_date"], + name="minecode_sc_scan_st_5e04d7_idx", + ), + ), + migrations.RemoveField( + model_name="scannableuri", + name="scan_request_date", + ), + migrations.RemoveField( + model_name="scannableuri", + name="scan_uuid", + ), + ] diff --git a/minecode/migrations/0033_scannableuri_populate_uuid.py b/minecode/migrations/0033_scannableuri_populate_uuid.py new file mode 100644 index 00000000..e96b15a6 --- /dev/null +++ b/minecode/migrations/0033_scannableuri_populate_uuid.py @@ -0,0 +1,43 @@ +# Generated by Django 5.0.1 on 2024-02-16 00:16 + +from django.db import migrations +import uuid + + +def populate_uuid(apps, schema_editor): + ScannableURI = apps.get_model("minecode", "ScannableURI") + unupdated = [] + scannable_uris = ScannableURI.objects.all().iterator(chunk_size=5000) + for i, scannable_uri in enumerate(scannable_uris): + if i > 0 and not i % 5000: + ScannableURI.objects.bulk_update( + objs=unupdated, + fields=[ + "uuid" + ] + ) + unupdated = [] + scannable_uri.uuid = uuid.uuid4() + unupdated.append(scannable_uri) + + if unupdated: + ScannableURI.objects.bulk_update( + objs=unupdated, + fields=[ + "uuid", + ] + ) + + +class Migration(migrations.Migration): + + dependencies = [ + ( + "minecode", + "0032_remove_scannableuri_minecode_sc_scan_st_d6a459_idx_and_more", + ), + ] + + operations = [ + migrations.RunPython(populate_uuid, reverse_code=migrations.RunPython.noop), + ] diff --git a/minecode/migrations/0034_scannableuri_alter_uuid_field.py b/minecode/migrations/0034_scannableuri_alter_uuid_field.py new file mode 100644 index 00000000..28d88ace --- /dev/null +++ b/minecode/migrations/0034_scannableuri_alter_uuid_field.py @@ -0,0 +1,19 @@ +# Generated by Django 5.0.1 on 2024-02-16 00:16 + +from django.db import migrations, models +import uuid + + +class Migration(migrations.Migration): + + dependencies = [ + ("minecode", "0033_scannableuri_populate_uuid"), + ] + + operations = [ + migrations.AlterField( + model_name="scannableuri", + name="uuid", + field=models.UUIDField(default=uuid.uuid4, editable=False, unique=True), + ), + ] diff --git a/minecode/models.py b/minecode/models.py index 8ba5a96f..ade275f7 100644 --- a/minecode/models.py +++ b/minecode/models.py @@ -737,15 +737,22 @@ class ScannableURI(BaseURI): class Meta: verbose_name = 'Scannable URI' - unique_together = ['canonical', 'scan_project_url'] indexes = [ # to get the scannables models.Index( - fields=['scan_status', 'scan_date', 'last_status_poll_date', ]), + fields=[ + 'scan_status', + 'scan_date', + 'last_status_poll_date', + ] + ), # ordered by for the main queue query e.g. '-priority' models.Index( - fields=['-priority']) + fields=[ + '-priority' + ] + ) ] def save(self, *args, **kwargs): diff --git a/minecode/tests/test_api.py b/minecode/tests/test_api.py new file mode 100644 index 00000000..aedacd5b --- /dev/null +++ b/minecode/tests/test_api.py @@ -0,0 +1,80 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# purldb is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/nexB/purldb for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. +# + +from uuid import uuid4 +import json +import os + +from django.contrib.postgres.search import SearchVector +from django.test import TestCase +from django.urls import reverse +from django.utils import timezone +from rest_framework import status +from rest_framework.test import APIClient + +from minecode.models import PriorityResourceURI +from minecode.models import ScannableURI +from minecode.utils_test import JsonBasedTesting +from packagedb.models import Package +from packagedb.models import PackageContentType +from packagedb.models import PackageSet +from packagedb.models import Resource +from minecode.models import ScannableURI + +from unittest import mock +from univers.versions import MavenVersion + +class ScannableURIAPITestCase(JsonBasedTesting, TestCase): + test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + + def setUp(self): + self.package1 = Package.objects.create( + download_url='https://test-url.com/package1.tar.gz', + type='type1', + name='name1', + version='1.0', + ) + self.scannable_uri1 = ScannableURI.objects.create( + uri='https://test-url.com/package1.tar.gz', + package=self.package1 + ) + + self.package2 = Package.objects.create( + download_url='https://test-url.com/package2.tar.gz', + type='type2', + name='name2', + version='2.0', + ) + self.scannable_uri2 = ScannableURI.objects.create( + uri='https://test-url.com/package2.tar.gz', + package=self.package2 + ) + + self.client = APIClient() + + def test_api_scannable_uri_list_endpoint(self): + response = self.client.get('/api/scan_queue/') + self.assertEqual(response.status_code, status.HTTP_200_OK) + self.assertEqual(2, response.data.get('count')) + + def test_api_scannable_uri_get_next_download_url(self): + response = self.client.get('/api/scan_queue/get_next_download_url/') + self.assertEqual(response.status_code, status.HTTP_200_OK) + self.assertEqual(response.data.get('scannable_uri_uuid'), self.scannable_uri1.uuid) + self.assertEqual(response.data.get('download_url'), self.scannable_uri1.uri) + + response = self.client.get('/api/scan_queue/get_next_download_url/') + self.assertEqual(response.status_code, status.HTTP_200_OK) + self.assertEqual(response.data.get('scannable_uri_uuid'), self.scannable_uri2.uuid) + self.assertEqual(response.data.get('download_url'), self.scannable_uri2.uri) + + response = self.client.get('/api/scan_queue/get_next_download_url/') + self.assertEqual(response.status_code, status.HTTP_200_OK) + self.assertEqual(response.data.get('scannable_uri_uuid'), '') + self.assertEqual(response.data.get('download_url'), '') From e07cef05f8518ab186808094d5cc1c994cdb309f Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Fri, 16 Feb 2024 15:39:17 -0800 Subject: [PATCH 08/31] Ensure a response is returned in update_status #49 #285 Signed-off-by: Jono Yang --- minecode/api.py | 34 +++++++++++++++++++++++++++++----- minecode/tests/test_api.py | 25 +++++++++++++++++++++++++ 2 files changed, 54 insertions(+), 5 deletions(-) diff --git a/minecode/api.py b/minecode/api.py index d8bb4c0c..68fd66ab 100644 --- a/minecode/api.py +++ b/minecode/api.py @@ -133,18 +133,29 @@ def update_status(self, request, *args, **kwargs): if scan_status == 'in progress': scan_project_url = request.data.get('scan_project_url') - scannable_uri.scan_project_url = scan_project_url - scannable_uri.scan_status = ScannableURI.SCAN_IN_PROGRESS - scannable_uri.save() + if scan_project_url: + scannable_uri.scan_project_url = scan_project_url + scannable_uri.scan_status = ScannableURI.SCAN_IN_PROGRESS + scannable_uri.save() + msg = { + 'status': f'scan_status updated to {scan_status} for scannable_uri {scannable_uri_uuid}' + } + else: + msg = { + 'status': f'missing scan_project_url when updating scannable_uri {scannable_uri_uuid} scan_status to {scan_status}' + } - if scan_status == 'failed': + elif scan_status == 'failed': scan_log = request.data.get('scan_log') scannable_uri.scan_error = scan_log scannable_uri.scan_status = ScannableURI.SCAN_FAILED scannable_uri.wip_date = None scannable_uri.save() + msg = { + 'status': f'updated scannable uri {scannable_uri_uuid} scan_status to {scan_status}' + } - if scan_status == 'scanned': + elif scan_status == 'scanned': scan_file = request.data.get('scan_file') scannable_uri.scan_status = ScannableURI.SCAN_COMPLETED package = scannable_uri.package @@ -153,7 +164,20 @@ def update_status(self, request, *args, **kwargs): if indexing_errors: scannable_uri.scan_status = ScannableURI.SCAN_INDEX_FAILED scannable_uri.index_error = indexing_errors + msg = { + 'status': f'scan index failed for scannable uri {scannable_uri_uuid}' + } else: scannable_uri.scan_status = ScannableURI.SCAN_INDEXED + msg = { + 'status': f'scan indexed for scannable uri {scannable_uri_uuid}' + } scannable_uri.wip_date = None scannable_uri.save() + + else: + msg = { + 'status': f'invalid scan_status: {scan_status}' + } + + return Response(msg) diff --git a/minecode/tests/test_api.py b/minecode/tests/test_api.py index aedacd5b..bc64ef76 100644 --- a/minecode/tests/test_api.py +++ b/minecode/tests/test_api.py @@ -78,3 +78,28 @@ def test_api_scannable_uri_get_next_download_url(self): self.assertEqual(response.status_code, status.HTTP_200_OK) self.assertEqual(response.data.get('scannable_uri_uuid'), '') self.assertEqual(response.data.get('download_url'), '') + + def test_api_scannable_uri_update_status(self): + self.assertEqual(ScannableURI.SCAN_NEW, self.scannable_uri1.scan_status) + + data = { + "scannable_uri_uuid": self.scannable_uri1.uuid, + "scan_status": 'in progress', + 'scan_project_url': 'scan_project_url', + } + response = self.client.post('/api/scan_queue/update_status/', data=data) + self.assertEqual(response.status_code, status.HTTP_200_OK) + self.scannable_uri1.refresh_from_db() + self.assertEqual(ScannableURI.SCAN_IN_PROGRESS, self.scannable_uri1.scan_status) + self.assertEqual('scan_project_url', self.scannable_uri1.scan_project_url) + + data = { + "scannable_uri_uuid": self.scannable_uri1.uuid, + "scan_status": 'failed', + 'scan_log': 'scan_log', + } + response = self.client.post('/api/scan_queue/update_status/', data=data) + self.assertEqual(response.status_code, status.HTTP_200_OK) + self.scannable_uri1.refresh_from_db() + self.assertEqual(ScannableURI.SCAN_FAILED, self.scannable_uri1.scan_status) + self.assertEqual('scan_log', self.scannable_uri1.scan_error) From 15f70c58c1ceae985ab1b977463350af52d3c261 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Fri, 16 Feb 2024 18:45:21 -0800 Subject: [PATCH 09/31] Add test for scan indexing through API #49 #285 Signed-off-by: Jono Yang --- minecode/tests/test_api.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/minecode/tests/test_api.py b/minecode/tests/test_api.py index bc64ef76..35f98582 100644 --- a/minecode/tests/test_api.py +++ b/minecode/tests/test_api.py @@ -103,3 +103,18 @@ def test_api_scannable_uri_update_status(self): self.scannable_uri1.refresh_from_db() self.assertEqual(ScannableURI.SCAN_FAILED, self.scannable_uri1.scan_status) self.assertEqual('scan_log', self.scannable_uri1.scan_error) + + self.assertEqual(0, Resource.objects.all().count()) + scan_file = self.get_test_loc('scancodeio/get_scan_data.json') + with open(scan_file) as f: + data = { + "scannable_uri_uuid": self.scannable_uri1.uuid, + "scan_status": 'scanned', + 'scan_file': f, + } + response = self.client.post('/api/scan_queue/update_status/', data=data) + self.assertEqual(response.status_code, status.HTTP_200_OK) + self.scannable_uri1.refresh_from_db() + self.assertEqual(ScannableURI.SCAN_INDEXED, self.scannable_uri1.scan_status) + self.assertEqual('scan_log', self.scannable_uri1.scan_error) + self.assertEqual(64, Resource.objects.all().count()) From 79b4aa7a9789c19a261f912fa52d7a3a48021917 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Tue, 20 Feb 2024 18:18:39 -0800 Subject: [PATCH 10/31] Renam instances of rescan to reindex #49 #285 Signed-off-by: Jono Yang --- ...ame_rescan_uri_scannableuri_reindex_uri.py | 18 +++++++++++ minecode/model_utils.py | 3 +- minecode/models.py | 15 +--------- packagedb/api.py | 30 +++++++++---------- packagedb/models.py | 21 ++++--------- 5 files changed, 42 insertions(+), 45 deletions(-) create mode 100644 minecode/migrations/0035_rename_rescan_uri_scannableuri_reindex_uri.py diff --git a/minecode/migrations/0035_rename_rescan_uri_scannableuri_reindex_uri.py b/minecode/migrations/0035_rename_rescan_uri_scannableuri_reindex_uri.py new file mode 100644 index 00000000..7719561d --- /dev/null +++ b/minecode/migrations/0035_rename_rescan_uri_scannableuri_reindex_uri.py @@ -0,0 +1,18 @@ +# Generated by Django 5.0.1 on 2024-02-21 02:12 + +from django.db import migrations + + +class Migration(migrations.Migration): + + dependencies = [ + ("minecode", "0034_scannableuri_alter_uuid_field"), + ] + + operations = [ + migrations.RenameField( + model_name="scannableuri", + old_name="rescan_uri", + new_name="reindex_uri", + ), + ] diff --git a/minecode/model_utils.py b/minecode/model_utils.py index a600b104..b5265428 100644 --- a/minecode/model_utils.py +++ b/minecode/model_utils.py @@ -24,7 +24,7 @@ logger.setLevel(logging.INFO) -def add_package_to_scan_queue(package): +def add_package_to_scan_queue(package, reindex_uri=False): """ Add a Package `package` to the scan queue """ @@ -32,6 +32,7 @@ def add_package_to_scan_queue(package): _, scannable_uri_created = ScannableURI.objects.get_or_create( uri=uri, package=package, + reindex_uri=reindex_uri, ) if scannable_uri_created: logger.debug(' + Inserted ScannableURI\t: {}'.format(uri)) diff --git a/minecode/models.py b/minecode/models.py index ade275f7..16d121cf 100644 --- a/minecode/models.py +++ b/minecode/models.py @@ -707,7 +707,7 @@ class ScannableURI(BaseURI): help_text='Status of the scan for this URI.', ) - rescan_uri = models.BooleanField( + reindex_uri = models.BooleanField( default=False, null=True, blank=True, @@ -764,19 +764,6 @@ def save(self, *args, **kwargs): self.normalize_fields() super(ScannableURI, self).save(*args, **kwargs) - def rescan(self): - """ - Reset fields such that a ScannableURI can be sent off for scanning again - """ - self.rescan_uri = True - self.scan_status = ScannableURI.SCAN_NEW - self.scan_error = None - self.index_error = None - self.scan_uuid = None - self.scan_request_date = None - self.priority = 100 - self.save() - # TODO: Use the QuerySet.as_manager() for more flexibility and chaining. class PriorityResourceURIManager(models.Manager): diff --git a/packagedb/api.py b/packagedb/api.py index d11be44d..b665d299 100644 --- a/packagedb/api.py +++ b/packagedb/api.py @@ -426,7 +426,7 @@ def reindex_package(self, request, *args, **kwargs): Reindex this package instance """ package = self.get_object() - package.rescan() + package.reindex() data = { 'status': f'{package.package_url} has been queued for reindexing' } @@ -557,7 +557,7 @@ class PackageSetViewSet(viewsets.ReadOnlyModelViewSet): class PackageWatchViewSet(CreateListRetrieveUpdateViewSetMixin): """ Take a `purl` and periodically watch for the new version of the package. - Add the new package version to the scan queue. + Add the new package version to the scan queue. Default watch interval is 7 days. """ queryset = PackageWatch.objects.get_queryset().order_by('-id') @@ -579,7 +579,7 @@ class CollectViewSet(viewsets.ViewSet): If the package does not exist, we will fetch the Package data and return it in the same request. - + **Note:** Use `Index packages` for bulk indexing/reindexing of packages. """ serializer_class=None @@ -622,7 +622,7 @@ def list(self, request, format=None): serializer = PackageAPISerializer(packages, many=True, context={'request': request}) return Response(serializer.data) - + @extend_schema( request=IndexPackagesSerializer, responses={ @@ -692,9 +692,9 @@ def index_packages(self, request, *args, **kwargs): def _reindex_package(package, reindexed_packages): if package in reindexed_packages: return - package.rescan() + package.reindex() reindexed_packages.append(package) - + serializer = self.serializer_class(data=request.data) if not serializer.is_valid(): @@ -764,17 +764,17 @@ def _reindex_package(package, reindexed_packages): class PurlValidateViewSet(viewsets.ViewSet): """ - Take a `purl` and check whether it's valid PackageURL or not. - Optionally set `check_existence` to true to check whether the package exists in real world. - - **Note:** As of now `check_existence` only supports `cargo`, `composer`, `deb`, + Take a `purl` and check whether it's valid PackageURL or not. + Optionally set `check_existence` to true to check whether the package exists in real world. + + **Note:** As of now `check_existence` only supports `cargo`, `composer`, `deb`, `gem`, `golang`, `hex`, `maven`, `npm`, `nuget` and `pypi` ecosystems. **Example request:** ``` GET /api/validate/?purl=pkg:npm/foobar@12.3.1&check_existence=false ``` - + Response contains: - valid @@ -783,7 +783,7 @@ class PurlValidateViewSet(viewsets.ViewSet): - True, if input PURL exists in real world and `check_existence` flag is enabled. """ serializer_class = PurlValidateSerializer - + def get_view_name(self): return 'Validate PURL' @@ -858,7 +858,7 @@ def list(self, request): response['exists'] = True else: unsupported_ecosystem = True - + if response['exists']: response["message"] = message_valid_and_exists elif unsupported_ecosystem: @@ -866,7 +866,7 @@ def list(self, request): response["message"] = message_valid_but_package_type_not_supported else: response["message"] =message_valid_but_does_not_exist - + serializer = PurlValidateResponseSerializer(response, context={'request': request}) return Response(serializer.data) @@ -989,7 +989,7 @@ def get_all_versions_plain(purl: PackageURL): def get_all_versions(purl): """ - Return all the versions available for the given purls as + Return all the versions available for the given purls as proper Version objects from `univers`. """ all_versions = get_all_versions_plain(purl) diff --git a/packagedb/models.py b/packagedb/models.py index 35a35e9a..51cba3ab 100644 --- a/packagedb/models.py +++ b/packagedb/models.py @@ -620,24 +620,15 @@ def get_latest_version(self): if sorted_versions: return sorted_versions[-1] - # TODO: Should this be called `reindex` in this context? - def rescan(self): + def reindex(self): """ - Trigger another scan of this Package, where the URI at `download_url` is - sent to scancode.io for a scan. The fingerprints and Resources associated with this - Package are deleted and recreated from the updated scan data. + Trigger another scan of this Package, where a new ScannableURI is + created for this Package. The fingerprints and Resources associated with + this Package are deleted and recreated from the updated scan data. """ - from minecode.models import ScannableURI + from minecode.model_utils import add_package_to_scan_queue - # TODO: Consider sending a new scan request instead of reusing the - # existing one - try: - scannable_uri = ScannableURI.objects.get(package=self) - except ScannableURI.DoesNotExist: - scannable_uri = None - - if scannable_uri: - scannable_uri.rescan() + add_package_to_scan_queue(self, reindex_uri=True) def update_fields(self, save=False, **values_by_fields): """ From c4a7d5227728b7aaea2668bed47d799faf2acaf6 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Wed, 21 Feb 2024 16:11:03 -0800 Subject: [PATCH 11/31] Create minecode/indexing.py #49 #285 * Update tests Signed-off-by: Jono Yang --- minecode/api.py | 2 +- minecode/management/commands/process_scans.py | 282 ---------- minecode/management/commands/request_scans.py | 104 ---- minecode/management/indexing.py | 135 +++++ minecode/management/scanning.py | 482 ------------------ minecode/model_utils.py | 3 +- minecode/tests/create_scanning_fixtures.py | 60 --- ...test_process_scans.py => test_indexing.py} | 54 +- minecode/tests/test_request_scans.py | 73 --- minecode/tests/test_scanning.py | 194 ------- packagedb/models.py | 2 +- packagedb/tests/test_api.py | 95 ++-- 12 files changed, 195 insertions(+), 1291 deletions(-) delete mode 100644 minecode/management/commands/process_scans.py delete mode 100644 minecode/management/commands/request_scans.py create mode 100644 minecode/management/indexing.py delete mode 100644 minecode/management/scanning.py delete mode 100644 minecode/tests/create_scanning_fixtures.py rename minecode/tests/{test_process_scans.py => test_indexing.py} (54%) delete mode 100644 minecode/tests/test_request_scans.py delete mode 100644 minecode/tests/test_scanning.py diff --git a/minecode/api.py b/minecode/api.py index 68fd66ab..95121d56 100644 --- a/minecode/api.py +++ b/minecode/api.py @@ -19,7 +19,7 @@ # But importing the mappers and visitors module triggers routes registration from minecode import visitors # NOQA from minecode import priority_router -from minecode.management.commands.process_scans import index_package_files +from minecode.management.indexing import index_package_files from minecode.models import PriorityResourceURI, ResourceURI, ScannableURI diff --git a/minecode/management/commands/process_scans.py b/minecode/management/commands/process_scans.py deleted file mode 100644 index f8566c4c..00000000 --- a/minecode/management/commands/process_scans.py +++ /dev/null @@ -1,282 +0,0 @@ -# -# Copyright (c) 2018 by nexB, Inc. http://www.nexb.com/ - All rights reserved. -# - -import logging -import signal -import sys -import traceback - -from django.db import transaction - -from licensedcode.cache import build_spdx_license_expression -from packagedcode.utils import combine_expressions - -from matchcode.models import ApproximateDirectoryContentIndex -from matchcode.models import ApproximateDirectoryStructureIndex -from matchcode.models import ExactFileIndex -from minecode.management import scanning -from minecode.management.commands import get_error_message -from minecode.models import ScannableURI -from minecode.model_utils import merge_or_create_resource - - -logger = logging.getLogger(__name__) -logging.basicConfig(stream=sys.stdout) -logger.setLevel(logging.INFO) - - -class Command(scanning.ScanningCommand): - - logger = logger - - help = ('Check scancode.io requested scans for status then fetch and process ' - 'completed scans for indexing and updates.') - - def handle(self, *args, **options): - logger.setLevel(self.get_verbosity(**options)) - scanning.ScanningCommand.handle(self, *args, **options) - - @classmethod - def get_next_uri(self): - with transaction.atomic(): - scannable_uri = ScannableURI.objects.get_next_processable() - return scannable_uri - - @classmethod - def process_scan(cls, scannable_uri, get_scan_info_save_loc='', get_scan_data_save_loc='', **kwargs): - """ - Process a ScannableURI based on its status. - - For requested but not completed scans, check remote status and - update status and timestamps accordingly. - - For completed scans, fetch the scan, then procpythess the scan results - to update the PackageDB as needed. Update status and timestamps accordingly - """ - logger.info('Checking or processing scan for URI: {}'.format(scannable_uri)) - - scan_info = scanning.get_scan_info( - scannable_uri.scan_uuid, - api_url=cls.api_url, - api_auth_headers=cls.api_auth_headers, - get_scan_info_save_loc=get_scan_info_save_loc - ) - rescan = scannable_uri.rescan_uri - - if scannable_uri.scan_status in (ScannableURI.SCAN_SUBMITTED, ScannableURI.SCAN_IN_PROGRESS): - scannable_uri.scan_status = get_scan_status(scan_info) - elif scannable_uri.scan_status in (ScannableURI.SCAN_COMPLETED,): - scan_index_errors = [] - try: - logger.info('Processing scan for URI: {}'.format(scannable_uri)) - - package = scannable_uri.package - input_size = scan_info.size - if input_size: - computed_timeout = ((input_size / 1000000) / 2) * 60 - timeout = max(computed_timeout, scanning.REQUEST_TIMEOUT) - else: - timeout = scanning.REQUEST_TIMEOUT - scan_data = scanning.get_scan_data( - scannable_uri.scan_uuid, - api_url=cls.api_url, - api_auth_headers=cls.api_auth_headers, - timeout=timeout, - get_scan_data_save_loc=get_scan_data_save_loc - ) - indexing_errors = index_package_files(package, scan_data, reindex=rescan) - scan_index_errors.extend(indexing_errors) - - summary = scanning.get_scan_summary( - scannable_uri.scan_uuid, - api_url=cls.api_url, - api_auth_headers=cls.api_auth_headers, - get_scan_data_save_loc=get_scan_data_save_loc - ) - - declared_license_expression = summary.get('declared_license_expression') - declared_license_expression_spdx = None - if declared_license_expression: - declared_license_expression_spdx = build_spdx_license_expression(declared_license_expression) - - other_license_expressions = summary.get('other_license_expressions', []) - other_license_expressions = [l['value'] for l in other_license_expressions if l['value']] - other_license_expression = combine_expressions(other_license_expressions) - - copyright = '' - declared_holder = summary.get('declared_holder') - if declared_holder: - copyright = f'Copyright (c) {declared_holder}' - - values_by_updateable_fields = { - 'sha1': scan_info.sha1, - 'sha256': scan_info.sha256, - 'sha512': scan_info.sha512, - 'summary': summary, - 'declared_license_expression': declared_license_expression, - 'declared_license_expression_spdx': declared_license_expression_spdx, - 'other_license_expression': other_license_expression, - 'copyright': copyright, - } - - updated_fields = [] - for field, value in values_by_updateable_fields.items(): - p_val = getattr(package, field) - if ( - (not p_val and value) - or rescan - ): - setattr(package, field, value) - entry = dict( - field=field, - old_value=p_val, - new_value=value, - ) - updated_fields.append(entry) - - if updated_fields: - data = { - 'updated_fields': updated_fields, - } - package.append_to_history( - 'Package field values have been updated.', - data=data, - save=True, - ) - - scannable_uri.scan_status = ScannableURI.SCAN_INDEXED - if rescan: - scannable_uri.rescan = False - scannable_uri.priority = 0 - - except Exception as e: - traceback_message = traceback.format_exc() - error_message = traceback_message + '\n' - # TODO: We should rerun the specific indexers that have failed - if scan_index_errors: - error_message += '\n'.join(scan_index_errors) - scannable_uri.index_error = error_message - scannable_uri.scan_status = ScannableURI.SCAN_INDEX_FAILED - - scannable_uri.wip_date = None - scannable_uri.save() - - -# support graceful death when used as a service -signal.signal(signal.SIGTERM, Command.stop_handler) - - -def get_scan_status(scan_object): - """ - Return a ScannableURI status from scan_object Scan - """ - if scan_object.not_started or scan_object.queued: - scan_status = ScannableURI.SCAN_SUBMITTED - elif scan_object.running: - scan_status = ScannableURI.SCAN_IN_PROGRESS - elif scan_object.failure or scan_object.stopped or scan_object.stale: - scan_status = ScannableURI.SCAN_FAILED - elif scan_object.success: - scan_status = ScannableURI.SCAN_COMPLETED - else: - # TODO: Consider not raising an exception - raise Exception('Unknown scancode.io status') - return scan_status - - -def update_package_checksums(package, scan_object): - """ - Create a new Resource entry for `package` Package if its checksums have been updated - - Return a list of scan error messages - """ - scan_index_errors = [] - try: - updated = _update_package_checksums(package, scan_object) - except Exception as e: - msg = get_error_message(e) - scan_index_errors.append(msg) - logger.error(msg) - return scan_index_errors - - -def _update_package_checksums(package, scan_object): - """ - Update and save `package` Package checksums with data from `scan_object` Scan. - - Return True if the package was updated. - """ - updated = False - if ((package.sha1 and package.sha1 != scan_object.sha1) or - (package.md5 and package.md5 != scan_object.md5) or - (package.size and package.size != scan_object.size)): - raise Exception( - 'Inconsistent checksum or size collected from scan uuid: {} for Package {}' - .format(scan_object.uuid, package.uuid) - ) - - if not package.sha1: - package.sha1 = scan_object.sha1 - updated = True - if not package.md5: - package.md5 = scan_object.md5 - updated = True - if not package.size: - package.size = scan_object.size - updated = True - if updated: - package.save() - return updated - - -def index_package_files(package, scan_data, reindex=False): - """ - Index scan data for `package` Package. - - Return a list of scan index errors messages - - If `reindex` is True, then all fingerprints related to `package` will be - deleted and recreated from `scan_data`. - """ - if reindex: - logger.info(f'Deleting fingerprints and Resources related to {package.package_url}') - package.approximatedirectorycontentindex_set.all().delete() - package.approximatedirectorystructureindex_set.all().delete() - package.exactfileindex_set.all().delete() - package.resources.all().delete() - - scan_index_errors = [] - try: - logger.info(f'Indexing Resources and fingerprints related to {package.package_url} from scan data') - for resource in scan_data.get('files', []): - r, _, _ = merge_or_create_resource(package, resource) - path = r.path - sha1 = r.sha1 - if sha1: - _, _ = ExactFileIndex.index( - sha1=sha1, - package=package - ) - - resource_extra_data = resource.get('extra_data', {}) - directory_content_fingerprint = resource_extra_data.get('directory_content', '') - directory_structure_fingerprint = resource_extra_data.get('directory_structure', '') - - if directory_content_fingerprint: - _, _ = ApproximateDirectoryContentIndex.index( - directory_fingerprint=directory_content_fingerprint, - resource_path=path, - package=package, - ) - if directory_structure_fingerprint: - _, _ = ApproximateDirectoryStructureIndex.index( - directory_fingerprint=directory_structure_fingerprint, - resource_path=path, - package=package, - ) - - except Exception as e: - msg = get_error_message(e) - scan_index_errors.append(msg) - logger.error(msg) - - return scan_index_errors diff --git a/minecode/management/commands/request_scans.py b/minecode/management/commands/request_scans.py deleted file mode 100644 index f781b1b9..00000000 --- a/minecode/management/commands/request_scans.py +++ /dev/null @@ -1,104 +0,0 @@ -# -# Copyright (c) 2018 by nexB, Inc. http://www.nexb.com/ - All rights reserved. -# - -from __future__ import absolute_import -from __future__ import unicode_literals - -import logging -import signal -import sys - -from django.db import transaction -from django.db.models import Q -from django.utils import timezone - -from minecode.management import scanning -from minecode.management.commands import get_error_message -from minecode.models import ScannableURI - - -logger = logging.getLogger(__name__) -logging.basicConfig(stream=sys.stdout) -logger.setLevel(logging.INFO) - - -class Command(scanning.ScanningCommand): - logger = logger - - help = 'Request scans for ScannableURIs from scancode.io.' - - def add_arguments(self, parser): - parser.add_argument( - '--max-scan-requests', - dest='max_scan_requests', - action='store', - help='Limit the number of scan requests that can be made', - default=3, - ) - - def handle(self, *args, **options): - self.logger.setLevel(self.get_verbosity(**options)) - scanning.ScanningCommand.handle(self, *args, **options) - - @classmethod - def get_next_uri(self): - with transaction.atomic(): - scannable_uri = ScannableURI.objects.get_next_scannable() - return scannable_uri - - @classmethod - def process_scan(cls, scannable_uri, options, response_save_loc='', **kwargs): - """ - Request a ScanCode.io scan for a `scannable_uri` ScannableURI. - """ - uri = scannable_uri.uri - max_scan_requests = options.get('max_scan_requests', 3) - if isinstance(max_scan_requests, int): - submitted_and_in_progress = ScannableURI.objects.filter( - Q(scan_status=ScannableURI.SCAN_SUBMITTED) | Q(scan_status=ScannableURI.SCAN_IN_PROGRESS) - ).count() - - if submitted_and_in_progress >= max_scan_requests: - cls.logger.info(f'Max scan requests reached: {max_scan_requests} Skipping URI "{uri}"') - return - - scan_errors = [] - scancodeio_uuid = scan_error = None - - try: - cls.logger.info('Requesting scan from ScanCode.io for URI: "{uri}"'.format(**locals())) - scan = scanning.submit_scan( - uri, - scannable_uri.package, - api_url=cls.api_url, - api_auth_headers=cls.api_auth_headers, - response_save_loc=response_save_loc - ) - scancodeio_uuid = scan.uuid - - except Exception as e: - msg = 'Scan request error for URI: "{uri}"'.format(**locals()) - msg += '\n'.format(scannable_uri.uri) - msg += get_error_message(e) - scan_errors.append(msg) - cls.logger.error(msg) - - finally: - # Flag the processed scannable_uri as completed - scannable_uri.scan_status = ScannableURI.SCAN_SUBMITTED - scannable_uri.scan_request_date = timezone.now() - scannable_uri.scan_uuid = scancodeio_uuid - scannable_uri.wip_date = None - - if scan_errors: - cls.logger.debug(' ! Scan request errors.') - scannable_uri.scan_error = '\n'.join(scan_errors)[:5000] - else: - cls.logger.debug(' + Scan requested OK.') - - scannable_uri.save() - - -# support graceful death when used as a service -signal.signal(signal.SIGTERM, Command.stop_handler) diff --git a/minecode/management/indexing.py b/minecode/management/indexing.py new file mode 100644 index 00000000..45e15e8c --- /dev/null +++ b/minecode/management/indexing.py @@ -0,0 +1,135 @@ +# +# Copyright (c) 2018 by nexB, Inc. http://www.nexb.com/ - All rights reserved. +# +from matchcode.models import ApproximateDirectoryContentIndex +from matchcode.models import ApproximateDirectoryStructureIndex +from matchcode.models import ExactFileIndex +from minecode.management.commands import get_error_message +import logging +import sys +from minecode.model_utils import merge_or_create_resource +from packagedcode.utils import combine_expressions +from licensedcode.cache import build_spdx_license_expression +import traceback +from minecode.models import ScannableURI + +logger = logging.getLogger(__name__) +logging.basicConfig(stream=sys.stdout) +logger.setLevel(logging.INFO) + + +def index_package_files(package, scan_data, reindex=False): + """ + Index scan data for `package` Package. + + Return a list of scan index errors messages + + If `reindex` is True, then all fingerprints related to `package` will be + deleted and recreated from `scan_data`. + """ + if reindex: + logger.info(f'Deleting fingerprints and Resources related to {package.package_url}') + package.approximatedirectorycontentindex_set.all().delete() + package.approximatedirectorystructureindex_set.all().delete() + package.exactfileindex_set.all().delete() + package.resources.all().delete() + + scan_index_errors = [] + try: + logger.info(f'Indexing Resources and fingerprints related to {package.package_url} from scan data') + for resource in scan_data.get('files', []): + r, _, _ = merge_or_create_resource(package, resource) + path = r.path + sha1 = r.sha1 + if sha1: + _, _ = ExactFileIndex.index( + sha1=sha1, + package=package + ) + + resource_extra_data = resource.get('extra_data', {}) + directory_content_fingerprint = resource_extra_data.get('directory_content', '') + directory_structure_fingerprint = resource_extra_data.get('directory_structure', '') + + if directory_content_fingerprint: + _, _ = ApproximateDirectoryContentIndex.index( + directory_fingerprint=directory_content_fingerprint, + resource_path=path, + package=package, + ) + if directory_structure_fingerprint: + _, _ = ApproximateDirectoryStructureIndex.index( + directory_fingerprint=directory_structure_fingerprint, + resource_path=path, + package=package, + ) + + except Exception as e: + msg = get_error_message(e) + scan_index_errors.append(msg) + logger.error(msg) + + return scan_index_errors + + +def index_package(scannable_uri, package, scan_data, summary_data, reindex=False): + scan_index_errors = [] + try: + indexing_errors = index_package_files(package, scan_data, reindex=reindex) + scan_index_errors.extend(indexing_errors) + declared_license_expression = summary_data.get('declared_license_expression') + declared_license_expression_spdx = None + if declared_license_expression: + declared_license_expression_spdx = build_spdx_license_expression(declared_license_expression) + + other_license_expressions = summary_data.get('other_license_expressions', []) + other_license_expressions = [l['value'] for l in other_license_expressions if l['value']] + other_license_expression = combine_expressions(other_license_expressions) + + copyright = '' + declared_holder = summary_data.get('declared_holder') + if declared_holder: + copyright = f'Copyright (c) {declared_holder}' + + values_by_updateable_fields = { + 'summary': summary_data, + 'declared_license_expression': declared_license_expression, + 'declared_license_expression_spdx': declared_license_expression_spdx, + 'other_license_expression': other_license_expression, + 'copyright': copyright, + } + + updated_fields = [] + for field, value in values_by_updateable_fields.items(): + p_val = getattr(package, field) + if ( + (not p_val and value) + or reindex + ): + setattr(package, field, value) + entry = dict( + field=field, + old_value=p_val, + new_value=value, + ) + updated_fields.append(entry) + + if updated_fields: + data = { + 'updated_fields': updated_fields, + } + package.append_to_history( + 'Package field values have been updated.', + data=data, + save=True, + ) + + scannable_uri.scan_status = ScannableURI.SCAN_INDEXED + except Exception as e: + traceback_message = traceback.format_exc() + error_message = traceback_message + '\n' + # TODO: We should rerun the specific indexers that have failed + if scan_index_errors: + error_message += '\n'.join(scan_index_errors) + scannable_uri.index_error = error_message + scannable_uri.scan_status = ScannableURI.SCAN_INDEX_FAILED diff --git a/minecode/management/scanning.py b/minecode/management/scanning.py deleted file mode 100644 index 9639649e..00000000 --- a/minecode/management/scanning.py +++ /dev/null @@ -1,482 +0,0 @@ -# -# Copyright (c) 2018 by nexB, Inc. http://www.nexb.com/ - All rights reserved. -# -from uuid import uuid4 -import hashlib -import logging -import sys -import time -import json - -import attr -import requests - -from django.conf import settings - -from minecode.management.commands import VerboseCommand - -logger = logging.getLogger(__name__) -logging.basicConfig(stream=sys.stdout) -logger.setLevel(logging.INFO) - -# sleep duration in seconds when the queue is empty -SLEEP_WHEN_EMPTY = 1 - -# in seconds -REQUEST_TIMEOUT = 120 - -# Only SCANCODEIO_URL can be provided through setting -SCANCODEIO_URL = settings.SCANCODEIO_URL -SCANCODEIO_API_URL = f'{SCANCODEIO_URL.rstrip("/")}/api/' if SCANCODEIO_URL else None -SCANCODEIO_API_URL_PROJECTS = f'{SCANCODEIO_API_URL}projects/' if SCANCODEIO_API_URL else None - -# Authentication with single API key -SCANCODEIO_API_KEY = settings.SCANCODEIO_API_KEY -SCANCODEIO_AUTH_HEADERS = { - 'Authorization': f'Token {SCANCODEIO_API_KEY}' -} if SCANCODEIO_API_KEY else {} - - -@attr.attrs(slots=True) -class Scan(object): - """ - Represent a scan record as returned by the ScanCode.io API /scans/ - endpoint. - """ - # this is the API endpoint full URL: - # "url": "https://scancode.io/api/scans/ac85c2f0-09b9-4ca1-b0e4-91523a636ccf/", - url = attr.ib(default=None) - # this is the UUDI for for scan: - # "uuid": "ac85c2f0-09b9-4ca1-b0e4-91523a636ccf", - uuid = attr.ib(default=None) - # The UUID for the scan run - run_uuid = attr.ib(default=None) - # the actual URI being scanned: - # "uri": "https://repo1.maven.org/maven2/io/github/subiyacryolite/jds/3.0.1/jds-3.0.1-sources.jar", - uri = attr.ib(default=None) - # set at creation of a scan request - # "created_date": "2018-06-19T08:33:34.953429Z", - created_date = attr.ib(default=None) - # set at start of the actual fetch+scan: - # "task_start_date": null, - task_start_date = attr.ib(default=None) - # set at end of scanning: - # "task_end_date": null, - task_end_date = attr.ib(default=None) - # null and then 0 on success or 1 or else on failure - task_exitcode = attr.ib(default=None) - # ignore for now - # task_output=attr.ib(default=None) - # "status": 'not started yet', 'failed', 'in progress', 'completed' - status = attr.ib(default=None) - # as a time stamp - execution_time = attr.ib(default=None) - md5 = attr.ib(default=None) - sha1 = attr.ib(default=None) - sha256 = attr.ib(default=None) - sha512 = attr.ib(default=None) - sha1_git = attr.ib(default=None) - filename = attr.ib(default=None) - size = attr.ib(default=None) - - @classmethod - def from_response(cls, url, uuid, runs, input_sources, extra_data={}, **kwargs): - """ - Return a Scan object built from an API response data arguments. - """ - run_data = {} - if len(runs) > 0: - run_data = runs[0] - - run_uuid = run_data.get("uuid") - created_date = run_data.get("created_date") - task_start_date = run_data.get("task_start_date") - task_end_date = run_data.get("task_end_date") - task_exitcode = run_data.get("task_exitcode") - status = run_data.get("status") - execution_time = run_data.get('execution_time') - - if len(input_sources) > 0: - uri = input_sources[0]["download_url"] - - md5 = extra_data.get('md5') - sha1 = extra_data.get('sha1') - sha256 = extra_data.get('sha256') - sha512 = extra_data.get('sha512') - sha1_git = extra_data.get('sha1_git') - filename = extra_data.get('filename') - size = extra_data.get('size') - - return Scan( - url=url, uuid=uuid, run_uuid=run_uuid, uri=uri, - created_date=created_date, task_start_date=task_start_date, - task_end_date=task_end_date, task_exitcode=task_exitcode, - status=status, execution_time=execution_time, - md5=md5, sha1=sha1, sha256=sha256, sha512=sha512, - sha1_git=sha1_git, filename=filename, size=size - ) - - @property - def results_url(self): - url = self.url.rstrip('/') - return f'{url}/results/' - - @property - def not_started(self): - return self.status == 'not_started' - - @property - def queued(self): - return self.status == 'queued' - - @property - def running(self): - return self.status == 'running' - - @property - def success(self): - return self.status == 'success' - - @property - def failure(self): - return self.status == 'failure' - - @property - def stopped(self): - return self.status == 'stopped' - - @property - def stale(self): - return self.status == 'stale' - - -def uri_fingerprint(uri): - """ - Return the SHA1 hex digest of `uri` - """ - encoded_uri = uri.encode('utf-8') - return hashlib.sha1(encoded_uri).hexdigest() - - -def query_scans(uri, api_url=SCANCODEIO_API_URL_PROJECTS, api_auth_headers=SCANCODEIO_AUTH_HEADERS, response_save_loc=''): - """ - Return scan information for `uri` if `uri` has already been scanned by ScanCode.io - """ - payload = {'name': uri_fingerprint(uri)} - response = requests.get(url=api_url, params=payload, headers=api_auth_headers) - response_json = response.json() - if response_save_loc: - with open(response_save_loc, 'w') as f: - json.dump(response_json, f) - if not response.ok: - response.raise_for_status() - results = response_json['results'] - if results and len(results) == 1: - return results[0] - - -def submit_scan( - uri, - package, - api_url=SCANCODEIO_API_URL_PROJECTS, - api_auth_headers=SCANCODEIO_AUTH_HEADERS, - response_save_loc='' -): - """ - Submit a scan request for `uri` to ScanCode.io and return a Scan object on - success. Raise an exception on error. - """ - logger.debug('submit_scan: uri', uri, 'api_url:', api_url, 'api_auth_headers:', api_auth_headers) - package_name = package.name - package_version = package.version - uuid = uuid4() - uuid_str = str(uuid) - uuid_segments = uuid_str.split('-') - uuid_segment = uuid_segments[-1] - - if package_version: - project_name = f'{package_name}-{package_version}-{uuid_segment}' - else: - project_name = f'{package.name}-{uuid_segment}' - - request_args = { - 'name': project_name, - 'pipeline': 'scan_and_fingerprint_package', - 'input_urls': [ - uri - ], - 'execute_now': True - } - - response = requests.post(url=api_url, data=request_args, headers=api_auth_headers) - try: - response_json = response.json() - except json.decoder.JSONDecodeError as e: - msg = f'Error occured when decoding scan request response: {e}' - raise Exception(msg) - if response_save_loc: - with open(response_save_loc, 'w') as f: - json.dump(response_json, f) - - if not response.ok: - if response.status_code == requests.codes.bad_request: - name = response_json.get('name') - if name and 'project with this name already exists.' in name: - query_results = query_scans(uri, api_url=api_url, api_auth_headers=api_auth_headers) - if query_results: - scan = Scan.from_response(**query_results) - else: - response.raise_for_status() - else: - scan = Scan.from_response(**response_json) - uuid = scan.uuid - if not uuid: - msg = 'Failed to to submit scan UUID for URI: "{uri}".\n'.format(**locals()) - msg += repr(response_json) - raise Exception(msg) - return scan - - -def get_scan_url(scan_uuid, api_url=SCANCODEIO_API_URL_PROJECTS, suffix=''): - """ - Return a scancode.io scan API URL built from the Scan UUID `scan_uuid` or - None. Return the basic URL to get scan request information. Optionally adds - a `suffix` (such as /data or /summary) to get scans data. - - For example: - https://scancode.io/api/projects/b15f2dcb-46ef-43e1-b5e3-563871ce59cc/ - """ - - base_url = api_url and api_url.rstrip('/') or '' - url = f'{base_url}/{scan_uuid}/{suffix}' - # scancode.io seems to demand a trailing slash - url = url.rstrip('/') - url = url + '/' - return url - - -def _call_scan_get_api( - scan_uuid, - endpoint='', - api_url=SCANCODEIO_API_URL_PROJECTS, - api_auth_headers=SCANCODEIO_AUTH_HEADERS, - timeout=REQUEST_TIMEOUT, -): - """ - Send a get request to the scan API for `scan_uuid` and return response - mapping from a JSON response. Call either the plain scan enpoint or the data - or summary endpoints based on the value of the `endpoint `arg. Raise an - exception on error. - """ - scan_url = get_scan_url(scan_uuid, api_url=api_url, suffix=endpoint) - try: - response = requests.get(url=scan_url, timeout=timeout, headers=api_auth_headers) - if not response.ok: - response.raise_for_status() - except Exception: - # Ensure that exceptions are passed up the call stack, so they can be - # caught when _call_scan_get_api is called by another function. - raise - return response.json() - - -def _get_scan_info( - scan_uuid, - api_url=SCANCODEIO_API_URL_PROJECTS, - api_auth_headers=SCANCODEIO_AUTH_HEADERS, - timeout=REQUEST_TIMEOUT, - get_scan_info_save_loc='' -): - """ - Return a mapping of project info for `scan_uuid` fetched from ScanCode.io or None. - Raise an exception on error. - """ - results = _call_scan_get_api( - scan_uuid, - endpoint='', - api_url=api_url, - api_auth_headers=api_auth_headers, - timeout=timeout - ) - if get_scan_info_save_loc: - with open(get_scan_info_save_loc, 'w') as f: - json.dump(results, f) - return results - - -def get_scan_info( - scan_uuid, - api_url=SCANCODEIO_API_URL_PROJECTS, - api_auth_headers=SCANCODEIO_AUTH_HEADERS, - timeout=REQUEST_TIMEOUT, - get_scan_info_save_loc='' -): - """ - Return a Scan object for `scan_uuid` fetched from ScanCode.io or None. - Raise an exception on error. - """ - results = _get_scan_info( - scan_uuid=scan_uuid, - api_url=api_url, - api_auth_headers=api_auth_headers, - timeout=timeout, - get_scan_info_save_loc=get_scan_info_save_loc, - ) - return Scan.from_response(**results) - - -def get_scan_data( - scan_uuid, - api_url=SCANCODEIO_API_URL_PROJECTS, - api_auth_headers=SCANCODEIO_AUTH_HEADERS, - timeout=REQUEST_TIMEOUT, - get_scan_data_save_loc='' -): - """ - Return scan details data as a mapping for a `scan_uuid` fetched from - ScanCode.io or None. Raise an exception on error. - """ - # FIXME: we should return a temp location instead - results = _call_scan_get_api( - scan_uuid, - endpoint='results', - api_url=api_url, - api_auth_headers=api_auth_headers, - timeout=timeout - ) - if get_scan_data_save_loc: - with open(get_scan_data_save_loc, 'w') as f: - json.dump(results, f) - return results - - -def get_scan_summary( - scan_uuid, - api_url=SCANCODEIO_API_URL_PROJECTS, - api_auth_headers=SCANCODEIO_AUTH_HEADERS, - timeout=REQUEST_TIMEOUT, - get_scan_data_save_loc='' -): - """ - Return scan summary data as a mapping for a `scan_uuid` fetched from - ScanCode.io or None. Raise an exception on error. - """ - # FIXME: we should return a temp location instead - results = _call_scan_get_api( - scan_uuid, - endpoint='summary', - api_url=api_url, - api_auth_headers=api_auth_headers, - timeout=timeout - ) - if get_scan_data_save_loc: - with open(get_scan_data_save_loc, 'w') as f: - json.dump(results, f) - return results - - -class ScanningCommand(VerboseCommand): - """ - Base command class for processing ScannableURIs. - """ - # subclasses must override - logger = None - - api_url = SCANCODEIO_API_URL_PROJECTS - - api_auth_headers = SCANCODEIO_AUTH_HEADERS - - def add_arguments(self, parser): - parser.add_argument( - '--exit-on-empty', - dest='exit_on_empty', - default=False, - action='store_true', - help='Do not loop forever. Exit when the queue is empty.') - - parser.add_argument( - '--max-uris', - dest='max_uris', - default=0, - action='store', - help='Limit the number of Scannable URIs processed to a maximum number. ' - '0 means no limit. Used only for testing.') - - def handle(self, *args, **options): - exit_on_empty = options.get('exit_on_empty') - max_uris = options.get('max_uris', 0) - - uris_counter = self.process_scans( - exit_on_empty=exit_on_empty, - max_uris=max_uris, - # Pass options to allow subclasses to add their own options - options=options - ) - self.stdout.write('Processed {} ScannableURI.'.format(uris_counter)) - - @classmethod - def process_scans(cls, exit_on_empty=False, max_uris=0, **kwargs): - """ - Run an infinite scan processing loop. Return a processed URis count. - - Get the next available candidate ScannableURI and request a scan from - ScanCode.io. Loops forever and sleeps a short while if there are no - ScannableURI left to scan. - """ - uris_counter = 0 - sleeping = False - - while True: - # Wait before processing anything - time.sleep(3) - - if cls.MUST_STOP: - cls.logger.info('Graceful exit of the scan processing loop.') - break - - if max_uris and uris_counter >= max_uris: - cls.logger.info('max_uris requested reached: exiting scan processing loop.') - break - - scannable_uri = cls.get_next_uri() - - if not scannable_uri: - if exit_on_empty: - cls.logger.info('exit-on-empty requested: No more scannable URIs, exiting...') - break - - # Only log a single message when we go to sleep - if not sleeping: - sleeping = True - cls.logger.info('No more scannable URIs, sleeping for at least {} seconds...'.format(SLEEP_WHEN_EMPTY)) - - time.sleep(SLEEP_WHEN_EMPTY) - continue - - cls.logger.info('Processing scannable URI: {}'.format(scannable_uri)) - - cls.process_scan(scannable_uri, **kwargs) - uris_counter += 1 - sleeping = False - - return uris_counter - - @classmethod - def get_next_uri(self): - """ - Return a locked ScannableURI for processing. - Subclasses must implement - - Typically something like: - with transaction.atomic(): - scannable_uri = ScannableURI.objects.get_next_scannable() - """ - pass - - @classmethod - def process_scan(scannable_uri, **kwargs): - """ - Process a single `scannable_uri` ScannableURI. Subclasses must implement. - If sucessfully processed the ScannableURI must be updated accordingly. - """ - pass diff --git a/minecode/model_utils.py b/minecode/model_utils.py index b5265428..78a966eb 100644 --- a/minecode/model_utils.py +++ b/minecode/model_utils.py @@ -24,7 +24,7 @@ logger.setLevel(logging.INFO) -def add_package_to_scan_queue(package, reindex_uri=False): +def add_package_to_scan_queue(package, reindex_uri=False, priority=0): """ Add a Package `package` to the scan queue """ @@ -33,6 +33,7 @@ def add_package_to_scan_queue(package, reindex_uri=False): uri=uri, package=package, reindex_uri=reindex_uri, + priority=priority, ) if scannable_uri_created: logger.debug(' + Inserted ScannableURI\t: {}'.format(uri)) diff --git a/minecode/tests/create_scanning_fixtures.py b/minecode/tests/create_scanning_fixtures.py deleted file mode 100644 index 831ee038..00000000 --- a/minecode/tests/create_scanning_fixtures.py +++ /dev/null @@ -1,60 +0,0 @@ -# -*- coding: utf-8 -*- -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# purldb is a trademark of nexB Inc. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/nexB/purldb for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. -# - -import os - -from django.test import TestCase - -from minecode.models import ScannableURI -from packagedb.models import Package -from minecode.management import scanning -from minecode.utils_test import JsonBasedTesting -from minecode.management.commands.request_scans import Command as RequestScansCommand -from minecode.management.commands.process_scans import Command as ProcessScansCommand - - -class ScanCodeIOAPIHelperFunctionTest(JsonBasedTesting, TestCase): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') - - def setUp(self): - self.package1, _ = Package.objects.get_or_create( - download_url='https://repo1.maven.org/maven2/maven/wagon-api/20040705.181715/wagon-api-20040705.181715.jar', - type='maven', - namespace='', - name='wagon-api', - version='20040705.181715', - ) - self.scannable_uri1, _ = ScannableURI.objects.get_or_create( - uri='https://repo1.maven.org/maven2/maven/wagon-api/20040705.181715/wagon-api-20040705.181715.jar', - package=self.package1 - ) - - def generate_api_responses(self): - scan_request_response_loc = self.get_test_loc('scancodeio/scan_request_response.json') - RequestScansCommand.process_scan( - self.scannable_uri1, - response_save_loc=scan_request_response_loc, - options={} - ) - get_scan_info_save_loc = self.get_test_loc('scancodeio/get_scan_info.json') - get_scan_data_save_loc = self.get_test_loc('scancodeio/get_scan_data.json') - ProcessScansCommand.process_scan( - self.scannable_uri1, - get_scan_info_save_loc=get_scan_info_save_loc, - get_scan_data_save_loc=get_scan_data_save_loc - ) - scan_exists_for_uri_save_loc = self.get_test_loc('scancodeio/scan_exists_for_uri.json') - RequestScansCommand.process_scan( - self.scannable_uri1, - response_save_loc=scan_exists_for_uri_save_loc, - options={} - ) - scan_request_lookup_loc = self.get_test_loc('scancodeio/scan_request_lookup.json') - response = scanning.query_scans(uri='https://repo1.maven.org/maven2/maven/wagon-api/20040705.181715/wagon-api-20040705.181715.jar', response_save_loc=scan_request_lookup_loc) diff --git a/minecode/tests/test_process_scans.py b/minecode/tests/test_indexing.py similarity index 54% rename from minecode/tests/test_process_scans.py rename to minecode/tests/test_indexing.py index 442efdcd..f4ccc236 100644 --- a/minecode/tests/test_process_scans.py +++ b/minecode/tests/test_indexing.py @@ -10,14 +10,9 @@ import json import os -from mock import Mock -from mock import patch - from matchcode.models import ExactFileIndex -from minecode.management.commands.process_scans import Command -from minecode.management.commands.process_scans import get_scan_status -from minecode.management.commands.process_scans import index_package_files -from minecode.management.scanning import Scan + +from minecode.management import indexing from minecode.models import ScannableURI from minecode.utils_test import MiningTestCase from minecode.utils_test import JsonBasedTesting @@ -25,7 +20,7 @@ from packagedb.models import Resource -class ProcessScansTest(MiningTestCase, JsonBasedTesting): +class IndexingTest(MiningTestCase, JsonBasedTesting): BASE_DIR = os.path.join(os.path.dirname(__file__), 'testfiles') def setUp(self): @@ -37,67 +32,35 @@ def setUp(self): version='20040705.181715' ) - def test_ProcessScansTest_get_scan_status(self): - scan = Scan() - scan.status = 'not_started' - self.assertEqual(ScannableURI.SCAN_SUBMITTED, get_scan_status(scan)) - scan.status = 'queued' - self.assertEqual(ScannableURI.SCAN_SUBMITTED, get_scan_status(scan)) - scan.status = 'running' - self.assertEqual(ScannableURI.SCAN_IN_PROGRESS, get_scan_status(scan)) - scan.status = 'success' - self.assertEqual(ScannableURI.SCAN_COMPLETED, get_scan_status(scan)) - scan.status = 'failure' - self.assertEqual(ScannableURI.SCAN_FAILED, get_scan_status(scan)) - scan.status = 'stopped' - self.assertEqual(ScannableURI.SCAN_FAILED, get_scan_status(scan)) - scan.status = 'stale' - self.assertEqual(ScannableURI.SCAN_FAILED, get_scan_status(scan)) - scan.status = 'asdf' - self.assertRaises(Exception, get_scan_status, scan) - def test_ProcessScansTest_index_package_files(self): scan_data_loc = self.get_test_loc('scancodeio/get_scan_data.json') with open(scan_data_loc, 'rb') as f: scan_data = json.loads(f.read()) - self.assertEqual(0, len(index_package_files(self.package1, scan_data))) + self.assertEqual(0, len(indexing.index_package_files(self.package1, scan_data))) result = Resource.objects.filter(package=self.package1) self.assertEqual(64, len(result)) results = [r.to_dict() for r in result] expected_resources_loc = self.get_test_loc('scancodeio/get_scan_data_expected_resources.json') self.check_expected_results(results, expected_resources_loc, regen=False) - @patch('requests.get') - def test_ProcessScansTest_process_scan(self, mock_get): - # Set up mock responses - mock_scan_info_response = Mock() - scan_info_loc = self.get_test_loc('scancodeio/get_scan_info.json') - with open(scan_info_loc, 'rb') as f: - mock_scan_info_response.json.return_value = json.loads(f.read()) - - mock_scan_data_response = Mock() + def test_ProcessScansTest_process_scan(self): scan_data_loc = self.get_test_loc('scancodeio/get_scan_data.json') with open(scan_data_loc, 'rb') as f: - mock_scan_data_response.json.return_value = json.loads(f.read()) + scan_data = json.load(f) - mock_scan_summary_response = Mock() scan_summary_loc = self.get_test_loc('scancodeio/scan_summary_response.json') with open(scan_summary_loc, 'rb') as f: - mock_scan_summary_response.json.return_value = json.loads(f.read()) - - mock_get.side_effect = [mock_scan_info_response, mock_scan_data_response, mock_scan_summary_response] + scan_summary = json.load(f) # Set up ScannableURI - scan_uuid = '54dc4afe-70ea-4f1c-9ed3-989efd9a991f' scannable_uri = ScannableURI.objects.create( uri='https://repo1.maven.org/maven2/maven/wagon-api/20040705.181715/wagon-api-20040705.181715.jar', - scan_uuid=scan_uuid, scan_status=ScannableURI.SCAN_COMPLETED, package=self.package1 ) # Run test - Command.process_scan(scannable_uri) + indexing.index_package(scannable_uri, self.package1, scan_data, scan_summary) # Make sure that we get license_expression and copyright from the summary self.assertEqual('apache-2.0', self.package1.declared_license_expression) @@ -107,4 +70,3 @@ def test_ProcessScansTest_process_scan(self, mock_get): self.assertEqual(64, result.count()) result = ExactFileIndex.objects.filter(package=self.package1) self.assertEqual(45, result.count()) - diff --git a/minecode/tests/test_request_scans.py b/minecode/tests/test_request_scans.py deleted file mode 100644 index 4cbf1ad1..00000000 --- a/minecode/tests/test_request_scans.py +++ /dev/null @@ -1,73 +0,0 @@ -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# purldb is a trademark of nexB Inc. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/nexB/purldb for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. -# - -import json -import os - -from mock import Mock -from mock import patch - -from django.db.models import Q - -from minecode.management.commands.request_scans import Command -from minecode.utils_test import MiningTestCase -from packagedb.models import Package -from minecode.models import ScannableURI - - -class RequestScansTest(MiningTestCase): - BASE_DIR = os.path.join(os.path.dirname(__file__), 'testfiles') - - def setUp(self): - self.package1 = Package.objects.create( - download_url='https://repo1.maven.org/maven2/maven/wagon-api/20040705.181715/wagon-api-20040705.181715.jar', - namespace='', - name='wagon-api', - version='20040705.181715' - ) - - @patch('requests.post') - def test_RequestScansTest_request_scan(self, mock_post): - # Set up mock responses - mock_scan_request_response = Mock() - scan_request_loc = self.get_test_loc('scancodeio/scan_request_response.json') - with open(scan_request_loc, 'rb') as f: - mock_scan_request_response.json.return_value = json.loads(f.read()) - - mock_post.side_effect = [mock_scan_request_response] - - # Set up ScannableURI - scannable_uri1 = ScannableURI.objects.create( - uri='https://repo1.maven.org/maven2/maven/wagon-api/20040705.181715/wagon-api-20040705.181715.jar', - scan_status=ScannableURI.SCAN_NEW, - package=self.package1 - ) - - for scannable_uri in ScannableURI.objects.all(): - # Run test - Command.process_scan(scannable_uri, options={}) - - result = ScannableURI.objects.get(uri='https://repo1.maven.org/maven2/maven/wagon-api/20040705.181715/wagon-api-20040705.181715.jar') - self.assertEqual(ScannableURI.SCAN_SUBMITTED, result.scan_status) - - def test_RequestScansTest_limit_scan_request(self): - # Set up Package and ScannableURI - package1 = Package.objects.create(download_url='example.com', name='Foo', version='1.23') - scannable_uri1 = ScannableURI.objects.create( - uri='https://repo1.maven.org/maven2/maven/wagon-api/20040705.181715/wagon-api-20040705.181715.jar', - scan_status=ScannableURI.SCAN_NEW, - package=package1 - ) - - for scannable_uri in ScannableURI.objects.all(): - # Run test, no API call should be made because `max_scan_requests` is 0 - Command.process_scan(scannable_uri, options={'max_scan_requests': 0}) - - result = ScannableURI.objects.get(uri='https://repo1.maven.org/maven2/maven/wagon-api/20040705.181715/wagon-api-20040705.181715.jar') - self.assertEqual(ScannableURI.SCAN_NEW, result.scan_status) diff --git a/minecode/tests/test_scanning.py b/minecode/tests/test_scanning.py deleted file mode 100644 index 632e4e3b..00000000 --- a/minecode/tests/test_scanning.py +++ /dev/null @@ -1,194 +0,0 @@ -# -*- coding: utf-8 -*- -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# purldb is a trademark of nexB Inc. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/nexB/purldb for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. -# - -import json -import os - -import attr -import mock - -from django.test import TestCase as DjangoTestCase - -from minecode.management import scanning -from minecode.utils_test import JsonBasedTesting -from packagedb.models import Package - - -class ScanCodeIOAPIHelperFunctionTest(JsonBasedTesting, DjangoTestCase): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') - - def setUp(self): - self.package1 = Package.objects.create( - type='maven', - namespace='maven', - name='wagon-api', - version='20040705.181715', - download_url='https://repo1.maven.org/maven2/maven/wagon-api/20040705.181715/wagon-api-20040705.181715.jar', - ) - - @mock.patch('requests.get') - def testscanning_query_scans(self, mock_get): - mock_get.return_value = mock.Mock(ok=True) - scan_info_response_loc = self.get_test_loc('scancodeio/scan_request_lookup.json') - with open(scan_info_response_loc, 'rb') as f: - mock_get.return_value.json.return_value = json.loads(f.read()) - - api_url = 'http://127.0.0.1:8001/api/' - api_auth_headers = {} - uri = 'https://repo1.maven.org/maven2/maven/wagon-api/20040705.181715/wagon-api-20040705.181715.jar' - response = scanning.query_scans(uri=uri, api_url=api_url, api_auth_headers=api_auth_headers) - result = scanning.Scan.from_response(**response) - - expected = scanning.Scan( - url='http://127.0.0.1:8001/api/projects/c3b8d1ab-4811-4ced-84af-080997ef1a1a/', - uuid='c3b8d1ab-4811-4ced-84af-080997ef1a1a', - run_uuid='336e18e3-fd68-4375-9bf2-87090dc5c726', - uri='https://repo1.maven.org/maven2/maven/wagon-api/20040705.181715/wagon-api-20040705.181715.jar', - created_date='2023-05-19T00:45:29.451929Z', - task_start_date='2023-05-19T00:45:29.461599Z', - task_end_date='2023-05-19T00:45:39.251824Z', - task_exitcode=0, - status='success', - execution_time=9 - ) - result = attr.asdict(result) - expected = attr.asdict(expected) - self.assertEqual(expected, result) - - @mock.patch('requests.post') - def testscanning_submit_scan(self, mock_post): - test_loc = self.get_test_loc('scancodeio/scan_request_response.json') - mock_post.return_value = mock.Mock(ok=True) - with open(test_loc, 'rb') as f: - mock_post.return_value.json.return_value = json.loads(f.read()) - api_url = 'http://127.0.0.1:8001/api/' - api_auth_headers = {} - uri = 'https://repo1.maven.org/maven2/maven/wagon-api/20040705.181715/wagon-api-20040705.181715.jar' - result = scanning.submit_scan( - uri=uri, - package=self.package1, - api_url=api_url, - api_auth_headers=api_auth_headers - ) - expected = scanning.Scan( - url='http://127.0.0.1:8001/api/projects/c3b8d1ab-4811-4ced-84af-080997ef1a1a/', - uuid='c3b8d1ab-4811-4ced-84af-080997ef1a1a', - run_uuid='336e18e3-fd68-4375-9bf2-87090dc5c726', - uri='https://repo1.maven.org/maven2/maven/wagon-api/20040705.181715/wagon-api-20040705.181715.jar', - created_date='2023-05-19T00:45:29.451929Z', - task_start_date=None, - task_end_date=None, - task_exitcode=None, - status='not_started', - execution_time=None, - ) - expected = attr.asdict(expected) - result = attr.asdict(result) - self.assertEqual(expected, result) - - @mock.patch('requests.post') - @mock.patch('requests.get') - def testscanning_submit_scan_uri_exists(self, mock_post, mock_get): - self.maxDiff = None - mock_post.return_value = mock.Mock(ok=False) - scan_request_response_loc = self.get_test_loc('scancodeio/scan_exists_for_uri.json') - with open(scan_request_response_loc, 'rb') as f: - mock_post.return_value.json.return_value = json.loads(f.read()) - - mock_get.return_value = mock.Mock(ok=True) - scan_info_response_loc = self.get_test_loc('scancodeio/scan_request_response.json') - with open(scan_info_response_loc, 'rb') as f: - mock_get.return_value.json.return_value = json.loads(f.read()) - - api_url = 'http://127.0.0.1:8001/api/' - api_auth_headers = {} - uri = 'https://repo1.maven.org/maven2/maven/wagon-api/20040705.181715/wagon-api-20040705.181715.jar' - result = scanning.submit_scan( - uri=uri, - package=self.package1, - api_url=api_url, - api_auth_headers=api_auth_headers - ) - - expected = scanning.Scan( - url='http://127.0.0.1:8001/api/projects/c3b8d1ab-4811-4ced-84af-080997ef1a1a/', - uuid='c3b8d1ab-4811-4ced-84af-080997ef1a1a', - run_uuid='336e18e3-fd68-4375-9bf2-87090dc5c726', - uri='https://repo1.maven.org/maven2/maven/wagon-api/20040705.181715/wagon-api-20040705.181715.jar', - created_date='2023-05-19T00:45:29.451929Z', - task_start_date=None, - task_end_date=None, - task_exitcode=None, - status='not_started', - execution_time=None, - ) - - expected = attr.asdict(expected) - result = attr.asdict(result) - self.assertEqual(expected, result) - - def testscanning_get_scan_url(self): - scan_uuid = '177eb27a-25d2-4ef0-b608-5a84ea9b1ef1' - api_url_projects = 'http://127.0.0.1:8001/api/projects/' - suffix = 'results' - result = scanning.get_scan_url(scan_uuid=scan_uuid, api_url=api_url_projects) - expected = 'http://127.0.0.1:8001/api/projects/177eb27a-25d2-4ef0-b608-5a84ea9b1ef1/' - self.assertEqual(expected, result) - result_with_suffix = scanning.get_scan_url(scan_uuid=scan_uuid, api_url=api_url_projects, suffix=suffix) - expected_with_suffix = 'http://127.0.0.1:8001/api/projects/177eb27a-25d2-4ef0-b608-5a84ea9b1ef1/results/' - self.assertEqual(expected_with_suffix, result_with_suffix) - - @mock.patch('requests.get') - def testscanning_get_scan_info(self, mock_get): - test_loc = self.get_test_loc('scancodeio/get_scan_info.json') - mock_get.return_value = mock.Mock(ok=True) - with open(test_loc, 'rb') as f: - mock_get.return_value.json.return_value = json.loads(f.read()) - scan_uuid = '54dc4afe-70ea-4f1c-9ed3-989efd9a991f' - api_url = 'http://127.0.0.1:8001/api/' - api_auth_headers = {} - result = scanning.get_scan_info(scan_uuid=scan_uuid, api_url=api_url, api_auth_headers=api_auth_headers) - expected = scanning.Scan( - url='http://127.0.0.1:8001/api/projects/c3b8d1ab-4811-4ced-84af-080997ef1a1a/', - uuid='c3b8d1ab-4811-4ced-84af-080997ef1a1a', - run_uuid='336e18e3-fd68-4375-9bf2-87090dc5c726', - uri='https://repo1.maven.org/maven2/maven/wagon-api/20040705.181715/wagon-api-20040705.181715.jar', - created_date='2023-05-19T00:45:29.451929Z', - task_start_date='2023-05-19T00:45:29.461599Z', - task_end_date='2023-05-19T00:45:39.251824Z', - task_exitcode=0, - status='success', - execution_time=9, - md5='57431f2f6d5841eebdb964b04091b8ed', - sha1='feff0d7bacd11d37a9c96daed87dc1db163065b1', - sha256='05155c2c588ac5922d930eeb1e8a1da896956f4696ae758d110708e9f095baba', - sha512='4431f237bcdfee5d2b86b1b3f01c8abaa160d5b7007c63e6281845a3f920d89fdb2e4044f97694ddef91e174d9dd30e5016bbad46eec2d68af200a47e9cedd85', - sha1_git='ad18d88bdae8449e7c170f8e7db1bfe336dbb4e0', - filename='wagon-api-20040705.181715.jar', - size=47069, - ) - expected = attr.asdict(expected) - result = attr.asdict(result) - self.assertEqual(expected, result) - - @mock.patch('requests.get') - def testscanning_get_scan_data(self, mock_get): - test_loc = self.get_test_loc('scancodeio/get_scan_data.json') - mock_get.return_value = mock.Mock(ok=True) - with open(test_loc, 'rb') as f: - mock_get.return_value.json.return_value = json.loads(f.read()) - scan_uuid = '54dc4afe-70ea-4f1c-9ed3-989efd9a991f' - api_url = 'http://127.0.0.1:8001/api/' - api_auth_headers = {} - expected_loc = self.get_test_loc('scancodeio/get_scan_data_expected.json') - result = scanning.get_scan_data(scan_uuid=scan_uuid, api_url=api_url, api_auth_headers=api_auth_headers) - with open(expected_loc, 'rb') as f: - expected = json.loads(f.read()) - self.assertEqual(expected['files'], result['files']) diff --git a/packagedb/models.py b/packagedb/models.py index 51cba3ab..a4d8e7ac 100644 --- a/packagedb/models.py +++ b/packagedb/models.py @@ -628,7 +628,7 @@ def reindex(self): """ from minecode.model_utils import add_package_to_scan_queue - add_package_to_scan_queue(self, reindex_uri=True) + add_package_to_scan_queue(self, reindex_uri=True, priority=100) def update_fields(self, save=False, **values_by_fields): """ diff --git a/packagedb/tests/test_api.py b/packagedb/tests/test_api.py index 94cba66f..da14b8cc 100644 --- a/packagedb/tests/test_api.py +++ b/packagedb/tests/test_api.py @@ -11,7 +11,6 @@ import json import os -from django.contrib.postgres.search import SearchVector from django.test import TestCase from django.urls import reverse from django.utils import timezone @@ -512,25 +511,28 @@ def setUp(self): self.scannableuri.scan_uuid = self.scan_uuid self.scannableuri.scan_error = 'error' self.scannableuri.index_error = 'error' - self.scan_request_date = timezone.now() - self.scannableuri.scan_request_date = self.scan_request_date + self.scan_date = timezone.now() + self.scannableuri.scan_date = self.scan_date def test_reindex_package(self): - self.assertEqual(False, self.scannableuri.rescan_uri) + self.assertEqual(1, ScannableURI.objects.all().count()) + response = self.client.get(f'/api/packages/{self.package.uuid}/reindex_package/') + self.assertEqual('pkg:maven/sample/Baz@90.12 has been queued for reindexing', response.data['status']) + self.assertEqual(2, ScannableURI.objects.all().count()) + new_scannable_uri = ScannableURI.objects.exclude(pk=self.scannableuri.pk).first() + self.assertEqual(self.package, new_scannable_uri.package) + self.assertEqual(True, new_scannable_uri.reindex_uri) + self.assertEqual(100, new_scannable_uri.priority) + self.assertEqual(None, new_scannable_uri.scan_error) + self.assertEqual(None, new_scannable_uri.index_error) + self.assertEqual(None, new_scannable_uri.scan_date) + + # Ensure previous ScannableURI was not modified + self.assertEqual(False, self.scannableuri.reindex_uri) self.assertEqual(0, self.scannableuri.priority) - self.assertEqual(self.scan_uuid, self.scannableuri.scan_uuid) self.assertEqual('error', self.scannableuri.scan_error) self.assertEqual('error', self.scannableuri.index_error) - self.assertEqual(self.scan_request_date, self.scannableuri.scan_request_date) - response = self.client.get(f'/api/packages/{self.package.uuid}/reindex_package/') - self.assertEqual('pkg:maven/sample/Baz@90.12 has been queued for reindexing', response.data['status']) - self.scannableuri.refresh_from_db() - self.assertEqual(True, self.scannableuri.rescan_uri) - self.assertEqual(100, self.scannableuri.priority) - self.assertEqual(None, self.scannableuri.scan_uuid) - self.assertEqual(None, self.scannableuri.scan_error) - self.assertEqual(None, self.scannableuri.index_error) - self.assertEqual(None, self.scannableuri.scan_request_date) + self.assertEqual(self.scan_date, self.scannableuri.scan_date) class PackageApiPurlFilterTestCase(JsonBasedTesting, TestCase): @@ -724,7 +726,7 @@ class CollectApiTestCase(JsonBasedTesting, TestCase): test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') def setUp(self): - package_download_url = 'http://anotherexample.com' + self.package_download_url = 'http://anotherexample.com' self.package_data = { 'type': 'maven', 'namespace': 'sample', @@ -732,7 +734,7 @@ def setUp(self): 'version': '90.12', 'qualifiers': '', 'subpath': '', - 'download_url': package_download_url, + 'download_url': self.package_download_url, 'filename': 'Baz.zip', 'sha1': 'testsha1-3', 'md5': 'testmd5-3', @@ -742,9 +744,9 @@ def setUp(self): self.package.refresh_from_db() self.scannableuri = ScannableURI.objects.create( package=self.package, - uri=package_download_url, + uri=self.package_download_url, ) - self.scannableuri.scan_status = ScannableURI.SCAN_INDEXED + self.scannableuri.scan_status = ScannableURI.SCAN_INDEX_FAILED self.scan_uuid = uuid4() self.scannableuri.scan_uuid = self.scan_uuid self.scannableuri.scan_error = 'error' @@ -752,7 +754,7 @@ def setUp(self): self.scan_request_date = timezone.now() self.scannableuri.scan_request_date = self.scan_request_date - package_download_url2 = 'http://somethingelse.org' + self.package_download_url2 = 'http://somethingelse.org' self.package_data2 = { 'type': 'npm', 'namespace': 'example', @@ -760,7 +762,7 @@ def setUp(self): 'version': '56.78', 'qualifiers': '', 'subpath': '', - 'download_url': package_download_url2, + 'download_url': self.package_download_url2, 'filename': 'Bar.zip', 'sha1': 'testsha1-2', 'md5': 'testmd5-2', @@ -770,16 +772,16 @@ def setUp(self): self.package2.refresh_from_db() self.scannableuri2 = ScannableURI.objects.create( package=self.package2, - uri=package_download_url2, + uri=self.package_download_url2, ) - self.scannableuri2.scan_status = ScannableURI.SCAN_INDEXED + self.scannableuri2.scan_status = ScannableURI.SCAN_INDEX_FAILED self.scan_uuid2 = uuid4() self.scannableuri2.scan_uuid = self.scan_uuid2 self.scannableuri2.scan_error = 'error' self.scannableuri2.index_error = 'error' self.scan_request_date2 = timezone.now() self.scannableuri2.scan_request_date = self.scan_request_date2 - + def test_package_live(self): purl_str = 'pkg:maven/org.apache.twill/twill-core@0.12.0' download_url = 'https://repo1.maven.org/maven2/org/apache/twill/twill-core/0.12.0/twill-core-0.12.0.jar' @@ -805,7 +807,7 @@ def test_package_live(self): ] self.check_expected_results(result, expected, fields_to_remove=fields_to_remove, regen=False) - + def test_package_api_index_packages_endpoint(self): priority_resource_uris_count = PriorityResourceURI.objects.all().count() self.assertEqual(0, priority_resource_uris_count) @@ -980,21 +982,25 @@ def test_package_api_index_packages_endpoint_all_version_index(self, mock_get_al self.assertEqual([], response.data["unsupported_packages"]) priority_resource_uris_count = PriorityResourceURI.objects.all().count() self.assertEqual(13, priority_resource_uris_count) - + def test_reindex_packages_bulk(self): - self.assertEqual(False, self.scannableuri.rescan_uri) + self.assertEqual(2, ScannableURI.objects.all().count()) + + self.assertEqual(False, self.scannableuri.reindex_uri) self.assertEqual(0, self.scannableuri.priority) self.assertEqual(self.scan_uuid, self.scannableuri.scan_uuid) self.assertEqual('error', self.scannableuri.scan_error) self.assertEqual('error', self.scannableuri.index_error) self.assertEqual(self.scan_request_date, self.scannableuri.scan_request_date) + self.assertEqual(ScannableURI.SCAN_INDEX_FAILED, self.scannableuri.scan_status) - self.assertEqual(False, self.scannableuri2.rescan_uri) + self.assertEqual(False, self.scannableuri2.reindex_uri) self.assertEqual(0, self.scannableuri2.priority) self.assertEqual(self.scan_uuid2, self.scannableuri2.scan_uuid) self.assertEqual('error', self.scannableuri2.scan_error) self.assertEqual('error', self.scannableuri2.index_error) self.assertEqual(self.scan_request_date2, self.scannableuri2.scan_request_date) + self.assertEqual(ScannableURI.SCAN_INDEX_FAILED, self.scannableuri2.scan_status) packages = [ # Existing package @@ -1015,7 +1021,7 @@ def test_reindex_packages_bulk(self): 'pkg:maven/sample/Baz@90.12', 'pkg:npm/example/bar@56.78', ] - + unsupported_purls = [ 'pkg:pypi/does/not-exist@1', ] @@ -1033,23 +1039,18 @@ def test_reindex_packages_bulk(self): self.assertEqual(0, response.data["unqueued_packages_count"]) self.assertEqual([], response.data["unqueued_packages"]) + self.assertEqual(4, ScannableURI.objects.all().count()) + new_scannable_uris = ScannableURI.objects.exclude(pk__in=[self.scannableuri.pk, self.scannableuri2.pk]) + self.assertEqual(2, new_scannable_uris.count()) - self.scannableuri.refresh_from_db() - self.assertEqual(True, self.scannableuri.rescan_uri) - self.assertEqual(100, self.scannableuri.priority) - self.assertEqual(None, self.scannableuri.scan_uuid) - self.assertEqual(None, self.scannableuri.scan_error) - self.assertEqual(None, self.scannableuri.index_error) - self.assertEqual(None, self.scannableuri.scan_request_date) + for scannable_uri in new_scannable_uris: + self.assertEqual(True, scannable_uri.reindex_uri) + self.assertEqual(100, scannable_uri.priority) + self.assertEqual(ScannableURI.SCAN_NEW, scannable_uri.scan_status) + self.assertEqual(None, scannable_uri.scan_error) + self.assertEqual(None, scannable_uri.index_error) + self.assertEqual(None, scannable_uri.scan_date) - self.scannableuri2.refresh_from_db() - self.assertEqual(True, self.scannableuri2.rescan_uri) - self.assertEqual(100, self.scannableuri.priority) - self.assertEqual(None, self.scannableuri2.scan_uuid) - self.assertEqual(None, self.scannableuri2.scan_error) - self.assertEqual(None, self.scannableuri2.index_error) - self.assertEqual(None, self.scannableuri2.scan_request_date) - class ResourceApiTestCase(TestCase): @@ -1126,7 +1127,7 @@ def setUp(self): } self.package = Package.objects.create(**self.package_data) self.package.refresh_from_db() - + def test_api_purl_validation(self): data1 = { "purl": "pkg:npm/foobar@1.1.0", @@ -1151,7 +1152,7 @@ def test_api_purl_validation(self): self.assertEqual( "The provided PackageURL is not valid.", response2.data["message"] ) - + def test_api_purl_validation_unsupported_package_type(self): data1 = { "purl": "pkg:random/foobar@1.1.0", @@ -1169,7 +1170,7 @@ def test_api_purl_validation_unsupported_package_type(self): def test_api_purl_validation_empty_request(self): data1 = {} response1 = self.client.get(f"/api/validate/", data=data1) - + expected = { "errors": { "purl": [ From e5b78b2db3d30b76aaccf5de250c3962146087db Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Wed, 28 Feb 2024 17:51:09 -0800 Subject: [PATCH 12/31] Create command to manage in progress scans #49 #285 Signed-off-by: Jono Yang --- minecode/management/commands/manage_scans.py | 169 +++++++++++++++++++ 1 file changed, 169 insertions(+) create mode 100644 minecode/management/commands/manage_scans.py diff --git a/minecode/management/commands/manage_scans.py b/minecode/management/commands/manage_scans.py new file mode 100644 index 00000000..a361f227 --- /dev/null +++ b/minecode/management/commands/manage_scans.py @@ -0,0 +1,169 @@ +# +# Copyright (c) 2018 by nexB, Inc. http://www.nexb.com/ - All rights reserved. +# + +import time +import logging +import signal +import sys + +from django.db import transaction +from django.utils import timezone + + +from minecode.models import ScannableURI + +from minecode.management.commands import VerboseCommand + + +logger = logging.getLogger(__name__) +logging.basicConfig(stream=sys.stdout) +logger.setLevel(logging.INFO) + + +# sleep duration in seconds when the queue is empty +SLEEP_WHEN_EMPTY = 1 + + +class ScanningCommand(VerboseCommand): + """ + Base command class for processing ScannableURIs. + """ + # subclasses must override + logger = None + + def add_arguments(self, parser): + parser.add_argument( + '--exit-on-empty', + dest='exit_on_empty', + default=False, + action='store_true', + help='Do not loop forever. Exit when the queue is empty.') + + parser.add_argument( + '--max-uris', + dest='max_uris', + default=0, + action='store', + help='Limit the number of Scannable URIs processed to a maximum number. ' + '0 means no limit. Used only for testing.') + + def handle(self, *args, **options): + exit_on_empty = options.get('exit_on_empty') + max_uris = options.get('max_uris', 0) + + uris_counter = self.process_scans( + exit_on_empty=exit_on_empty, + max_uris=max_uris, + # Pass options to allow subclasses to add their own options + options=options + ) + self.stdout.write('Processed {} ScannableURI.'.format(uris_counter)) + + @classmethod + def process_scans(cls, exit_on_empty=False, max_uris=0, **kwargs): + """ + Run an infinite scan processing loop. Return a processed URis count. + + Get the next available candidate ScannableURI and request a scan from + ScanCode.io. Loops forever and sleeps a short while if there are no + ScannableURI left to scan. + """ + uris_counter = 0 + sleeping = False + + while True: + # Wait before processing anything + time.sleep(10) + + if cls.MUST_STOP: + cls.logger.info('Graceful exit of the scan processing loop.') + break + + if max_uris and uris_counter >= max_uris: + cls.logger.info('max_uris requested reached: exiting scan processing loop.') + break + + scannable_uri = cls.get_next_uri() + + if not scannable_uri: + if exit_on_empty: + cls.logger.info('exit-on-empty requested: No more scannable URIs, exiting...') + break + + # Only log a single message when we go to sleep + if not sleeping: + sleeping = True + cls.logger.info('No more scannable URIs, sleeping for at least {} seconds...'.format(SLEEP_WHEN_EMPTY)) + + time.sleep(SLEEP_WHEN_EMPTY) + continue + + cls.logger.info('Processing scannable URI: {}'.format(scannable_uri)) + + cls.process_scan(scannable_uri, **kwargs) + uris_counter += 1 + sleeping = False + + return uris_counter + + @classmethod + def get_next_uri(self): + """ + Return a locked ScannableURI for processing. + Subclasses must implement + + Typically something like: + with transaction.atomic(): + scannable_uri = ScannableURI.objects.get_next_scannable() + """ + pass + + @classmethod + def process_scan(scannable_uri, **kwargs): + """ + Process a single `scannable_uri` ScannableURI. Subclasses must implement. + If sucessfully processed the ScannableURI must be updated accordingly. + """ + pass + + +class Command(ScanningCommand): + + logger = logger + + help = ('Check scancode.io requested scans for status then fetch and process ' + 'completed scans for indexing and updates.') + + def handle(self, *args, **options): + logger.setLevel(self.get_verbosity(**options)) + ScanningCommand.handle(self, *args, **options) + + @classmethod + def get_next_uri(self): + with transaction.atomic(): + scannable_uri = ScannableURI.objects.get_next_processable() + return scannable_uri + + @classmethod + def process_scan(cls, scannable_uri, get_scan_info_save_loc='', get_scan_data_save_loc='', **kwargs): + """ + Manage a ScannableURI based on its status. + - For submitted but not completed scans, check the timestamp of when the scan was submitted, if it has been past some time, then we set the scan as timed out + - For timed out scans, we set that as failed and then create a new one? + """ + logger.info('Checking scan for URI: {}'.format(scannable_uri)) + + if scannable_uri.scan_status in (ScannableURI.SCAN_SUBMITTED, ScannableURI.SCAN_IN_PROGRESS): + scan_duration = timezone.now() - scannable_uri.scan_date + scan_duration_hours = scan_duration.seconds / (60 * 60) + + if scan_duration_hours > 2: + scannable_uri.scan_status = ScannableURI.SCAN_TIMEOUT + scannable_uri.wip_date = None + scannable_uri.save() + logger.info('Scan for URI has timed out: {}'.format(scannable_uri)) + + +# support graceful death when used as a service +signal.signal(signal.SIGTERM, Command.stop_handler) From 42ae3e6679e8f5093d4200bd5a84e80cd4edbf02 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Fri, 1 Mar 2024 18:12:58 -0800 Subject: [PATCH 13/31] Pass pipelines to be run to scancode.io Signed-off-by: Jono Yang --- minecode/api.py | 2 ++ minecode/collectors/maven.py | 2 +- minecode/collectors/npm.py | 2 +- minecode/management/commands/priority_queue.py | 1 + minecode/model_utils.py | 5 ++++- minecode/models.py | 7 +++++++ minecode/visitors/generic.py | 2 +- packagedb/find_source_repo.py | 2 +- 8 files changed, 18 insertions(+), 5 deletions(-) diff --git a/minecode/api.py b/minecode/api.py index 95121d56..269e53f0 100644 --- a/minecode/api.py +++ b/minecode/api.py @@ -108,6 +108,7 @@ def get_next_download_url(self, request, *args, **kwargs): response = { 'scannable_uri_uuid': scannable_uri.uuid, 'download_url': scannable_uri.uri, + 'pipelines': scannable_uri.pipelines, } scannable_uri.scan_status = ScannableURI.SCAN_SUBMITTED scannable_uri.scan_date = timezone.now() @@ -116,6 +117,7 @@ def get_next_download_url(self, request, *args, **kwargs): response = { 'scannable_uri_uuid': "", 'download_url': "", + 'pipelines': [], } return Response(response) diff --git a/minecode/collectors/maven.py b/minecode/collectors/maven.py index ceb0c17a..a4f78422 100644 --- a/minecode/collectors/maven.py +++ b/minecode/collectors/maven.py @@ -251,7 +251,7 @@ def map_maven_package(package_url, package_content): # Submit package for scanning if db_package: - add_package_to_scan_queue(db_package) + add_package_to_scan_queue(db_package, pipelines=['scan_and_fingerprint_package']) return db_package, error diff --git a/minecode/collectors/npm.py b/minecode/collectors/npm.py index 818e645f..5cf229e5 100644 --- a/minecode/collectors/npm.py +++ b/minecode/collectors/npm.py @@ -59,7 +59,7 @@ def map_npm_package(package_url): # Submit package for scanning if db_package: - add_package_to_scan_queue(db_package) + add_package_to_scan_queue(db_package, pipelines=['scan_and_fingerprint_package']) return error diff --git a/minecode/management/commands/priority_queue.py b/minecode/management/commands/priority_queue.py index 1558cbf7..58a907ed 100644 --- a/minecode/management/commands/priority_queue.py +++ b/minecode/management/commands/priority_queue.py @@ -113,6 +113,7 @@ def add_package_to_scan_queue(package): _, scannable_uri_created = ScannableURI.objects.get_or_create( uri=uri, package=package, + pipelines=['scan_and_fingerprint_package'], ) if scannable_uri_created: logger.debug(' + Inserted ScannableURI\t: {}'.format(uri)) diff --git a/minecode/model_utils.py b/minecode/model_utils.py index 78a966eb..0f0848f0 100644 --- a/minecode/model_utils.py +++ b/minecode/model_utils.py @@ -24,13 +24,16 @@ logger.setLevel(logging.INFO) -def add_package_to_scan_queue(package, reindex_uri=False, priority=0): +def add_package_to_scan_queue(package, pipelines, reindex_uri=False, priority=0): """ Add a Package `package` to the scan queue """ + if not pipelines: + raise Exception('pipelines required to add package to scan queue') uri = package.download_url _, scannable_uri_created = ScannableURI.objects.get_or_create( uri=uri, + pipelines=pipelines, package=package, reindex_uri=reindex_uri, priority=priority, diff --git a/minecode/models.py b/minecode/models.py index 16d121cf..86b7b4c4 100644 --- a/minecode/models.py +++ b/minecode/models.py @@ -678,6 +678,13 @@ class ScannableURI(BaseURI): help_text='URL to scan project for this Package', ) + pipelines = models.JSONField( + default=list, + blank=True, + editable=False, + help_text=_('A list of ScanCode.io pipeline names to be run for this scan'), + ) + SCAN_NEW = 0 SCAN_SUBMITTED = 1 SCAN_IN_PROGRESS = 2 diff --git a/minecode/visitors/generic.py b/minecode/visitors/generic.py index 628b3ba8..5ff8b6b6 100644 --- a/minecode/visitors/generic.py +++ b/minecode/visitors/generic.py @@ -51,7 +51,7 @@ def map_generic_package(package_url): # Submit package for scanning if db_package: - add_package_to_scan_queue(db_package) + add_package_to_scan_queue(db_package, pipelines=['scan_and_fingerprint_package']) return error diff --git a/packagedb/find_source_repo.py b/packagedb/find_source_repo.py index 2ec7fe18..a9d09a44 100644 --- a/packagedb/find_source_repo.py +++ b/packagedb/find_source_repo.py @@ -179,7 +179,7 @@ def add_source_repo_to_package_set( for package_set in package_sets.all(): package_set.add_to_package_set(source_repo_package) if created: - add_package_to_scan_queue(source_repo_package) + add_package_to_scan_queue(source_repo_package, pipelines=['scan_and_fingerprint_package']) logger.info(f"\tCreated source repo package {source_purl} for {purl}") else: logger.info( From 5856cde04768f596e4f2dbe52240a6ade6e7a47c Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Mon, 4 Mar 2024 17:56:45 -0800 Subject: [PATCH 14/31] Create scan_queue_workers group #49 #285 * Guard scan_queue API endpoint Signed-off-by: Jono Yang --- minecode/api.py | 6 +- .../commands/create-scan-queue-worker-user.py | 119 ++++++++++++++++++ .../migrations/0036_scannableuri_pipelines.py | 23 ++++ .../migrations/0037_auto_20240304_1933.py | 39 ++++++ minecode/models.py | 2 +- minecode/permissions.py | 9 ++ minecode/tests/test_api.py | 38 ++++-- packagedb/models.py | 2 +- 8 files changed, 221 insertions(+), 17 deletions(-) create mode 100644 minecode/management/commands/create-scan-queue-worker-user.py create mode 100644 minecode/migrations/0036_scannableuri_pipelines.py create mode 100644 minecode/migrations/0037_auto_20240304_1933.py create mode 100644 minecode/permissions.py diff --git a/minecode/api.py b/minecode/api.py index 269e53f0..f4bb6599 100644 --- a/minecode/api.py +++ b/minecode/api.py @@ -7,9 +7,10 @@ # See https://aboutcode.org for more information about nexB OSS projects. # -from django.utils import timezone import json + from django.db import transaction +from django.utils import timezone from packageurl import PackageURL from rest_framework import serializers, status, viewsets from rest_framework.decorators import action @@ -21,6 +22,7 @@ from minecode import priority_router from minecode.management.indexing import index_package_files from minecode.models import PriorityResourceURI, ResourceURI, ScannableURI +from minecode.permissions import IsScanQueueWorkerAPIUser class ResourceURISerializer(serializers.ModelSerializer): @@ -92,10 +94,10 @@ class Meta: fields = '__all__' -# TODO: guard these API endpoints behind an API key class ScannableURIViewSet(viewsets.ModelViewSet): queryset = ScannableURI.objects.all() serializer_class = ScannableURISerializer + permission_classes = [IsScanQueueWorkerAPIUser] @action(detail=False, methods=["get"]) def get_next_download_url(self, request, *args, **kwargs): diff --git a/minecode/management/commands/create-scan-queue-worker-user.py b/minecode/management/commands/create-scan-queue-worker-user.py new file mode 100644 index 00000000..d3996fd2 --- /dev/null +++ b/minecode/management/commands/create-scan-queue-worker-user.py @@ -0,0 +1,119 @@ +# SPDX-License-Identifier: Apache-2.0 +# +# http://nexb.com and https://github.com/nexB/scancode.io +# The ScanCode.io software is licensed under the Apache License version 2.0. +# Data generated with ScanCode.io is provided as-is without warranties. +# ScanCode is a trademark of nexB Inc. +# +# You may not use this software except in compliance with the License. +# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. +# +# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, either express or implied. No content created from +# ScanCode.io should be considered or used as legal advice. Consult an Attorney +# for any legal advice. +# +# ScanCode.io is a free software code scanning tool from nexB Inc. and others. +# Visit https://github.com/nexB/scancode.io for support and download. + +import getpass + +from django.contrib.auth.models import Group, Permission +from django.contrib.auth import get_user_model +from django.contrib.auth.password_validation import validate_password +from django.core import exceptions +from django.core.management.base import BaseCommand +from django.core.management.base import CommandError + +from rest_framework.authtoken.models import Token + + +class Command(BaseCommand): + help = "Create a user and generate an API key for authentication." + requires_migrations_checks = True + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.UserModel = get_user_model() + self.username_field = self.UserModel._meta.get_field( + self.UserModel.USERNAME_FIELD + ) + + def add_arguments(self, parser): + parser.add_argument("username", help="Specifies the username for the user.") + parser.add_argument( + "--no-input", + action="store_false", + dest="interactive", + help="Do not prompt the user for input of any kind.", + ) + + def handle(self, *args, **options): + username = options["username"] + + error_msg = self._validate_username(username) + if error_msg: + raise CommandError(error_msg) + + password = None + if options["interactive"]: + password = self.get_password_from_stdin(username) + + user = self.UserModel._default_manager.create_user(username, password=password) + # Add user to `scan_queue_workers` group + scan_queue_workers_group, _ = Group.objects.get_or_create(name='scan_queue_workers') + scan_queue_workers_group.user_set.add(user) + token, _ = Token._default_manager.get_or_create(user=user) + + if options["verbosity"] >= 1: + msg = f"User {username} created with API key: {token.key}" + self.stdout.write(msg, self.style.SUCCESS) + + def get_password_from_stdin(self, username): + # Validators, such as UserAttributeSimilarityValidator, depends on other user's + # fields data for password validation. + fake_user_data = { + self.UserModel.USERNAME_FIELD: username, + } + + password = None + while password is None: + password1 = getpass.getpass() + password2 = getpass.getpass("Password (again): ") + if password1 != password2: + self.stderr.write("Error: Your passwords didn't match.") + continue + if password1.strip() == "": + self.stderr.write("Error: Blank passwords aren't allowed.") + continue + try: + validate_password(password2, self.UserModel(**fake_user_data)) + except exceptions.ValidationError as err: + self.stderr.write("\n".join(err.messages)) + response = input( + "Bypass password validation and create user anyway? [y/N]: " + ) + if response.lower() != "y": + continue + password = password1 + + return password + + def _validate_username(self, username): + """Validate username. If invalid, return a string error message.""" + if self.username_field.unique: + try: + self.UserModel._default_manager.get_by_natural_key(username) + except self.UserModel.DoesNotExist: + pass + else: + return "Error: That username is already taken." + + try: + self.username_field.clean(username, None) + except exceptions.ValidationError as e: + return "; ".join(e.messages) diff --git a/minecode/migrations/0036_scannableuri_pipelines.py b/minecode/migrations/0036_scannableuri_pipelines.py new file mode 100644 index 00000000..9d50ceb3 --- /dev/null +++ b/minecode/migrations/0036_scannableuri_pipelines.py @@ -0,0 +1,23 @@ +# Generated by Django 5.0.1 on 2024-03-04 19:18 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("minecode", "0035_rename_rescan_uri_scannableuri_reindex_uri"), + ] + + operations = [ + migrations.AddField( + model_name="scannableuri", + name="pipelines", + field=models.JSONField( + blank=True, + default=list, + editable=False, + help_text="A list of ScanCode.io pipeline names to be run for this scan", + ), + ), + ] diff --git a/minecode/migrations/0037_auto_20240304_1933.py b/minecode/migrations/0037_auto_20240304_1933.py new file mode 100644 index 00000000..1b93ee38 --- /dev/null +++ b/minecode/migrations/0037_auto_20240304_1933.py @@ -0,0 +1,39 @@ +# Generated by Django 5.0.1 on 2024-03-04 19:33 + +from django.db import migrations + + +from django.contrib.auth.models import Group, Permission +from django.contrib.contenttypes.models import ContentType + + +def create_scan_queue_workers_group_and_permissions(apps, schema_editor): + """ + Create the `scan_workers` groups and set permissions for it + """ + ScannableURI = apps.get_model('minecode', 'ScannableURI') + scan_queue_workers_group, _ = Group.objects.get_or_create(name='scan_queue_workers') + # Add permissions to `scan_queue_workers` group + scannable_uri_content_type = ContentType.objects.get_for_model(ScannableURI) + names_by_codenames = { + 'change_scannableuri': 'Can change Scannable URI', + 'view_scannableuri': 'Can view Scannable URI', + } + for codename, name in names_by_codenames.items(): + permission, _ = Permission.objects.get_or_create( + codename=codename, + name=name, + content_type=scannable_uri_content_type + ) + scan_queue_workers_group.permissions.add(permission) + + +class Migration(migrations.Migration): + + dependencies = [ + ("minecode", "0036_scannableuri_pipelines"), + ] + + operations = [ + migrations.RunPython(create_scan_queue_workers_group_and_permissions, migrations.RunPython.noop), + ] diff --git a/minecode/models.py b/minecode/models.py index 86b7b4c4..deaba41b 100644 --- a/minecode/models.py +++ b/minecode/models.py @@ -682,7 +682,7 @@ class ScannableURI(BaseURI): default=list, blank=True, editable=False, - help_text=_('A list of ScanCode.io pipeline names to be run for this scan'), + help_text='A list of ScanCode.io pipeline names to be run for this scan', ) SCAN_NEW = 0 diff --git a/minecode/permissions.py b/minecode/permissions.py new file mode 100644 index 00000000..83815cea --- /dev/null +++ b/minecode/permissions.py @@ -0,0 +1,9 @@ +from rest_framework import permissions + + +class IsScanQueueWorkerAPIUser(permissions.BasePermission): + """ + Allow access to a user who is a part of the `scan_queue_workers` group + """ + def has_permission(self, request, view): + return request.user.groups.filter(name='scan_queue_workers').exists() diff --git a/minecode/tests/test_api.py b/minecode/tests/test_api.py index 35f98582..b379c525 100644 --- a/minecode/tests/test_api.py +++ b/minecode/tests/test_api.py @@ -7,33 +7,33 @@ # See https://aboutcode.org for more information about nexB OSS projects. # -from uuid import uuid4 -import json import os -from django.contrib.postgres.search import SearchVector +from django.contrib.auth.models import Group, User from django.test import TestCase -from django.urls import reverse -from django.utils import timezone from rest_framework import status from rest_framework.test import APIClient -from minecode.models import PriorityResourceURI from minecode.models import ScannableURI from minecode.utils_test import JsonBasedTesting -from packagedb.models import Package -from packagedb.models import PackageContentType -from packagedb.models import PackageSet -from packagedb.models import Resource -from minecode.models import ScannableURI +from packagedb.models import Package, Resource -from unittest import mock -from univers.versions import MavenVersion class ScannableURIAPITestCase(JsonBasedTesting, TestCase): test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') def setUp(self): + self.user = User.objects.create_user( + username="username", + email="e@mail.com", + password="secret" + ) + scan_queue_workers_group, _ = Group.objects.get_or_create(name='scan_queue_workers') + scan_queue_workers_group.user_set.add(self.user) + self.auth = f"Token {self.user.auth_token.key}" + self.csrf_client = APIClient(enforce_csrf_checks=True) + self.csrf_client.credentials(HTTP_AUTHORIZATION=self.auth) + self.package1 = Package.objects.create( download_url='https://test-url.com/package1.tar.gz', type='type1', @@ -60,11 +60,17 @@ def setUp(self): def test_api_scannable_uri_list_endpoint(self): response = self.client.get('/api/scan_queue/') + self.assertEqual(response.status_code, status.HTTP_401_UNAUTHORIZED) + + response = self.csrf_client.get('/api/scan_queue/') self.assertEqual(response.status_code, status.HTTP_200_OK) self.assertEqual(2, response.data.get('count')) def test_api_scannable_uri_get_next_download_url(self): response = self.client.get('/api/scan_queue/get_next_download_url/') + self.assertEqual(response.status_code, status.HTTP_401_UNAUTHORIZED) + + response = self.csrf_client.get('/api/scan_queue/get_next_download_url/') self.assertEqual(response.status_code, status.HTTP_200_OK) self.assertEqual(response.data.get('scannable_uri_uuid'), self.scannable_uri1.uuid) self.assertEqual(response.data.get('download_url'), self.scannable_uri1.uri) @@ -88,6 +94,12 @@ def test_api_scannable_uri_update_status(self): 'scan_project_url': 'scan_project_url', } response = self.client.post('/api/scan_queue/update_status/', data=data) + self.assertEqual(response.status_code, status.HTTP_401_UNAUTHORIZED) + + self.scannable_uri1.refresh_from_db() + self.assertEqual(ScannableURI.SCAN_NEW, self.scannable_uri1.scan_status) + + response = self.csrf_client.post('/api/scan_queue/update_status/', data=data) self.assertEqual(response.status_code, status.HTTP_200_OK) self.scannable_uri1.refresh_from_db() self.assertEqual(ScannableURI.SCAN_IN_PROGRESS, self.scannable_uri1.scan_status) diff --git a/packagedb/models.py b/packagedb/models.py index a4d8e7ac..7ad6629c 100644 --- a/packagedb/models.py +++ b/packagedb/models.py @@ -628,7 +628,7 @@ def reindex(self): """ from minecode.model_utils import add_package_to_scan_queue - add_package_to_scan_queue(self, reindex_uri=True, priority=100) + add_package_to_scan_queue(self, pipelines=['scan_and_fingerprint_package'], reindex_uri=True, priority=100) def update_fields(self, save=False, **values_by_fields): """ From 86b0885857dabb0f2c65ab2b07dd02d8fbf05fa5 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Wed, 6 Mar 2024 17:04:13 -0800 Subject: [PATCH 15/31] Update ScannableURI migrations #49 #285 Signed-off-by: Jono Yang --- ...minecode_sc_scan_st_d6a459_idx_and_more.py | 34 ++++++++++++------- ...> 0035_create_scan_queue_workers_group.py} | 2 +- ...ame_rescan_uri_scannableuri_reindex_uri.py | 18 ---------- .../migrations/0036_scannableuri_pipelines.py | 23 ------------- minecode/models.py | 17 ---------- 5 files changed, 22 insertions(+), 72 deletions(-) rename minecode/migrations/{0037_auto_20240304_1933.py => 0035_create_scan_queue_workers_group.py} (95%) delete mode 100644 minecode/migrations/0035_rename_rescan_uri_scannableuri_reindex_uri.py delete mode 100644 minecode/migrations/0036_scannableuri_pipelines.py diff --git a/minecode/migrations/0032_remove_scannableuri_minecode_sc_scan_st_d6a459_idx_and_more.py b/minecode/migrations/0032_remove_scannableuri_minecode_sc_scan_st_d6a459_idx_and_more.py index 7c31cc30..937adaab 100644 --- a/minecode/migrations/0032_remove_scannableuri_minecode_sc_scan_st_d6a459_idx_and_more.py +++ b/minecode/migrations/0032_remove_scannableuri_minecode_sc_scan_st_d6a459_idx_and_more.py @@ -1,7 +1,7 @@ -# Generated by Django 5.0.1 on 2024-02-15 23:16 +# Generated by Django 5.0.1 on 2024-03-07 00:59 -from django.db import migrations, models import uuid +from django.db import migrations, models class Migration(migrations.Migration): @@ -16,28 +16,32 @@ class Migration(migrations.Migration): model_name="scannableuri", name="minecode_sc_scan_st_d6a459_idx", ), + migrations.RenameField( + model_name="scannableuri", + old_name="rescan_uri", + new_name="reindex_uri", + ), migrations.AlterUniqueTogether( name="scannableuri", unique_together=set(), ), migrations.AddField( model_name="scannableuri", - name="scan_date", - field=models.DateTimeField( + name="pipelines", + field=models.JSONField( blank=True, - db_index=True, - help_text="Timestamp set to the date when a scan was taken by a worker", - null=True, + default=list, + editable=False, + help_text="A list of ScanCode.io pipeline names to be run for this scan", ), ), migrations.AddField( model_name="scannableuri", - name="scan_project_url", - field=models.CharField( + name="scan_date", + field=models.DateTimeField( blank=True, db_index=True, - help_text="URL to scan project for this Package", - max_length=2048, + help_text="Timestamp set to the date when a scan was taken by a worker", null=True, ), ), @@ -49,10 +53,14 @@ class Migration(migrations.Migration): migrations.AddIndex( model_name="scannableuri", index=models.Index( - fields=["scan_status", "scan_date", "last_status_poll_date"], - name="minecode_sc_scan_st_5e04d7_idx", + fields=["scan_status", "scan_date"], + name="minecode_sc_scan_st_baab37_idx", ), ), + migrations.RemoveField( + model_name="scannableuri", + name="last_status_poll_date", + ), migrations.RemoveField( model_name="scannableuri", name="scan_request_date", diff --git a/minecode/migrations/0037_auto_20240304_1933.py b/minecode/migrations/0035_create_scan_queue_workers_group.py similarity index 95% rename from minecode/migrations/0037_auto_20240304_1933.py rename to minecode/migrations/0035_create_scan_queue_workers_group.py index 1b93ee38..786e6017 100644 --- a/minecode/migrations/0037_auto_20240304_1933.py +++ b/minecode/migrations/0035_create_scan_queue_workers_group.py @@ -31,7 +31,7 @@ def create_scan_queue_workers_group_and_permissions(apps, schema_editor): class Migration(migrations.Migration): dependencies = [ - ("minecode", "0036_scannableuri_pipelines"), + ("minecode", "0034_scannableuri_alter_uuid_field"), ] operations = [ diff --git a/minecode/migrations/0035_rename_rescan_uri_scannableuri_reindex_uri.py b/minecode/migrations/0035_rename_rescan_uri_scannableuri_reindex_uri.py deleted file mode 100644 index 7719561d..00000000 --- a/minecode/migrations/0035_rename_rescan_uri_scannableuri_reindex_uri.py +++ /dev/null @@ -1,18 +0,0 @@ -# Generated by Django 5.0.1 on 2024-02-21 02:12 - -from django.db import migrations - - -class Migration(migrations.Migration): - - dependencies = [ - ("minecode", "0034_scannableuri_alter_uuid_field"), - ] - - operations = [ - migrations.RenameField( - model_name="scannableuri", - old_name="rescan_uri", - new_name="reindex_uri", - ), - ] diff --git a/minecode/migrations/0036_scannableuri_pipelines.py b/minecode/migrations/0036_scannableuri_pipelines.py deleted file mode 100644 index 9d50ceb3..00000000 --- a/minecode/migrations/0036_scannableuri_pipelines.py +++ /dev/null @@ -1,23 +0,0 @@ -# Generated by Django 5.0.1 on 2024-03-04 19:18 - -from django.db import migrations, models - - -class Migration(migrations.Migration): - - dependencies = [ - ("minecode", "0035_rename_rescan_uri_scannableuri_reindex_uri"), - ] - - operations = [ - migrations.AddField( - model_name="scannableuri", - name="pipelines", - field=models.JSONField( - blank=True, - default=list, - editable=False, - help_text="A list of ScanCode.io pipeline names to be run for this scan", - ), - ), - ] diff --git a/minecode/models.py b/minecode/models.py index deaba41b..5050e74a 100644 --- a/minecode/models.py +++ b/minecode/models.py @@ -662,22 +662,6 @@ class ScannableURI(BaseURI): help_text='Timestamp set to the date when a scan was taken by a worker', ) - last_status_poll_date = models.DateTimeField( - null=True, - blank=True, - db_index=True, - help_text='Timestamp set to the date of the last status poll. ' - 'Used to track the scan polling.', - ) - - scan_project_url = models.CharField( - max_length=2048, - db_index=True, - null=True, - blank=True, - help_text='URL to scan project for this Package', - ) - pipelines = models.JSONField( default=list, blank=True, @@ -751,7 +735,6 @@ class Meta: fields=[ 'scan_status', 'scan_date', - 'last_status_poll_date', ] ), # ordered by for the main queue query e.g. '-priority' From 4169da5bf6b415c3d444d7502a99aab9bf41bf1f Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Wed, 6 Mar 2024 17:38:21 -0800 Subject: [PATCH 16/31] Do not track scan progress from purldb #49 #285 Signed-off-by: Jono Yang --- minecode/api.py | 16 +--------------- minecode/tests/test_api.py | 24 +++++------------------- 2 files changed, 6 insertions(+), 34 deletions(-) diff --git a/minecode/api.py b/minecode/api.py index f4bb6599..78cd96c5 100644 --- a/minecode/api.py +++ b/minecode/api.py @@ -135,21 +135,7 @@ def update_status(self, request, *args, **kwargs): scannable_uri = ScannableURI.objects.get(uuid=scannable_uri_uuid) - if scan_status == 'in progress': - scan_project_url = request.data.get('scan_project_url') - if scan_project_url: - scannable_uri.scan_project_url = scan_project_url - scannable_uri.scan_status = ScannableURI.SCAN_IN_PROGRESS - scannable_uri.save() - msg = { - 'status': f'scan_status updated to {scan_status} for scannable_uri {scannable_uri_uuid}' - } - else: - msg = { - 'status': f'missing scan_project_url when updating scannable_uri {scannable_uri_uuid} scan_status to {scan_status}' - } - - elif scan_status == 'failed': + if scan_status == 'failed': scan_log = request.data.get('scan_log') scannable_uri.scan_error = scan_log scannable_uri.scan_status = ScannableURI.SCAN_FAILED diff --git a/minecode/tests/test_api.py b/minecode/tests/test_api.py index b379c525..87d3f73e 100644 --- a/minecode/tests/test_api.py +++ b/minecode/tests/test_api.py @@ -75,12 +75,12 @@ def test_api_scannable_uri_get_next_download_url(self): self.assertEqual(response.data.get('scannable_uri_uuid'), self.scannable_uri1.uuid) self.assertEqual(response.data.get('download_url'), self.scannable_uri1.uri) - response = self.client.get('/api/scan_queue/get_next_download_url/') + response = self.csrf_client.get('/api/scan_queue/get_next_download_url/') self.assertEqual(response.status_code, status.HTTP_200_OK) self.assertEqual(response.data.get('scannable_uri_uuid'), self.scannable_uri2.uuid) self.assertEqual(response.data.get('download_url'), self.scannable_uri2.uri) - response = self.client.get('/api/scan_queue/get_next_download_url/') + response = self.csrf_client.get('/api/scan_queue/get_next_download_url/') self.assertEqual(response.status_code, status.HTTP_200_OK) self.assertEqual(response.data.get('scannable_uri_uuid'), '') self.assertEqual(response.data.get('download_url'), '') @@ -88,29 +88,15 @@ def test_api_scannable_uri_get_next_download_url(self): def test_api_scannable_uri_update_status(self): self.assertEqual(ScannableURI.SCAN_NEW, self.scannable_uri1.scan_status) - data = { - "scannable_uri_uuid": self.scannable_uri1.uuid, - "scan_status": 'in progress', - 'scan_project_url': 'scan_project_url', - } - response = self.client.post('/api/scan_queue/update_status/', data=data) + response = self.client.post('/api/scan_queue/update_status/') self.assertEqual(response.status_code, status.HTTP_401_UNAUTHORIZED) - self.scannable_uri1.refresh_from_db() - self.assertEqual(ScannableURI.SCAN_NEW, self.scannable_uri1.scan_status) - - response = self.csrf_client.post('/api/scan_queue/update_status/', data=data) - self.assertEqual(response.status_code, status.HTTP_200_OK) - self.scannable_uri1.refresh_from_db() - self.assertEqual(ScannableURI.SCAN_IN_PROGRESS, self.scannable_uri1.scan_status) - self.assertEqual('scan_project_url', self.scannable_uri1.scan_project_url) - data = { "scannable_uri_uuid": self.scannable_uri1.uuid, "scan_status": 'failed', 'scan_log': 'scan_log', } - response = self.client.post('/api/scan_queue/update_status/', data=data) + response = self.csrf_client.post('/api/scan_queue/update_status/', data=data) self.assertEqual(response.status_code, status.HTTP_200_OK) self.scannable_uri1.refresh_from_db() self.assertEqual(ScannableURI.SCAN_FAILED, self.scannable_uri1.scan_status) @@ -124,7 +110,7 @@ def test_api_scannable_uri_update_status(self): "scan_status": 'scanned', 'scan_file': f, } - response = self.client.post('/api/scan_queue/update_status/', data=data) + response = self.csrf_client.post('/api/scan_queue/update_status/', data=data) self.assertEqual(response.status_code, status.HTTP_200_OK) self.scannable_uri1.refresh_from_db() self.assertEqual(ScannableURI.SCAN_INDEXED, self.scannable_uri1.scan_status) From 38c58057649731a3554a10c99548390266834ad5 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Thu, 7 Mar 2024 18:36:37 -0800 Subject: [PATCH 17/31] Share code between user creation commands #49 #285 Signed-off-by: Jono Yang --- .../commands/create-scan-queue-worker-user.py | 102 +++--------------- minecode/management/commands/create-user.py | 37 +++++++ .../management/user_creation.py | 16 ++- 3 files changed, 63 insertions(+), 92 deletions(-) create mode 100644 minecode/management/commands/create-user.py rename packagedb/management/commands/create-user.py => minecode/management/user_creation.py (91%) diff --git a/minecode/management/commands/create-scan-queue-worker-user.py b/minecode/management/commands/create-scan-queue-worker-user.py index d3996fd2..e7409730 100644 --- a/minecode/management/commands/create-scan-queue-worker-user.py +++ b/minecode/management/commands/create-scan-queue-worker-user.py @@ -20,100 +20,24 @@ # ScanCode.io is a free software code scanning tool from nexB Inc. and others. # Visit https://github.com/nexB/scancode.io for support and download. -import getpass +from django.contrib.auth.models import Group +from minecode.management.user_creation import CreateUserCommand -from django.contrib.auth.models import Group, Permission -from django.contrib.auth import get_user_model -from django.contrib.auth.password_validation import validate_password -from django.core import exceptions -from django.core.management.base import BaseCommand -from django.core.management.base import CommandError -from rest_framework.authtoken.models import Token - - -class Command(BaseCommand): - help = "Create a user and generate an API key for authentication." - requires_migrations_checks = True - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.UserModel = get_user_model() - self.username_field = self.UserModel._meta.get_field( - self.UserModel.USERNAME_FIELD - ) - - def add_arguments(self, parser): - parser.add_argument("username", help="Specifies the username for the user.") - parser.add_argument( - "--no-input", - action="store_false", - dest="interactive", - help="Do not prompt the user for input of any kind.", - ) +class Command(CreateUserCommand): + help = "Create a user and generate an API key for a scan queue worker" def handle(self, *args, **options): username = options["username"] - - error_msg = self._validate_username(username) - if error_msg: - raise CommandError(error_msg) - - password = None - if options["interactive"]: - password = self.get_password_from_stdin(username) - - user = self.UserModel._default_manager.create_user(username, password=password) + interactive = options["interactive"] + verbosity = options["verbosity"] + user = self.create_user( + username=username, + interactive=interactive, + verbosity=verbosity + ) # Add user to `scan_queue_workers` group scan_queue_workers_group, _ = Group.objects.get_or_create(name='scan_queue_workers') scan_queue_workers_group.user_set.add(user) - token, _ = Token._default_manager.get_or_create(user=user) - - if options["verbosity"] >= 1: - msg = f"User {username} created with API key: {token.key}" - self.stdout.write(msg, self.style.SUCCESS) - - def get_password_from_stdin(self, username): - # Validators, such as UserAttributeSimilarityValidator, depends on other user's - # fields data for password validation. - fake_user_data = { - self.UserModel.USERNAME_FIELD: username, - } - - password = None - while password is None: - password1 = getpass.getpass() - password2 = getpass.getpass("Password (again): ") - if password1 != password2: - self.stderr.write("Error: Your passwords didn't match.") - continue - if password1.strip() == "": - self.stderr.write("Error: Blank passwords aren't allowed.") - continue - try: - validate_password(password2, self.UserModel(**fake_user_data)) - except exceptions.ValidationError as err: - self.stderr.write("\n".join(err.messages)) - response = input( - "Bypass password validation and create user anyway? [y/N]: " - ) - if response.lower() != "y": - continue - password = password1 - - return password - - def _validate_username(self, username): - """Validate username. If invalid, return a string error message.""" - if self.username_field.unique: - try: - self.UserModel._default_manager.get_by_natural_key(username) - except self.UserModel.DoesNotExist: - pass - else: - return "Error: That username is already taken." - - try: - self.username_field.clean(username, None) - except exceptions.ValidationError as e: - return "; ".join(e.messages) + msg = f"User {username} added to `scan_queue_workers` group" + self.stdout.write(msg, self.style.SUCCESS) diff --git a/minecode/management/commands/create-user.py b/minecode/management/commands/create-user.py new file mode 100644 index 00000000..794b9b65 --- /dev/null +++ b/minecode/management/commands/create-user.py @@ -0,0 +1,37 @@ +# SPDX-License-Identifier: Apache-2.0 +# +# http://nexb.com and https://github.com/nexB/scancode.io +# The ScanCode.io software is licensed under the Apache License version 2.0. +# Data generated with ScanCode.io is provided as-is without warranties. +# ScanCode is a trademark of nexB Inc. +# +# You may not use this software except in compliance with the License. +# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. +# +# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, either express or implied. No content created from +# ScanCode.io should be considered or used as legal advice. Consult an Attorney +# for any legal advice. +# +# ScanCode.io is a free software code scanning tool from nexB Inc. and others. +# Visit https://github.com/nexB/scancode.io for support and download. + +from minecode.management.user_creation import CreateUserCommand + + +class Command(CreateUserCommand): + help = "Create a user and generate an API key for a scan queue worker" + + def handle(self, *args, **options): + username = options["username"] + interactive = options["interactive"] + verbosity = options["verbosity"] + self.create_user( + username=username, + interactive=interactive, + verbosity=verbosity + ) diff --git a/packagedb/management/commands/create-user.py b/minecode/management/user_creation.py similarity index 91% rename from packagedb/management/commands/create-user.py rename to minecode/management/user_creation.py index 64d25cdc..7f63b75a 100644 --- a/packagedb/management/commands/create-user.py +++ b/minecode/management/user_creation.py @@ -31,7 +31,7 @@ from rest_framework.authtoken.models import Token -class Command(BaseCommand): +class CreateUserCommand(BaseCommand): help = "Create a user and generate an API key for authentication." requires_migrations_checks = True @@ -53,22 +53,32 @@ def add_arguments(self, parser): def handle(self, *args, **options): username = options["username"] + interactive = options["interactive"] + verbosity = options["verbosity"] + self.create_user( + username=username, + interactive=interactive, + verbosity=verbosity + ) + def create_user(self, username, interactive, verbosity): error_msg = self._validate_username(username) if error_msg: raise CommandError(error_msg) password = None - if options["interactive"]: + if interactive: password = self.get_password_from_stdin(username) user = self.UserModel._default_manager.create_user(username, password=password) token, _ = Token._default_manager.get_or_create(user=user) - if options["verbosity"] >= 1: + if verbosity >= 1: msg = f"User {username} created with API key: {token.key}" self.stdout.write(msg, self.style.SUCCESS) + return user + def get_password_from_stdin(self, username): # Validators, such as UserAttributeSimilarityValidator, depends on other user's # fields data for password validation. From d46c4b480b27324baef8d8286c83877eea7f9741 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Thu, 14 Mar 2024 19:29:11 -0700 Subject: [PATCH 18/31] Set default pipelines to be run Signed-off-by: Jono Yang --- minecode/collectors/maven.py | 2 +- minecode/collectors/npm.py | 2 +- minecode/model_utils.py | 13 +++++++++++-- minecode/visitors/generic.py | 2 +- packagedb/find_source_repo.py | 2 +- packagedb/models.py | 2 +- 6 files changed, 16 insertions(+), 7 deletions(-) diff --git a/minecode/collectors/maven.py b/minecode/collectors/maven.py index a4f78422..ceb0c17a 100644 --- a/minecode/collectors/maven.py +++ b/minecode/collectors/maven.py @@ -251,7 +251,7 @@ def map_maven_package(package_url, package_content): # Submit package for scanning if db_package: - add_package_to_scan_queue(db_package, pipelines=['scan_and_fingerprint_package']) + add_package_to_scan_queue(db_package) return db_package, error diff --git a/minecode/collectors/npm.py b/minecode/collectors/npm.py index 5cf229e5..818e645f 100644 --- a/minecode/collectors/npm.py +++ b/minecode/collectors/npm.py @@ -59,7 +59,7 @@ def map_npm_package(package_url): # Submit package for scanning if db_package: - add_package_to_scan_queue(db_package, pipelines=['scan_and_fingerprint_package']) + add_package_to_scan_queue(db_package) return error diff --git a/minecode/model_utils.py b/minecode/model_utils.py index 0f0848f0..caa542d2 100644 --- a/minecode/model_utils.py +++ b/minecode/model_utils.py @@ -24,9 +24,18 @@ logger.setLevel(logging.INFO) -def add_package_to_scan_queue(package, pipelines, reindex_uri=False, priority=0): +# These are the list of default pipelines to run when we scan a Package for +# indexing +DEFAULT_PIPELINES = ( + 'scan_and_fingerprint_package', +) + + +def add_package_to_scan_queue(package, pipelines=DEFAULT_PIPELINES, reindex_uri=False, priority=0): """ - Add a Package `package` to the scan queue + Add a Package `package` to the scan queue to run the list of provided `pipelines` + + If `reindex_uri` is True, force rescanning of the package """ if not pipelines: raise Exception('pipelines required to add package to scan queue') diff --git a/minecode/visitors/generic.py b/minecode/visitors/generic.py index 5ff8b6b6..628b3ba8 100644 --- a/minecode/visitors/generic.py +++ b/minecode/visitors/generic.py @@ -51,7 +51,7 @@ def map_generic_package(package_url): # Submit package for scanning if db_package: - add_package_to_scan_queue(db_package, pipelines=['scan_and_fingerprint_package']) + add_package_to_scan_queue(db_package) return error diff --git a/packagedb/find_source_repo.py b/packagedb/find_source_repo.py index a9d09a44..2ec7fe18 100644 --- a/packagedb/find_source_repo.py +++ b/packagedb/find_source_repo.py @@ -179,7 +179,7 @@ def add_source_repo_to_package_set( for package_set in package_sets.all(): package_set.add_to_package_set(source_repo_package) if created: - add_package_to_scan_queue(source_repo_package, pipelines=['scan_and_fingerprint_package']) + add_package_to_scan_queue(source_repo_package) logger.info(f"\tCreated source repo package {source_purl} for {purl}") else: logger.info( diff --git a/packagedb/models.py b/packagedb/models.py index 7ad6629c..a4d8e7ac 100644 --- a/packagedb/models.py +++ b/packagedb/models.py @@ -628,7 +628,7 @@ def reindex(self): """ from minecode.model_utils import add_package_to_scan_queue - add_package_to_scan_queue(self, pipelines=['scan_and_fingerprint_package'], reindex_uri=True, priority=100) + add_package_to_scan_queue(self, reindex_uri=True, priority=100) def update_fields(self, save=False, **values_by_fields): """ From ef57b49ef882cee6283a4d8766bb23db65500a71 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Thu, 14 Mar 2024 21:48:03 -0700 Subject: [PATCH 19/31] Get scan summary data and use it in index_package #49 Signed-off-by: Jono Yang --- minecode/api.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/minecode/api.py b/minecode/api.py index 78cd96c5..7a1174c2 100644 --- a/minecode/api.py +++ b/minecode/api.py @@ -20,7 +20,7 @@ # But importing the mappers and visitors module triggers routes registration from minecode import visitors # NOQA from minecode import priority_router -from minecode.management.indexing import index_package_files +from minecode.management.indexing import index_package from minecode.models import PriorityResourceURI, ResourceURI, ScannableURI from minecode.permissions import IsScanQueueWorkerAPIUser @@ -147,10 +147,12 @@ def update_status(self, request, *args, **kwargs): elif scan_status == 'scanned': scan_file = request.data.get('scan_file') + summary_file = request.data.get('summary_file') scannable_uri.scan_status = ScannableURI.SCAN_COMPLETED package = scannable_uri.package - scan_data= json.load(scan_file) - indexing_errors = index_package_files(package, scan_data, reindex=True) + scan_data = json.load(scan_file) + summary_data = json.load(summary_file) + indexing_errors = index_package(package, scan_data, summary_data, reindex=True) if indexing_errors: scannable_uri.scan_status = ScannableURI.SCAN_INDEX_FAILED scannable_uri.index_error = indexing_errors From ccdee328241c6955b05999e7b669ae751f9aeac6 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Fri, 15 Mar 2024 19:12:11 -0700 Subject: [PATCH 20/31] Fix Package indexing issues #49 * Do not overwrite spdx license fields as these are generated * set Package fields to be nullable Signed-off-by: Jono Yang --- minecode/api.py | 10 ++--- minecode/management/indexing.py | 6 --- ..._type_alter_resource_mime_type_and_more.py | 43 +++++++++++++++++++ packagedb/models.py | 3 ++ 4 files changed, 51 insertions(+), 11 deletions(-) create mode 100644 packagedb/migrations/0084_alter_resource_file_type_alter_resource_mime_type_and_more.py diff --git a/minecode/api.py b/minecode/api.py index 7a1174c2..1a4dd45a 100644 --- a/minecode/api.py +++ b/minecode/api.py @@ -146,13 +146,13 @@ def update_status(self, request, *args, **kwargs): } elif scan_status == 'scanned': - scan_file = request.data.get('scan_file') - summary_file = request.data.get('summary_file') + scan_results_file = request.data.get('scan_results_file') + scan_summary_file = request.data.get('scan_summary_file') scannable_uri.scan_status = ScannableURI.SCAN_COMPLETED package = scannable_uri.package - scan_data = json.load(scan_file) - summary_data = json.load(summary_file) - indexing_errors = index_package(package, scan_data, summary_data, reindex=True) + scan_data = json.load(scan_results_file) + summary_data = json.load(scan_summary_file) + indexing_errors = index_package(scannable_uri, package, scan_data, summary_data, reindex=True) if indexing_errors: scannable_uri.scan_status = ScannableURI.SCAN_INDEX_FAILED scannable_uri.index_error = indexing_errors diff --git a/minecode/management/indexing.py b/minecode/management/indexing.py index 45e15e8c..7902b10d 100644 --- a/minecode/management/indexing.py +++ b/minecode/management/indexing.py @@ -9,7 +9,6 @@ import sys from minecode.model_utils import merge_or_create_resource from packagedcode.utils import combine_expressions -from licensedcode.cache import build_spdx_license_expression import traceback from minecode.models import ScannableURI @@ -78,10 +77,6 @@ def index_package(scannable_uri, package, scan_data, summary_data, reindex=False indexing_errors = index_package_files(package, scan_data, reindex=reindex) scan_index_errors.extend(indexing_errors) declared_license_expression = summary_data.get('declared_license_expression') - declared_license_expression_spdx = None - if declared_license_expression: - declared_license_expression_spdx = build_spdx_license_expression(declared_license_expression) - other_license_expressions = summary_data.get('other_license_expressions', []) other_license_expressions = [l['value'] for l in other_license_expressions if l['value']] other_license_expression = combine_expressions(other_license_expressions) @@ -94,7 +89,6 @@ def index_package(scannable_uri, package, scan_data, summary_data, reindex=False values_by_updateable_fields = { 'summary': summary_data, 'declared_license_expression': declared_license_expression, - 'declared_license_expression_spdx': declared_license_expression_spdx, 'other_license_expression': other_license_expression, 'copyright': copyright, } diff --git a/packagedb/migrations/0084_alter_resource_file_type_alter_resource_mime_type_and_more.py b/packagedb/migrations/0084_alter_resource_file_type_alter_resource_mime_type_and_more.py new file mode 100644 index 00000000..952ae294 --- /dev/null +++ b/packagedb/migrations/0084_alter_resource_file_type_alter_resource_mime_type_and_more.py @@ -0,0 +1,43 @@ +# Generated by Django 5.0.1 on 2024-03-16 00:44 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("packagedb", "0083_delete_apiuser"), + ] + + operations = [ + migrations.AlterField( + model_name="resource", + name="file_type", + field=models.CharField( + blank=True, + help_text="Descriptive file type for this resource.", + max_length=1024, + null=True, + ), + ), + migrations.AlterField( + model_name="resource", + name="mime_type", + field=models.CharField( + blank=True, + help_text="MIME type (aka. media type) for this resource. See https://en.wikipedia.org/wiki/Media_type", + max_length=100, + null=True, + ), + ), + migrations.AlterField( + model_name="resource", + name="programming_language", + field=models.CharField( + blank=True, + help_text="Programming language of this resource if this is a code file.", + max_length=50, + null=True, + ), + ), + ] diff --git a/packagedb/models.py b/packagedb/models.py index a4d8e7ac..18d0ea7b 100644 --- a/packagedb/models.py +++ b/packagedb/models.py @@ -966,6 +966,7 @@ class AbstractResource(models.Model): mime_type = models.CharField( max_length=100, blank=True, + null=True, help_text=_( "MIME type (aka. media type) for this resource. " "See https://en.wikipedia.org/wiki/Media_type" @@ -975,12 +976,14 @@ class AbstractResource(models.Model): file_type = models.CharField( max_length=1024, blank=True, + null=True, help_text=_("Descriptive file type for this resource."), ) programming_language = models.CharField( max_length=50, blank=True, + null=True, help_text=_("Programming language of this resource if this is a code file."), ) From 4921a9af924d822389b980e08bc34eddd6793e70 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Sat, 16 Mar 2024 00:55:27 -0700 Subject: [PATCH 21/31] Update package checksums using project extra data #49 Signed-off-by: Jono Yang --- minecode/api.py | 11 +++++++++- minecode/management/indexing.py | 39 ++++++++++----------------------- 2 files changed, 22 insertions(+), 28 deletions(-) diff --git a/minecode/api.py b/minecode/api.py index 1a4dd45a..fa66150b 100644 --- a/minecode/api.py +++ b/minecode/api.py @@ -148,11 +148,20 @@ def update_status(self, request, *args, **kwargs): elif scan_status == 'scanned': scan_results_file = request.data.get('scan_results_file') scan_summary_file = request.data.get('scan_summary_file') + project_extra_data = request.data.get('project_extra_data') scannable_uri.scan_status = ScannableURI.SCAN_COMPLETED package = scannable_uri.package scan_data = json.load(scan_results_file) summary_data = json.load(scan_summary_file) - indexing_errors = index_package(scannable_uri, package, scan_data, summary_data, reindex=True) + project_extra_data = json.loads(project_extra_data) + indexing_errors = index_package( + scannable_uri, + package, + scan_data, + summary_data, + project_extra_data, + reindex=True + ) if indexing_errors: scannable_uri.scan_status = ScannableURI.SCAN_INDEX_FAILED scannable_uri.index_error = indexing_errors diff --git a/minecode/management/indexing.py b/minecode/management/indexing.py index 7902b10d..2e7c1d9e 100644 --- a/minecode/management/indexing.py +++ b/minecode/management/indexing.py @@ -71,7 +71,7 @@ def index_package_files(package, scan_data, reindex=False): return scan_index_errors -def index_package(scannable_uri, package, scan_data, summary_data, reindex=False): +def index_package(scannable_uri, package, scan_data, summary_data, project_extra_data, reindex=False): scan_index_errors = [] try: indexing_errors = index_package_files(package, scan_data, reindex=reindex) @@ -86,38 +86,21 @@ def index_package(scannable_uri, package, scan_data, summary_data, reindex=False if declared_holder: copyright = f'Copyright (c) {declared_holder}' + checksums_and_size_by_field = { + k: v + for k, v in project_extra_data.items() + if k in [ + 'md5','sha1', 'size', 'sha256', 'sha512', 'filename' + ] + } values_by_updateable_fields = { 'summary': summary_data, 'declared_license_expression': declared_license_expression, 'other_license_expression': other_license_expression, 'copyright': copyright, + **checksums_and_size_by_field } - - updated_fields = [] - for field, value in values_by_updateable_fields.items(): - p_val = getattr(package, field) - if ( - (not p_val and value) - or reindex - ): - setattr(package, field, value) - entry = dict( - field=field, - old_value=p_val, - new_value=value, - ) - updated_fields.append(entry) - - if updated_fields: - data = { - 'updated_fields': updated_fields, - } - package.append_to_history( - 'Package field values have been updated.', - data=data, - save=True, - ) - + package.update_fields(save=True, **values_by_updateable_fields) scannable_uri.scan_status = ScannableURI.SCAN_INDEXED except Exception as e: traceback_message = traceback.format_exc() @@ -125,5 +108,7 @@ def index_package(scannable_uri, package, scan_data, summary_data, reindex=False # TODO: We should rerun the specific indexers that have failed if scan_index_errors: error_message += '\n'.join(scan_index_errors) + logger.error(error_message) scannable_uri.index_error = error_message scannable_uri.scan_status = ScannableURI.SCAN_INDEX_FAILED + scannable_uri.save() From 1192879a464df67e1059012305dbec6bce560bb3 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Sat, 16 Mar 2024 15:37:05 -0700 Subject: [PATCH 22/31] Update update_status error responses #49 * Test to see if package data can be updated from indexing Signed-off-by: Jono Yang --- minecode/api.py | 23 ++++++++++-- minecode/tests/test_api.py | 72 ++++++++++++++++++++++++++++++++------ packagedb/models.py | 4 +-- 3 files changed, 82 insertions(+), 17 deletions(-) diff --git a/minecode/api.py b/minecode/api.py index fa66150b..362e3a8f 100644 --- a/minecode/api.py +++ b/minecode/api.py @@ -8,6 +8,7 @@ # import json +import uuid from django.db import transaction from django.utils import timezone @@ -43,6 +44,14 @@ class Meta: fields = '__all__' +def validate_uuid(uuid_string): + try: + val = uuid.UUID(uuid_string) + except ValueError: + return False + return str(val).lower() == uuid_string.lower() + + class PriorityResourceURIViewSet(viewsets.ModelViewSet): queryset = PriorityResourceURI.objects.all() serializer_class = PriorityResourceURISerializer @@ -131,7 +140,13 @@ def update_status(self, request, *args, **kwargs): response = { 'error': 'missing scannable_uri_uuid' } - return Response(response) + return Response(response, status=status.HTTP_400_BAD_REQUEST) + + if not validate_uuid(scannable_uri_uuid): + response = { + 'error': f'invalid scannable_uri_uuid: {scannable_uri_uuid}' + } + return Response(response, status=status.HTTP_400_BAD_REQUEST) scannable_uri = ScannableURI.objects.get(uuid=scannable_uri_uuid) @@ -166,8 +181,9 @@ def update_status(self, request, *args, **kwargs): scannable_uri.scan_status = ScannableURI.SCAN_INDEX_FAILED scannable_uri.index_error = indexing_errors msg = { - 'status': f'scan index failed for scannable uri {scannable_uri_uuid}' + 'error': f'scan index failed for scannable uri {scannable_uri_uuid}' } + return Response(msg, status=status.HTTP_400_BAD_REQUEST) else: scannable_uri.scan_status = ScannableURI.SCAN_INDEXED msg = { @@ -178,7 +194,8 @@ def update_status(self, request, *args, **kwargs): else: msg = { - 'status': f'invalid scan_status: {scan_status}' + 'error': f'invalid scan_status: {scan_status}' } + return Response(msg, status=status.HTTP_400_BAD_REQUEST) return Response(msg) diff --git a/minecode/tests/test_api.py b/minecode/tests/test_api.py index 87d3f73e..701953f5 100644 --- a/minecode/tests/test_api.py +++ b/minecode/tests/test_api.py @@ -7,6 +7,7 @@ # See https://aboutcode.org for more information about nexB OSS projects. # +import json import os from django.contrib.auth.models import Group, User @@ -102,17 +103,66 @@ def test_api_scannable_uri_update_status(self): self.assertEqual(ScannableURI.SCAN_FAILED, self.scannable_uri1.scan_status) self.assertEqual('scan_log', self.scannable_uri1.scan_error) + self.assertFalse(self.package2.md5) + self.assertFalse(self.package2.sha1) + self.assertFalse(self.package2.sha256) + self.assertFalse(self.package2.sha512) + self.assertFalse(self.package2.size) + self.assertFalse(self.package2.declared_license_expression) + self.assertFalse(self.package2.copyright) self.assertEqual(0, Resource.objects.all().count()) - scan_file = self.get_test_loc('scancodeio/get_scan_data.json') - with open(scan_file) as f: - data = { - "scannable_uri_uuid": self.scannable_uri1.uuid, - "scan_status": 'scanned', - 'scan_file': f, - } - response = self.csrf_client.post('/api/scan_queue/update_status/', data=data) + scan_file_location = self.get_test_loc('scancodeio/get_scan_data.json') + summary_file_location = self.get_test_loc('scancodeio/scan_summary_response.json') + project_extra_data = { + 'md5': 'md5', + 'sha1': 'sha1', + 'sha256': 'sha256', + 'sha512': 'sha512', + 'size': 100, + } + with open(scan_file_location) as scan_file: + with open(summary_file_location) as summary_file: + data = { + "scannable_uri_uuid": self.scannable_uri2.uuid, + "scan_status": 'scanned', + 'project_extra_data': json.dumps(project_extra_data), + 'scan_results_file': scan_file, + 'scan_summary_file': summary_file, + } + response = self.csrf_client.post('/api/scan_queue/update_status/', data=data) self.assertEqual(response.status_code, status.HTTP_200_OK) - self.scannable_uri1.refresh_from_db() - self.assertEqual(ScannableURI.SCAN_INDEXED, self.scannable_uri1.scan_status) - self.assertEqual('scan_log', self.scannable_uri1.scan_error) + self.scannable_uri2.refresh_from_db() + self.assertEqual(ScannableURI.SCAN_INDEXED, self.scannable_uri2.scan_status) + self.package2.refresh_from_db() + self.assertEqual('md5', self.package2.md5) + self.assertEqual('sha1', self.package2.sha1) + self.assertEqual('sha256', self.package2.sha256) + self.assertEqual('sha512', self.package2.sha512) + self.assertEqual(100, self.package2.size) + self.assertEqual('apache-2.0', self.package2.declared_license_expression) + self.assertEqual('Copyright (c) Apache Software Foundation', self.package2.copyright) + self.assertFalse(self.scannable_uri2.scan_error) self.assertEqual(64, Resource.objects.all().count()) + + data = {} + response = self.csrf_client.post('/api/scan_queue/update_status/', data=data) + expected_response = {'error': 'missing scannable_uri_uuid'} + self.assertEqual(expected_response, response.data) + + data = { + 'scannable_uri_uuid': self.scannable_uri2.uuid, + 'scan_status': 'invalid' + } + response = self.csrf_client.post('/api/scan_queue/update_status/', data=data) + expected_response = {'error': 'invalid scan_status: invalid'} + self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) + self.assertEqual(expected_response, response.data) + + data = { + 'scannable_uri_uuid': 'asdf', + 'scan_status': 'scanned' + } + response = self.csrf_client.post('/api/scan_queue/update_status/', data=data) + expected_response = {'error': 'invalid scannable_uri_uuid: asdf'} + self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) + self.assertEqual(expected_response, response.data) diff --git a/packagedb/models.py b/packagedb/models.py index 18d0ea7b..3ce27716 100644 --- a/packagedb/models.py +++ b/packagedb/models.py @@ -794,6 +794,7 @@ def update_fields(self, save=False, **values_by_fields): self.append_to_history( 'Package field values have been updated.', data=data, + save=save, ) updated_fields.append('history') @@ -804,9 +805,6 @@ def update_fields(self, save=False, **values_by_fields): # Deduplicate field names updated_fields = list(set(updated_fields)) - if save: - self.save() - return self, updated_fields From b0b8dddf69209ac3d101f06e9cafeafa1c03b9f1 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Mon, 18 Mar 2024 11:56:51 -0700 Subject: [PATCH 23/31] Do not allow updates of finished ScannableURIs #49 #285 Signed-off-by: Jono Yang --- minecode/api.py | 39 +++++++++++++++++++++++-------- minecode/management/indexing.py | 6 ++++- minecode/models.py | 6 +++++ minecode/tests/test_api.py | 41 ++++++++++++++++++++++++++++++++- 4 files changed, 81 insertions(+), 11 deletions(-) diff --git a/minecode/api.py b/minecode/api.py index 362e3a8f..5ed8a53f 100644 --- a/minecode/api.py +++ b/minecode/api.py @@ -149,6 +149,33 @@ def update_status(self, request, *args, **kwargs): return Response(response, status=status.HTTP_400_BAD_REQUEST) scannable_uri = ScannableURI.objects.get(uuid=scannable_uri_uuid) + scannable_uri_status = ScannableURI.SCAN_STATUSES_BY_CODE.get(scannable_uri.scan_status) + scan_status_code = ScannableURI.SCAN_STATUS_CODES_BY_SCAN_STATUS.get(scan_status) + + if not scan_status_code: + msg = { + 'error': f'invalid scan_status: {scan_status}' + } + return Response(msg, status=status.HTTP_400_BAD_REQUEST) + + if scannable_uri.scan_status in [ + ScannableURI.SCAN_INDEXED, + ScannableURI.SCAN_FAILED, + ScannableURI.SCAN_TIMEOUT, + ScannableURI.SCAN_INDEX_FAILED, + ]: + response = { + 'error': f'cannot update status for scannable_uri {scannable_uri_uuid}: ' + f'scannable_uri has finished with status "{scannable_uri_status}"' + } + return Response(response, status=status.HTTP_400_BAD_REQUEST) + + if scan_status == scannable_uri_status: + response = { + 'error': f'cannot update status for scannable_uri {scannable_uri_uuid}: ' + f'scannable_uri status is already "{scannable_uri_status}"' + } + return Response(response, status=status.HTTP_400_BAD_REQUEST) if scan_status == 'failed': scan_log = request.data.get('scan_log') @@ -157,7 +184,7 @@ def update_status(self, request, *args, **kwargs): scannable_uri.wip_date = None scannable_uri.save() msg = { - 'status': f'updated scannable uri {scannable_uri_uuid} scan_status to {scan_status}' + 'status': f'updated scannable_uri {scannable_uri_uuid} scan_status to {scan_status}' } elif scan_status == 'scanned': @@ -181,21 +208,15 @@ def update_status(self, request, *args, **kwargs): scannable_uri.scan_status = ScannableURI.SCAN_INDEX_FAILED scannable_uri.index_error = indexing_errors msg = { - 'error': f'scan index failed for scannable uri {scannable_uri_uuid}' + 'error': f'scan index failed for scannable_uri {scannable_uri_uuid}' } return Response(msg, status=status.HTTP_400_BAD_REQUEST) else: scannable_uri.scan_status = ScannableURI.SCAN_INDEXED msg = { - 'status': f'scan indexed for scannable uri {scannable_uri_uuid}' + 'status': f'scan indexed for scannable_uri {scannable_uri_uuid}' } scannable_uri.wip_date = None scannable_uri.save() - else: - msg = { - 'error': f'invalid scan_status: {scan_status}' - } - return Response(msg, status=status.HTTP_400_BAD_REQUEST) - return Response(msg) diff --git a/minecode/management/indexing.py b/minecode/management/indexing.py index 2e7c1d9e..593e53d7 100644 --- a/minecode/management/indexing.py +++ b/minecode/management/indexing.py @@ -100,8 +100,12 @@ def index_package(scannable_uri, package, scan_data, summary_data, project_extra 'copyright': copyright, **checksums_and_size_by_field } - package.update_fields(save=True, **values_by_updateable_fields) + _, updated_fields = package.update_fields(save=True, **values_by_updateable_fields) + updated_fields = ', '.join(updated_fields) + message = f'Updated fields for Package {package.purl}: {updated_fields}' + logger.info(message) scannable_uri.scan_status = ScannableURI.SCAN_INDEXED + scannable_uri.save() except Exception as e: traceback_message = traceback.format_exc() error_message = traceback_message + '\n' diff --git a/minecode/models.py b/minecode/models.py index 5050e74a..41f2caa3 100644 --- a/minecode/models.py +++ b/minecode/models.py @@ -691,6 +691,12 @@ class ScannableURI(BaseURI): SCAN_STATUSES_BY_CODE = dict(SCAN_STATUS_CHOICES) + SCAN_STATUS_CODES_BY_SCAN_STATUS = { + status: code + for code, status + in SCAN_STATUS_CHOICES + } + scan_status = models.IntegerField( default=SCAN_NEW, choices=SCAN_STATUS_CHOICES, diff --git a/minecode/tests/test_api.py b/minecode/tests/test_api.py index 701953f5..291d79dd 100644 --- a/minecode/tests/test_api.py +++ b/minecode/tests/test_api.py @@ -57,6 +57,17 @@ def setUp(self): package=self.package2 ) + self.package3 = Package.objects.create( + download_url='https://test-url.com/package3.tar.gz', + type='type3', + name='name3', + version='3.0', + ) + self.scannable_uri3 = ScannableURI.objects.create( + uri='https://test-url.com/package3.tar.gz', + package=self.package3 + ) + self.client = APIClient() def test_api_scannable_uri_list_endpoint(self): @@ -65,7 +76,7 @@ def test_api_scannable_uri_list_endpoint(self): response = self.csrf_client.get('/api/scan_queue/') self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertEqual(2, response.data.get('count')) + self.assertEqual(3, response.data.get('count')) def test_api_scannable_uri_get_next_download_url(self): response = self.client.get('/api/scan_queue/get_next_download_url/') @@ -81,6 +92,11 @@ def test_api_scannable_uri_get_next_download_url(self): self.assertEqual(response.data.get('scannable_uri_uuid'), self.scannable_uri2.uuid) self.assertEqual(response.data.get('download_url'), self.scannable_uri2.uri) + response = self.csrf_client.get('/api/scan_queue/get_next_download_url/') + self.assertEqual(response.status_code, status.HTTP_200_OK) + self.assertEqual(response.data.get('scannable_uri_uuid'), self.scannable_uri3.uuid) + self.assertEqual(response.data.get('download_url'), self.scannable_uri3.uri) + response = self.csrf_client.get('/api/scan_queue/get_next_download_url/') self.assertEqual(response.status_code, status.HTTP_200_OK) self.assertEqual(response.data.get('scannable_uri_uuid'), '') @@ -166,3 +182,26 @@ def test_api_scannable_uri_update_status(self): expected_response = {'error': 'invalid scannable_uri_uuid: asdf'} self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) self.assertEqual(expected_response, response.data) + + def test_api_scannable_uri_update_status_update_finished_scannable_uri(self): + scannable_uri_uuid = self.scannable_uri3.uuid + for scan_status in [ + ScannableURI.SCAN_INDEXED, + ScannableURI.SCAN_FAILED, + ScannableURI.SCAN_TIMEOUT, + ScannableURI.SCAN_INDEX_FAILED, + ]: + self.scannable_uri3.scan_status = scan_status + self.scannable_uri3.save() + data = { + 'scannable_uri_uuid': scannable_uri_uuid, + 'scan_status': 'scanned' + } + response = self.csrf_client.post('/api/scan_queue/update_status/', data=data) + expected_response = { + 'error': 'cannot update status for scannable_uri ' + f'{self.scannable_uri3.uuid}: scannable_uri has finished ' + f'with status "{ScannableURI.SCAN_STATUSES_BY_CODE[scan_status]}"' + } + self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) + self.assertEqual(expected_response, response.data) From ff83b091395c5b940ad9673acf24ee35932c4ec7 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Mon, 18 Mar 2024 12:57:09 -0700 Subject: [PATCH 24/31] Move validate_uuid to utils.py #49 #285 * Add tests for validate_uuid * Test for missing scan_status in update_status Signed-off-by: Jono Yang --- minecode/api.py | 31 +++++++++++++++++++++---------- minecode/tests/test_api.py | 9 +++++++++ minecode/tests/test_utils.py | 13 +++++++++++++ minecode/utils.py | 9 +++++++++ 4 files changed, 52 insertions(+), 10 deletions(-) diff --git a/minecode/api.py b/minecode/api.py index 5ed8a53f..d287cc5a 100644 --- a/minecode/api.py +++ b/minecode/api.py @@ -8,13 +8,13 @@ # import json -import uuid from django.db import transaction from django.utils import timezone from packageurl import PackageURL from rest_framework import serializers, status, viewsets from rest_framework.decorators import action +from rest_framework.permissions import IsAdminUser from rest_framework.response import Response # UnusedImport here! @@ -24,6 +24,7 @@ from minecode.management.indexing import index_package from minecode.models import PriorityResourceURI, ResourceURI, ScannableURI from minecode.permissions import IsScanQueueWorkerAPIUser +from minecode.utils import validate_uuid class ResourceURISerializer(serializers.ModelSerializer): @@ -44,14 +45,6 @@ class Meta: fields = '__all__' -def validate_uuid(uuid_string): - try: - val = uuid.UUID(uuid_string) - except ValueError: - return False - return str(val).lower() == uuid_string.lower() - - class PriorityResourceURIViewSet(viewsets.ModelViewSet): queryset = PriorityResourceURI.objects.all() serializer_class = PriorityResourceURISerializer @@ -106,7 +99,7 @@ class Meta: class ScannableURIViewSet(viewsets.ModelViewSet): queryset = ScannableURI.objects.all() serializer_class = ScannableURISerializer - permission_classes = [IsScanQueueWorkerAPIUser] + permission_classes = [IsScanQueueWorkerAPIUser|IsAdminUser] @action(detail=False, methods=["get"]) def get_next_download_url(self, request, *args, **kwargs): @@ -134,6 +127,18 @@ def get_next_download_url(self, request, *args, **kwargs): @action(detail=False, methods=["post"]) def update_status(self, request, *args, **kwargs): + """ + Update the status of a ScannableURI with UUID of `scannable_uri_uuid` + with `scan_status` + + If `scan_status` is 'failed', then a `scan_log` string is expected and + should contain the error messages for that scan. + + If `scan_status` is 'scanned', then a `scan_results_file`, + `scan_summary_file`, and `project_extra_data` mapping are expected. + `scan_results_file`, `scan_summary_file`, and `project_extra_data` are + then used to update Package data and its Resources. + """ scannable_uri_uuid = request.data.get('scannable_uri_uuid') scan_status = request.data.get('scan_status') if not scannable_uri_uuid: @@ -142,6 +147,12 @@ def update_status(self, request, *args, **kwargs): } return Response(response, status=status.HTTP_400_BAD_REQUEST) + if not scan_status: + response = { + 'error': 'missing scan_status' + } + return Response(response, status=status.HTTP_400_BAD_REQUEST) + if not validate_uuid(scannable_uri_uuid): response = { 'error': f'invalid scannable_uri_uuid: {scannable_uri_uuid}' diff --git a/minecode/tests/test_api.py b/minecode/tests/test_api.py index 291d79dd..f492191b 100644 --- a/minecode/tests/test_api.py +++ b/minecode/tests/test_api.py @@ -165,6 +165,15 @@ def test_api_scannable_uri_update_status(self): expected_response = {'error': 'missing scannable_uri_uuid'} self.assertEqual(expected_response, response.data) + data = { + 'scannable_uri_uuid': self.scannable_uri2.uuid, + 'scan_status': '' + } + response = self.csrf_client.post('/api/scan_queue/update_status/', data=data) + expected_response = {'error': 'missing scan_status'} + self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) + self.assertEqual(expected_response, response.data) + data = { 'scannable_uri_uuid': self.scannable_uri2.uuid, 'scan_status': 'invalid' diff --git a/minecode/tests/test_utils.py b/minecode/tests/test_utils.py index 66e0dde9..f242faf8 100644 --- a/minecode/tests/test_utils.py +++ b/minecode/tests/test_utils.py @@ -17,6 +17,7 @@ from minecode.utils_test import JsonBasedTesting from minecode.utils import is_int from minecode.utils import stringify_null_purl_fields +from minecode.utils import validate_uuid class UtilsTest(JsonBasedTesting, DjangoTestCase): @@ -66,3 +67,15 @@ def test_set_purl(self): def test_is_int(self): self.assertTrue(is_int(0)) self.assertFalse(is_int('a')) + + def test_validate_uuid(self): + invalid_uuid1 = 'invalid' + invalid_uuid2 = '123e4567-e89b-12d3-a456-42665544000G' + valid_uuid = 'c2cf7ef0-d3be-4011-bda7-8eb4a196eef2' + + for uuid, expected_result in [ + [invalid_uuid1, False], + [invalid_uuid2, False], + [valid_uuid, True], + ]: + self.assertEqual(expected_result, validate_uuid(uuid)) diff --git a/minecode/utils.py b/minecode/utils.py index 1bcd023b..df5ea526 100644 --- a/minecode/utils.py +++ b/minecode/utils.py @@ -12,6 +12,7 @@ import logging import os import tempfile +import uuid from django.conf import settings from django.utils.encoding import force_str @@ -331,3 +332,11 @@ def form_vcs_url(vcs_tool, vcs_url, revision_tag_or_branch=None, sub_path=None): if sub_path: vcs_url = '#'.join(str(v) for v in [vcs_url, sub_path]) return vcs_url + + +def validate_uuid(uuid_string): + try: + val = uuid.UUID(uuid_string) + except ValueError: + return False + return str(val).lower() == uuid_string.lower() From 4ecd97a74532c94e8eb4391518deba91350f0c77 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Mon, 18 Mar 2024 14:06:24 -0700 Subject: [PATCH 25/31] Move code from minecode/collectors to original location #49 #285 Signed-off-by: Jono Yang --- minecode/collectors/__init__.py | 0 minecode/collectors/maven.py | 694 ------------------ minecode/collectors/npm.py | 84 --- .../commands/get_maven_release_dates.py | 4 +- minecode/management/commands/import_queue.py | 12 +- minecode/management/commands/maven_crawler.py | 2 +- minecode/tests/test_indexing.py | 36 +- minecode/tests/test_maven.py | 89 ++- minecode/tests/test_npm.py | 5 +- minecode/visitors/maven.py | 687 ++++++++++++++++- minecode/visitors/npm.py | 80 +- packagedb/find_source_repo.py | 2 +- ...resource-filter_by_checksums-expected.json | 12 +- 13 files changed, 857 insertions(+), 850 deletions(-) delete mode 100644 minecode/collectors/__init__.py delete mode 100644 minecode/collectors/maven.py delete mode 100644 minecode/collectors/npm.py diff --git a/minecode/collectors/__init__.py b/minecode/collectors/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/minecode/collectors/maven.py b/minecode/collectors/maven.py deleted file mode 100644 index ceb0c17a..00000000 --- a/minecode/collectors/maven.py +++ /dev/null @@ -1,694 +0,0 @@ -import hashlib -import logging -import re -from typing import Dict -from urllib.parse import urlparse - -import requests -from packagedcode.maven import _parse, get_maven_pom, get_urls -from packageurl import PackageURL - -from minecode import priority_router -from minecode.visitors.maven import MAVEN_BASE_URL -from packagedb.models import PackageContentType, PackageRelation, make_relationship - -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) - -TRACE = False -TRACE_DEEP = False - -if TRACE: - logger.setLevel(logging.DEBUG) - - -def get_pom_text(namespace, name, version, qualifiers={}, base_url=MAVEN_BASE_URL): - """ - Return the contents of the POM file of the package described by the purl - field arguments in a string. - """ - # Create URLs using purl fields - if qualifiers and not isinstance(qualifiers, Dict): - return - urls = get_urls( - namespace=namespace, - name=name, - version=version, - qualifiers=qualifiers, - base_url=base_url, - ) - # Get and parse POM info - pom_url = urls["api_data_url"] - # TODO: manage different types of errors (404, etc.) - response = requests.get(pom_url) - if not response: - return - return response.text - - -def get_package_sha1(package): - """ - Return the sha1 value for `package` by checking if the sha1 file exists for - `package` on maven and returning the contents if it does. - - If the sha1 is invalid, we download the package's JAR and calculate the sha1 - from that. - """ - download_url = package.repository_download_url - sha1_download_url = f"{download_url}.sha1" - response = requests.get(sha1_download_url) - if response.ok: - sha1_contents = response.text.strip().split() - sha1 = sha1_contents[0] - sha1 = validate_sha1(sha1) - if not sha1: - # Download JAR and calculate sha1 if we cannot get it from the repo - response = requests.get(download_url) - if response: - sha1_hash = hashlib.new("sha1", response.content) - sha1 = sha1_hash.hexdigest() - return sha1 - - -def fetch_parent(pom_text, base_url=MAVEN_BASE_URL): - """ - Return the parent pom text of `pom_text`, or None if `pom_text` has no parent. - """ - if not pom_text: - return - pom = get_maven_pom(text=pom_text) - if ( - pom.parent - and pom.parent.group_id - and pom.parent.artifact_id - and pom.parent.version.version - ): - parent_namespace = pom.parent.group_id - parent_name = pom.parent.artifact_id - parent_version = str(pom.parent.version.version) - parent_pom_text = get_pom_text( - namespace=parent_namespace, - name=parent_name, - version=parent_version, - qualifiers={}, - base_url=base_url, - ) - return parent_pom_text - - -def get_ancestry(pom_text, base_url=MAVEN_BASE_URL): - """ - Return a list of pom text of the ancestors of `pom`. The list is ordered - from oldest ancestor to newest. The list is empty is there is no parent pom. - """ - ancestors = [] - has_parent = True - while has_parent: - parent_pom_text = fetch_parent(pom_text=pom_text, base_url=base_url) - if not parent_pom_text: - has_parent = False - else: - ancestors.append(parent_pom_text) - pom_text = parent_pom_text - return reversed(ancestors) - - -def get_merged_ancestor_package_from_maven_package(package, base_url=MAVEN_BASE_URL): - """ - Merge package details of a package with its ancestor pom - and return the merged package. - """ - if not package: - return - pom_text = get_pom_text( - name=package.name, - namespace=package.namespace, - version=package.version, - qualifiers=package.qualifiers, - base_url=base_url, - ) - merged_package = merge_ancestors( - ancestor_pom_texts=get_ancestry(pom_text), - package=package, - ) - return merged_package - - -def merge_parent(package, parent_package): - """ - Merge `parent_package` data into `package` and return `package. - """ - mergeable_fields = ( - "declared_license_expression", - "homepage_url", - "parties", - ) - for field in mergeable_fields: - # If `field` is empty on the package we're looking at, populate - # those fields with values from the parent package. - if not getattr(package, field): - value = getattr(parent_package, field) - setattr(package, field, value) - - msg = f"Field `{field}` has been updated using values obtained from the parent POM {parent_package.purl}" - history = package.extra_data.get("history") - if history: - package.extra_data["history"].append(msg) - else: - package.extra_data["history"] = [msg] - - return package - - -def merge_ancestors(ancestor_pom_texts, package): - """ - Merge metadata from `ancestor_pom_text` into `package`. - - The order of POM content in `ancestor_pom_texts` is expected to be in the - order of oldest ancestor to newest. - """ - for ancestor_pom_text in ancestor_pom_texts: - ancestor_package = _parse( - datasource_id="maven_pom", - package_type="maven", - primary_language="Java", - text=ancestor_pom_text, - ) - package = merge_parent(package, ancestor_package) - return package - - -def map_maven_package(package_url, package_content): - """ - Add a maven `package_url` to the PackageDB. - - Return an error string if errors have occured in the process. - """ - from minecode.model_utils import add_package_to_scan_queue, merge_or_create_package - - db_package = None - error = "" - - if "repository_url" in package_url.qualifiers: - base_url = package_url.qualifiers["repository_url"] - else: - base_url = MAVEN_BASE_URL - - pom_text = get_pom_text( - namespace=package_url.namespace, - name=package_url.name, - version=package_url.version, - qualifiers=package_url.qualifiers, - base_url=base_url, - ) - if not pom_text: - msg = f"Package does not exist on maven: {package_url}" - error += msg + "\n" - logger.error(msg) - return db_package, error - - package = _parse( - "maven_pom", - "maven", - "Java", - text=pom_text, - base_url=base_url, - ) - ancestor_pom_texts = get_ancestry(pom_text=pom_text, base_url=base_url) - package = merge_ancestors(ancestor_pom_texts=ancestor_pom_texts, package=package) - - urls = get_urls( - namespace=package_url.namespace, - name=package_url.name, - version=package_url.version, - qualifiers=package_url.qualifiers, - base_url=base_url, - ) - # In the case of looking up a maven package with qualifiers of - # `classifiers=sources`, the purl of the package created from the pom does - # not have the qualifiers, so we need to set them. Additionally, the download - # url is not properly generated since it would be missing the sources bit - # from the filename. - package.qualifiers = package_url.qualifiers - package.download_url = urls["repository_download_url"] - package.repository_download_url = urls["repository_download_url"] - - # Set package_content value - package.extra_data["package_content"] = package_content - - # If sha1 exists for a jar, we know we can create the package - # Use pom info as base and create packages for binary and source package - - # Check to see if binary is available - sha1 = get_package_sha1(package) - if sha1: - package.sha1 = sha1 - db_package, _, _, _ = merge_or_create_package(package, visit_level=50) - else: - msg = f"Failed to retrieve JAR: {package_url}" - error += msg + "\n" - logger.error(msg) - - # Submit package for scanning - if db_package: - add_package_to_scan_queue(db_package) - - return db_package, error - - -def validate_sha1(sha1): - """ - Validate a `sha1` string. - - Return `sha1` if it is valid, None otherwise. - """ - if sha1 and len(sha1) != 40: - logger.warning(f'Invalid SHA1 length ({len(sha1)}): "{sha1}": SHA1 ignored!') - sha1 = None - return sha1 - - -def map_maven_binary_and_source(package_url): - """ - Get metadata for the binary and source release of the Maven package - `package_url` and save it to the PackageDB. - - Return an error string for errors that occur, or empty string if there is no error. - """ - error = "" - package, emsg = map_maven_package(package_url, PackageContentType.BINARY) - if emsg: - error += emsg - - source_package_url = package_url - source_package_url.qualifiers["classifier"] = "sources" - source_package, emsg = map_maven_package( - source_package_url, PackageContentType.SOURCE_ARCHIVE - ) - if emsg: - error += emsg - - if package and source_package: - make_relationship( - from_package=source_package, - to_package=package, - relationship=PackageRelation.Relationship.SOURCE_PACKAGE, - ) - - return error - - -def map_maven_packages(package_url): - """ - Given a valid `package_url` with no version, get metadata for the binary and - source release for each version of the Maven package `package_url` and save - it to the PackageDB. - - Return an error string for errors that occur, or empty string if there is no error. - """ - error = "" - namespace = package_url.namespace - name = package_url.name - # Find all versions of this package - query_params = f"g:{namespace}+AND+a:{name}" - url = f"https://search.maven.org/solrsearch/select?q={query_params}&core=gav" - response = requests.get(url) - if response: - package_listings = response.json().get("response", {}).get("docs", []) - for listing in package_listings: - purl = PackageURL( - type="maven", - namespace=listing.get("g"), - name=listing.get("a"), - version=listing.get("v"), - ) - emsg = map_maven_binary_and_source(purl) - if emsg: - error += emsg - return error - - -@priority_router.route("pkg:maven/.*") -def process_request(purl_str): - """ - Process `priority_resource_uri` containing a maven Package URL (PURL) as a - URI. - - This involves obtaining Package information for the PURL from maven and - using it to create a new PackageDB entry. The package is then added to the - scan queue afterwards. We also get the Package information for the - accompanying source package and add it to the PackageDB and scan queue, if - available. - - Return an error string for errors that occur, or empty string if there is no error. - """ - try: - package_url = PackageURL.from_string(purl_str) - except ValueError as e: - error = f"error occured when parsing {purl_str}: {e}" - return error - - has_version = bool(package_url.version) - if has_version: - error = map_maven_binary_and_source(package_url) - else: - error = map_maven_packages(package_url) - - return error - - -collect_links = re.compile(r'href="([^"]+)"').findall -collect_links_and_artifact_timestamps = re.compile( - r'\s+(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}|-)' -).findall - - -def check_if_file_name_is_linked_on_page(file_name, links, **kwargs): - """ - Return True if `file_name` is in `links` - """ - return any(l.endswith(file_name) for l in links) - - -def check_if_page_has_pom_files(links, **kwargs): - """ - Return True of any entry in `links` ends with .pom. - """ - return any(l.endswith(".pom") for l in links) - - -def check_if_page_has_directories(links, **kwargs): - """ - Return True if any entry, excluding "../", ends with /. - """ - return any(l.endswith("/") for l in links if l != "../") - - -def check_if_package_version_page(links, **kwargs): - """ - Return True if `links` contains pom files and has no directories - """ - return check_if_page_has_pom_files( - links=links - ) and not check_if_page_has_directories(links=links) - - -def check_if_package_page(links, **kwargs): - return check_if_file_name_is_linked_on_page( - file_name="maven-metadata.xml", links=links - ) and not check_if_page_has_pom_files(links=links) - - -def check_if_maven_root(links, **kwargs): - """ - Return True if "archetype-catalog.xml" is in `links`, as the root of a Maven - repo contains "archetype-catalog.xml". - """ - return check_if_file_name_is_linked_on_page( - file_name="archetype-catalog.xml", links=links - ) - - -def check_on_page(url, checker): - """ - Return True if there is a link on `url` that is the same as `file_name`, - False otherwise. - """ - response = requests.get(url) - if response: - links = collect_links(response.text) - return checker(links=links) - return False - - -def is_maven_root(url): - """ - Return True if `url` is the root of a Maven repo, False otherwise. - """ - return check_on_page(url, check_if_maven_root) - - -def is_package_page(url): - """ - Return True if `url` is a package page on a Maven repo, False otherwise. - """ - return check_on_page(url, check_if_package_page) - - -def is_package_version_page(url): - """ - Return True if `url` is a package version page on a Maven repo, False otherwise. - """ - return check_on_page(url, check_if_package_version_page) - - -def url_parts(url): - parsed_url = urlparse(url) - scheme = parsed_url.scheme - netloc = parsed_url.netloc - path_segments = [p for p in parsed_url.path.split("/") if p] - return scheme, netloc, path_segments - - -def create_url(scheme, netloc, path_segments): - url_template = f"{scheme}://{netloc}" - path = "/".join(path_segments) - return f"{url_template}/{path}" - - -def get_maven_root(url): - """ - Given `url`, that is a URL to namespace, package, or artifact in a Maven - repo, return the URL to the root of that repo. If a Maven root cannot be - determined, return None. - - >>> get_maven_root('https://repo1.maven.org/maven2/net/shibboleth/parent/7.11.0/') - 'https://repo1.maven.org/maven2' - """ - scheme, netloc, path_segments = url_parts(url) - for i in range(len(path_segments)): - segments = path_segments[: i + 1] - url_segment = create_url(scheme, netloc, segments) - if is_maven_root(url_segment): - return url_segment - return None - - -def determine_namespace_name_version_from_url(url, root_url=None): - """ - Return a 3-tuple containing strings of a Package namespace, name, and - version, determined from `url`, where `url` points to namespace, package, - specific package version, or artifact on a Maven repo. - - Return None if a Maven root cannot be determined from `url`. - - >>> determine_namespace_name_version_from_url('https://repo1.maven.org/maven2/net/shibboleth/parent/7.11.0/') - ('net.shibboleth', 'parent', '7.11.0') - """ - if not root_url: - root_url = get_maven_root(url) - if not root_url: - raise Exception(f"Error: not a Maven repository: {url}") - - _, remaining_path_segments = url.split(root_url) - remaining_path_segments = remaining_path_segments.split("/") - remaining_path_segments = [p for p in remaining_path_segments if p] - - namespace_segments = [] - package_name = "" - package_version = "" - for i in range(len(remaining_path_segments)): - segment = remaining_path_segments[i] - segments = remaining_path_segments[: i + 1] - path = "/".join(segments) - url_segment = f"{root_url}/{path}" - if is_package_page(url_segment): - package_name = segment - elif is_package_version_page(url_segment): - package_version = segment - else: - namespace_segments.append(segment) - namespace = ".".join(namespace_segments) - return namespace, package_name, package_version - - -def add_to_import_queue(url, root_url): - """ - Create ImportableURI for the Maven repo package page at `url`. - """ - from minecode.models import ImportableURI - - data = None - response = requests.get(url) - if response: - data = response.text - namespace, name, _ = determine_namespace_name_version_from_url(url, root_url) - purl = PackageURL( - type="maven", - namespace=namespace, - name=name, - ) - importable_uri = ImportableURI.objects.insert(url, data, purl) - if importable_uri: - logger.info(f"Inserted {url} into ImportableURI queue") - - -def filter_only_directories(timestamps_by_links): - """ - Given a mapping of `timestamps_by_links`, where the links are directory names (which end with `/`), - """ - timestamps_by_links_filtered = {} - for link, timestamp in timestamps_by_links.items(): - if link != "../" and link.endswith("/"): - timestamps_by_links_filtered[link] = timestamp - return timestamps_by_links_filtered - - -valid_artifact_extensions = [ - "ejb3", - "ear", - "aar", - "apk", - "gem", - "jar", - "nar", - # 'pom', - "so", - "swc", - "tar", - "tar.gz", - "war", - "xar", - "zip", -] - - -def filter_for_artifacts(timestamps_by_links): - """ - Given a mapping of `timestamps_by_links`, where the links are the filenames - of Maven artifacts, return a mapping of filenames whose extension is in - `valid_artifact_extensions` and their timestamps. - """ - timestamps_by_links_filtered = {} - for link, timestamp in timestamps_by_links.items(): - for ext in valid_artifact_extensions: - if link.endswith(ext): - timestamps_by_links_filtered[link] = timestamp - return timestamps_by_links_filtered - - -def collect_links_from_text(text, filter): - """ - Return a mapping of link locations and their timestamps, given HTML `text` - content, that is filtered using `filter`. - """ - links_and_timestamps = collect_links_and_artifact_timestamps(text) - timestamps_by_links = {} - for link, timestamp in links_and_timestamps: - if timestamp == "-": - timestamp = "" - timestamps_by_links[link] = timestamp - - timestamps_by_links = filter(timestamps_by_links=timestamps_by_links) - return timestamps_by_links - - -def create_absolute_urls_for_links(text, url, filter): - """ - Given the `text` contents from `url`, return a mapping of absolute URLs to - links from `url` and their timestamps, that is then filtered by `filter`. - """ - timestamps_by_absolute_links = {} - url = url.rstrip("/") - timestamps_by_links = collect_links_from_text(text, filter) - for link, timestamp in timestamps_by_links.items(): - if not link.startswith(url): - link = f"{url}/{link}" - timestamps_by_absolute_links[link] = timestamp - return timestamps_by_absolute_links - - -def get_directory_links(url): - """ - Return a list of absolute directory URLs of the hyperlinks from `url` - """ - timestamps_by_directory_links = {} - response = requests.get(url) - if response: - timestamps_by_directory_links = create_absolute_urls_for_links( - response.text, url=url, filter=filter_only_directories - ) - return timestamps_by_directory_links - - -def get_artifact_links(url): - """ - Return a list of absolute directory URLs of the hyperlinks from `url` - """ - timestamps_by_artifact_links = [] - response = requests.get(url) - if response: - timestamps_by_artifact_links = create_absolute_urls_for_links( - response.text, url=url, filter=filter_for_artifacts - ) - return timestamps_by_artifact_links - - -def crawl_to_package(url, root_url): - """ - Given a maven repo `url`, - """ - if is_package_page(url): - add_to_import_queue(url, root_url) - return - - for link in get_directory_links(url): - crawl_to_package(link, root_url) - - -def crawl_maven_repo_from_root(root_url): - """ - Given the `url` to a maven root, traverse the repo depth-first and add - packages to the import queue. - """ - crawl_to_package(root_url, root_url) - - -def get_artifact_sha1(artifact_url): - """ - Return the SHA1 value of the Maven artifact located at `artifact_url`. - """ - sha1 = None - artifact_sha1_url = f"{artifact_url}.sha1" - response = requests.get(artifact_sha1_url) - if response: - sha1_contents = response.text.strip().split() - sha1 = sha1_contents[0] - sha1 = validate_sha1(sha1) - return sha1 - - -def get_classifier_from_artifact_url( - artifact_url, package_version_page_url, package_name, package_version -): - """ - Return the classifier from a Maven artifact URL `artifact_url`, otherwise - return None if a classifier cannot be determined from `artifact_url` - """ - classifier = None - # https://repo1.maven.org/maven2/net/alchim31/livereload-jvm/0.2.0 - package_version_page_url = package_version_page_url.rstrip("/") - # https://repo1.maven.org/maven2/net/alchim31/livereload-jvm/0.2.0/livereload-jvm-0.2.0 - leading_url_portion = f"{package_version_page_url}/{package_name}-{package_version}" - # artifact_url = 'https://repo1.maven.org/maven2/net/alchim31/livereload-jvm/0.2.0/livereload-jvm-0.2.0-onejar.jar' - # ['', '-onejar.jar'] - _, remaining_url_portion = artifact_url.split(leading_url_portion) - # ['-onejar', 'jar'] - remaining_url_portions = remaining_url_portion.split(".") - if remaining_url_portions and remaining_url_portions[0]: - # '-onejar' - classifier = remaining_url_portions[0] - if classifier.startswith("-"): - # 'onejar' - classifier = classifier[1:] - return classifier diff --git a/minecode/collectors/npm.py b/minecode/collectors/npm.py deleted file mode 100644 index 818e645f..00000000 --- a/minecode/collectors/npm.py +++ /dev/null @@ -1,84 +0,0 @@ -import logging - -import requests -from packagedcode.npm import NpmPackageJsonHandler, npm_api_url -from packageurl import PackageURL - -from minecode import priority_router -from packagedb.models import PackageContentType - -logger = logging.getLogger(__name__) -handler = logging.StreamHandler() -logger.addHandler(handler) -logger.setLevel(logging.INFO) - - -def get_package_json(namespace, name, version): - """ - Return the contents of the package.json file of the package described by the purl - field arguments in a string. - """ - # Create URLs using purl fields - url = npm_api_url( - namespace=namespace, - name=name, - version=version, - ) - - try: - response = requests.get(url) - response.raise_for_status() - return response.json() - except requests.exceptions.HTTPError as err: - logger.error(f"HTTP error occurred: {err}") - - -def map_npm_package(package_url): - """ - Add a npm `package_url` to the PackageDB. - - Return an error string if any errors are encountered during the process - """ - from minecode.model_utils import add_package_to_scan_queue, merge_or_create_package - - package_json = get_package_json( - namespace=package_url.namespace, - name=package_url.name, - version=package_url.version, - ) - - if not package_json: - error = f"Package does not exist on npmjs: {package_url}" - logger.error(error) - return error - - package = NpmPackageJsonHandler._parse(json_data=package_json) - package.extra_data["package_content"] = PackageContentType.SOURCE_ARCHIVE - - db_package, _, _, error = merge_or_create_package(package, visit_level=0) - - # Submit package for scanning - if db_package: - add_package_to_scan_queue(db_package) - - return error - - -@priority_router.route("pkg:npm/.*") -def process_request(purl_str): - """ - Process `priority_resource_uri` containing a npm Package URL (PURL) as a - URI. - - This involves obtaining Package information for the PURL from npm and - using it to create a new PackageDB entry. The package is then added to the - scan queue afterwards. - """ - package_url = PackageURL.from_string(purl_str) - if not package_url.version: - return - - error_msg = map_npm_package(package_url) - - if error_msg: - return error_msg diff --git a/minecode/management/commands/get_maven_release_dates.py b/minecode/management/commands/get_maven_release_dates.py index dda3d868..c120b67e 100644 --- a/minecode/management/commands/get_maven_release_dates.py +++ b/minecode/management/commands/get_maven_release_dates.py @@ -15,8 +15,8 @@ import requests from minecode.management.commands import VerboseCommand -from minecode.collectors.maven import collect_links_from_text -from minecode.collectors.maven import filter_for_artifacts +from minecode.visitors.maven import collect_links_from_text +from minecode.visitors.maven import filter_for_artifacts from packagedb.models import Package diff --git a/minecode/management/commands/import_queue.py b/minecode/management/commands/import_queue.py index 178eaa22..9003de5f 100644 --- a/minecode/management/commands/import_queue.py +++ b/minecode/management/commands/import_queue.py @@ -22,14 +22,14 @@ from minecode.management.commands import get_error_message from minecode.management.commands import VerboseCommand from minecode.models import ImportableURI -from minecode.collectors.maven import get_artifact_links -from minecode.collectors.maven import get_classifier_from_artifact_url -from minecode.collectors.maven import collect_links_from_text -from minecode.collectors.maven import filter_only_directories -from minecode.collectors.maven import get_artifact_sha1 +from minecode.visitors.maven import get_artifact_links +from minecode.visitors.maven import get_classifier_from_artifact_url +from minecode.visitors.maven import collect_links_from_text +from minecode.visitors.maven import filter_only_directories +from minecode.visitors.maven import get_artifact_sha1 from minecode.model_utils import merge_or_create_package from packagedcode.models import PackageData -from minecode.collectors.maven import determine_namespace_name_version_from_url +from minecode.visitors.maven import determine_namespace_name_version_from_url logger = logging.getLogger(__name__) diff --git a/minecode/management/commands/maven_crawler.py b/minecode/management/commands/maven_crawler.py index ee68b163..51c2be96 100644 --- a/minecode/management/commands/maven_crawler.py +++ b/minecode/management/commands/maven_crawler.py @@ -10,7 +10,7 @@ import logging import sys -from minecode.collectors.maven import crawl_maven_repo_from_root +from minecode.visitors.maven import crawl_maven_repo_from_root from minecode.management.commands import VerboseCommand diff --git a/minecode/tests/test_indexing.py b/minecode/tests/test_indexing.py index f4ccc236..24d46b45 100644 --- a/minecode/tests/test_indexing.py +++ b/minecode/tests/test_indexing.py @@ -32,7 +32,7 @@ def setUp(self): version='20040705.181715' ) - def test_ProcessScansTest_index_package_files(self): + def test_indexing_index_package_files(self): scan_data_loc = self.get_test_loc('scancodeio/get_scan_data.json') with open(scan_data_loc, 'rb') as f: scan_data = json.loads(f.read()) @@ -43,7 +43,7 @@ def test_ProcessScansTest_index_package_files(self): expected_resources_loc = self.get_test_loc('scancodeio/get_scan_data_expected_resources.json') self.check_expected_results(results, expected_resources_loc, regen=False) - def test_ProcessScansTest_process_scan(self): + def test_indexing_index_package(self): scan_data_loc = self.get_test_loc('scancodeio/get_scan_data.json') with open(scan_data_loc, 'rb') as f: scan_data = json.load(f) @@ -52,6 +52,14 @@ def test_ProcessScansTest_process_scan(self): with open(scan_summary_loc, 'rb') as f: scan_summary = json.load(f) + project_extra_data = { + 'md5': 'md5', + 'sha1': 'sha1', + 'sha256': 'sha256', + 'sha512': 'sha512', + 'size': 100, + } + # Set up ScannableURI scannable_uri = ScannableURI.objects.create( uri='https://repo1.maven.org/maven2/maven/wagon-api/20040705.181715/wagon-api-20040705.181715.jar', @@ -59,12 +67,32 @@ def test_ProcessScansTest_process_scan(self): package=self.package1 ) + self.assertFalse(self.package1.md5) + self.assertFalse(self.package1.sha1) + self.assertFalse(self.package1.sha256) + self.assertFalse(self.package1.sha512) + self.assertFalse(self.package1.size) + self.assertFalse(self.package1.declared_license_expression) + self.assertFalse(self.package1.copyright) + self.assertEqual(0, Resource.objects.all().count()) + # Run test - indexing.index_package(scannable_uri, self.package1, scan_data, scan_summary) + indexing.index_package( + scannable_uri, + self.package1, + scan_data, + scan_summary, + project_extra_data, + ) - # Make sure that we get license_expression and copyright from the summary + # Make sure that Package data is updated self.assertEqual('apache-2.0', self.package1.declared_license_expression) self.assertEqual('Copyright (c) Apache Software Foundation', self.package1.copyright) + self.assertEqual('md5', self.package1.md5) + self.assertEqual('sha1', self.package1.sha1) + self.assertEqual('sha256', self.package1.sha256) + self.assertEqual('sha512', self.package1.sha512) + self.assertEqual(100, self.package1.size) result = Resource.objects.filter(package=self.package1) self.assertEqual(64, result.count()) diff --git a/minecode/tests/test_maven.py b/minecode/tests/test_maven.py index 5a93a601..2375f20b 100644 --- a/minecode/tests/test_maven.py +++ b/minecode/tests/test_maven.py @@ -17,7 +17,6 @@ from django.test import TestCase as DjangoTestCase -from minecode.collectors import maven as maven_collector from minecode.management.commands.run_map import map_uri from minecode.management.commands.run_visit import visit_uri from minecode.mappers import maven as maven_mapper @@ -701,7 +700,7 @@ def setUp(self): ) def test_get_pom_text(self, regen=False): - pom_contents = maven_collector.get_pom_text( + pom_contents = maven_visitor.get_pom_text( namespace=self.scan_package.namespace, name=self.scan_package.name, version=self.scan_package.version @@ -712,7 +711,7 @@ def test_get_pom_text(self, regen=False): self.assertEqual(self.expected_pom_contents, pom_contents) def test_get_package_sha1(self): - sha1 = maven_collector.get_package_sha1(self.scan_package) + sha1 = maven_visitor.get_package_sha1(self.scan_package) expected_sha1 = '60c708f55deeb7c5dfce8a7886ef09cbc1388eca' self.assertEqual(expected_sha1, sha1) @@ -720,7 +719,7 @@ def test_map_maven_package(self): package_count = packagedb.models.Package.objects.all().count() self.assertEqual(0, package_count) package_url = PackageURL.from_string(self.scan_package.purl) - maven_collector.map_maven_package(package_url, packagedb.models.PackageContentType.BINARY) + maven_visitor.map_maven_package(package_url, packagedb.models.PackageContentType.BINARY) package_count = packagedb.models.Package.objects.all().count() self.assertEqual(1, package_count) package = packagedb.models.Package.objects.all().first() @@ -732,7 +731,7 @@ def test_map_maven_package_custom_repo_url(self): self.assertEqual(0, package_count) custom_repo_purl = "pkg:maven/org.eclipse.core/runtime@20070801?repository_url=https://packages.atlassian.com/mvn/maven-atlassian-external/" package_url = PackageURL.from_string(custom_repo_purl) - maven_collector.map_maven_package(package_url, packagedb.models.PackageContentType.BINARY) + maven_visitor.map_maven_package(package_url, packagedb.models.PackageContentType.BINARY) package_count = packagedb.models.Package.objects.all().count() self.assertEqual(1, package_count) package = packagedb.models.Package.objects.all().first() @@ -747,7 +746,7 @@ def test_process_request(self): sources_download_url = 'https://repo1.maven.org/maven2/org/apache/twill/twill-core/0.12.0/twill-core-0.12.0-sources.jar' package_count = packagedb.models.Package.objects.all().count() self.assertEqual(0, package_count) - maven_collector.process_request(purl_str) + maven_visitor.process_request(purl_str) package_count = packagedb.models.Package.objects.all().count() self.assertEqual(2, package_count) purls = [ @@ -765,7 +764,7 @@ def test_fetch_parent(self, regen=False): pom_loc = self.get_test_loc('maven/pom/ant-antlr-1.10.1.pom') with open(pom_loc) as f: pom_text = f.read() - parent_pom_text = maven_collector.fetch_parent(pom_text) + parent_pom_text = maven_visitor.fetch_parent(pom_text) expected_loc = self.get_test_loc('maven/pom/ant-parent-1.10.1.pom') if regen: @@ -780,7 +779,7 @@ def test_get_ancestry(self): pom_loc = self.get_test_loc('maven/pom/pulsar-client-1x-2.5.1.pom') with open(pom_loc) as f: pom_text = f.read() - ancestor_pom_texts = list(maven_collector.get_ancestry(pom_text)) + ancestor_pom_texts = list(maven_visitor.get_ancestry(pom_text)) expected_ancestor_pom_texts = [] for expected_loc in [ self.get_test_loc('maven/pom/apache-18.pom'), @@ -814,7 +813,7 @@ def test_merge_parent(self, regen=False): 'Java', text=parent_pom_text ) - package = maven_collector.merge_parent(package, parent_package) + package = maven_visitor.merge_parent(package, parent_package) expected_after_loc = self.get_test_loc('maven/pom/ant-antlr-1.10.1-package_after.json') self.check_expected_results(package.to_dict(), expected_after_loc, regen=regen) @@ -841,15 +840,15 @@ def test_merge_ancestors(self, regen=False): pom_text = f.read() ancestor_pom_texts.append(pom_text) - maven_collector.merge_ancestors(ancestor_pom_texts, package) + maven_visitor.merge_ancestors(ancestor_pom_texts, package) expected_after_loc = self.get_test_loc('maven/pom/pulsar-client-1x-2.5.1-package_after.json') self.check_expected_results(package.to_dict(), expected_after_loc, regen=regen) - @mock.patch("minecode.collectors.maven.get_pom_text") + @mock.patch("minecode.visitors.maven.get_pom_text") def test_get_merged_ancestor_package_from_maven_package(self, get_pom_text_mock, regen=False): get_pom_text_mock.return_value = "" ancestor_pom_texts = [] - with patch("minecode.collectors.maven.get_ancestry") as mock_get_ancestry: + with patch("minecode.visitors.maven.get_ancestry") as mock_get_ancestry: for loc in [ self.get_test_loc('maven/pom/apache-18.pom'), self.get_test_loc('maven/pom/pulsar-2.5.1.pom'), @@ -866,7 +865,7 @@ def test_get_merged_ancestor_package_from_maven_package(self, get_pom_text_mock, type="maven", download_url="https://repo1.maven.org/maven2/org/apache/pulsar/pulsar-client/2.5.1/pulsar-client-2.5.1.jar", ) - merged_package = maven_collector.get_merged_ancestor_package_from_maven_package(package=db_package) + merged_package = maven_visitor.get_merged_ancestor_package_from_maven_package(package=db_package) expected_loc = self.get_test_loc('maven/pom/pulsar-client-merged-ancestor-package.json') self.check_expected_results(merged_package.to_dict(), expected_loc, regen=regen) @@ -877,60 +876,60 @@ class MavenCrawlerFunctionsTest(JsonBasedTesting, DjangoTestCase): def test_check_if_file_name_is_linked_on_page(self): links = ['foo/', 'bar/', 'baz/'] self.assertTrue( - maven_collector.check_if_file_name_is_linked_on_page('foo/', links) + maven_visitor.check_if_file_name_is_linked_on_page('foo/', links) ) self.assertFalse( - maven_collector.check_if_file_name_is_linked_on_page('qux/', links) + maven_visitor.check_if_file_name_is_linked_on_page('qux/', links) ) def test_check_if_page_has_pom_files(self): links1 = ['foo/', 'bar.jar', 'bar.pom'] links2 = ['foo/', 'bar.jar'] - self.assertTrue(maven_collector.check_if_page_has_pom_files(links1)) - self.assertFalse(maven_collector.check_if_page_has_pom_files(links2)) + self.assertTrue(maven_visitor.check_if_page_has_pom_files(links1)) + self.assertFalse(maven_visitor.check_if_page_has_pom_files(links2)) def test_check_if_page_has_directories(self): links1 = ['foo/', 'bar/', 'baz/'] links2 = ['../', 'bar.pom', 'bar.jar'] - self.assertTrue(maven_collector.check_if_page_has_directories(links1)) - self.assertFalse(maven_collector.check_if_page_has_directories(links2)) + self.assertTrue(maven_visitor.check_if_page_has_directories(links1)) + self.assertFalse(maven_visitor.check_if_page_has_directories(links2)) def test_check_if_package_version_page(self): links1 = ['../', 'bar.pom', 'bar.jar'] links2 = ['../', 'foo/', 'bar/', 'baz/'] - self.assertTrue(maven_collector.check_if_package_version_page(links1)) - self.assertFalse(maven_collector.check_if_package_version_page(links2)) + self.assertTrue(maven_visitor.check_if_package_version_page(links1)) + self.assertFalse(maven_visitor.check_if_package_version_page(links2)) def test_check_if_package_page(self): links1 = ['../', 'maven-metadata.xml'] links2 = ['../', 'bar.pom', 'bar.jar'] - self.assertTrue(maven_collector.check_if_package_page(links1)) - self.assertFalse(maven_collector.check_if_package_page(links2)) + self.assertTrue(maven_visitor.check_if_package_page(links1)) + self.assertFalse(maven_visitor.check_if_package_page(links2)) def test_check_if_maven_root(self): links1 = ['../', 'archetype-catalog.xml'] links2 = ['../', 'bar.pom', 'bar.jar'] - self.assertTrue(maven_collector.check_if_maven_root(links1)) - self.assertFalse(maven_collector.check_if_maven_root(links2)) + self.assertTrue(maven_visitor.check_if_maven_root(links1)) + self.assertFalse(maven_visitor.check_if_maven_root(links2)) @mock.patch('requests.get') def test_check_on_page(self, mock_request_get): - checker = maven_collector.check_if_page_has_pom_files + checker = maven_visitor.check_if_page_has_pom_files mock_request_get.return_value.ok = True mock_request_get.return_value.text = 'parent-7.11.0.pom' - self.assertTrue(maven_collector.check_on_page('https://repo1.maven.org/maven2/net/shibboleth/parent/7.11.0/', checker)) + self.assertTrue(maven_visitor.check_on_page('https://repo1.maven.org/maven2/net/shibboleth/parent/7.11.0/', checker)) @mock.patch('requests.get') def test_is_maven_root(self, mock_request_get): mock_request_get.return_value.ok = True mock_request_get.return_value.text = 'archetype-catalog.xml' - self.assertTrue(maven_collector.is_maven_root('https://repo1.maven.org/maven2/')) + self.assertTrue(maven_visitor.is_maven_root('https://repo1.maven.org/maven2/')) @mock.patch('requests.get') def test_is_package_page(self, mock_request_get): mock_request_get.return_value.ok = True mock_request_get.return_value.text = 'maven-metadata.xml' - self.assertTrue(maven_collector.is_package_page('https://repo1.maven.org/maven2/xml-apis/xml-apis/')) + self.assertTrue(maven_visitor.is_package_page('https://repo1.maven.org/maven2/xml-apis/xml-apis/')) @mock.patch('requests.get') def test_is_package_version_page(self, mock_request_get): @@ -939,11 +938,11 @@ def test_is_package_version_page(self, mock_request_get): ../ parent-7.11.0.pom ''' - self.assertTrue(maven_collector.is_package_version_page('https://repo1.maven.org/maven2/xml-apis/xml-apis/1.0.b2/')) + self.assertTrue(maven_visitor.is_package_version_page('https://repo1.maven.org/maven2/xml-apis/xml-apis/1.0.b2/')) def test_url_parts(self): url = 'https://example.com/foo/bar/baz.jar' - scheme, netloc, path_segments = maven_collector.url_parts(url) + scheme, netloc, path_segments = maven_visitor.url_parts(url) self.assertEqual('https', scheme) self.assertEqual('example.com', netloc) self.assertEquals(['foo', 'bar', 'baz.jar'], path_segments) @@ -955,7 +954,7 @@ def test_create_url(self): url = 'https://example.com/foo/bar/baz.jar' self.assertEqual( url, - maven_collector.create_url(scheme, netloc, path_segments) + maven_visitor.create_url(scheme, netloc, path_segments) ) @mock.patch('requests.get') @@ -964,7 +963,7 @@ def test_get_maven_root(self, mock_request_get): mock_request_get.return_value.text = 'archetype-catalog.xml' self.assertEqual( 'https://repo1.maven.org/maven2', - maven_collector.get_maven_root('https://repo1.maven.org/maven2/net/shibboleth/parent/7.11.0/') + maven_visitor.get_maven_root('https://repo1.maven.org/maven2/net/shibboleth/parent/7.11.0/') ) @mock.patch('requests.get') @@ -994,7 +993,7 @@ def test_determine_namespace_name_version_from_url(self, mock_request_get): package_version_page ] - namespace, package_name, package_version = maven_collector.determine_namespace_name_version_from_url(url, root_url) + namespace, package_name, package_version = maven_visitor.determine_namespace_name_version_from_url(url, root_url) self.assertEqual('xml-apis', namespace) self.assertEqual('xml-apis', package_name) self.assertEqual('1.0.b2', package_version) @@ -1030,7 +1029,7 @@ def test_add_to_import_queue(self, mock_request_get): ] self.assertEqual(0, ImportableURI.objects.all().count()) - maven_collector.add_to_import_queue(url, root_url ) + maven_visitor.add_to_import_queue(url, root_url ) self.assertEqual(1, ImportableURI.objects.all().count()) importable_uri = ImportableURI.objects.get(uri=url) self.assertEqual('pkg:maven/xml-apis/xml-apis', importable_uri.package_url) @@ -1046,7 +1045,7 @@ def test_filter_only_directories(self): } self.assertEqual( expected, - maven_collector.filter_only_directories(timestamps_by_links) + maven_visitor.filter_only_directories(timestamps_by_links) ) def test_filter_for_artifacts(self): @@ -1084,10 +1083,10 @@ def test_filter_for_artifacts(self): 'foo.xar': '2023-09-28', 'foo.zip': '2023-09-28', } - self.assertEqual(expected, maven_collector.filter_for_artifacts(timestamps_by_links)) + self.assertEqual(expected, maven_visitor.filter_for_artifacts(timestamps_by_links)) def test_collect_links_from_text(self): - filter = maven_collector.filter_only_directories + filter = maven_visitor.filter_only_directories text = ''' ../ 1.0.b2/ @@ -1101,11 +1100,11 @@ def test_collect_links_from_text(self): } self.assertEqual( expected, - maven_collector.collect_links_from_text(text, filter=filter) + maven_visitor.collect_links_from_text(text, filter=filter) ) def test_create_absolute_urls_for_links(self): - filter = maven_collector.filter_only_directories + filter = maven_visitor.filter_only_directories text = ''' ../ 1.0.b2/ @@ -1120,7 +1119,7 @@ def test_create_absolute_urls_for_links(self): } self.assertEqual( expected, - maven_collector.create_absolute_urls_for_links(text, url, filter=filter) + maven_visitor.create_absolute_urls_for_links(text, url, filter=filter) ) @mock.patch('requests.get') @@ -1138,7 +1137,7 @@ def test_get_directory_links(self, mock_request_get): 'https://repo1.maven.org/maven2/xml-apis/xml-apis/1.0.b2/': '2005-09-20 05:53', 'https://repo1.maven.org/maven2/xml-apis/xml-apis/1.2.01/': '2010-02-03 21:05' } - self.assertEqual(expected, maven_collector.get_directory_links(url)) + self.assertEqual(expected, maven_visitor.get_directory_links(url)) @mock.patch('requests.get') def test_get_artifact_links(self, mock_request_get): @@ -1154,7 +1153,7 @@ def test_get_artifact_links(self, mock_request_get): expected = { 'https://repo1.maven.org/maven2/xml-apis/xml-apis/1.0.b2/xml-apis-1.0.b2.jar': '2005-09-20 05:53', } - self.assertEqual(expected, maven_collector.get_artifact_links(url)) + self.assertEqual(expected, maven_visitor.get_artifact_links(url)) def test_crawl_to_package(self): pass @@ -1167,14 +1166,14 @@ def test_get_artifact_sha1(self, mock_request_get): sha1 = '3136ca936f64c9d68529f048c2618bd356bf85c9' mock_request_get.return_value.ok = True mock_request_get.return_value.text = sha1 - self.assertEqual(sha1, maven_collector.get_artifact_sha1('https://repo1.maven.org/maven2/xml-apis/xml-apis/1.0.b2/xml-apis-1.0.b2.jar.sha1')) + self.assertEqual(sha1, maven_visitor.get_artifact_sha1('https://repo1.maven.org/maven2/xml-apis/xml-apis/1.0.b2/xml-apis-1.0.b2.jar.sha1')) def test_get_classifier_from_artifact_url(self): artifact_url = 'https://repo1.maven.org/maven2/net/alchim31/livereload-jvm/0.2.0/livereload-jvm-0.2.0-onejar.jar' package_version_page_url = 'https://repo1.maven.org/maven2/net/alchim31/livereload-jvm/0.2.0/' package_name = 'livereload-jvm' package_version = '0.2.0' - classifier = maven_collector.get_classifier_from_artifact_url( + classifier = maven_visitor.get_classifier_from_artifact_url( artifact_url, package_version_page_url, package_name, diff --git a/minecode/tests/test_npm.py b/minecode/tests/test_npm.py index eb36ef1c..745aabe2 100644 --- a/minecode/tests/test_npm.py +++ b/minecode/tests/test_npm.py @@ -22,7 +22,6 @@ import packagedb from minecode import mappers from minecode import route -from minecode.collectors import npm as npm_collector from minecode.models import ResourceURI from minecode.utils_test import JsonBasedTesting from minecode.utils_test import mocked_requests_get @@ -185,7 +184,7 @@ def setUp(self): ) def test_get_package_json(self, regen=False): - json_contents = npm_collector.get_package_json( + json_contents = npm.get_package_json( namespace=self.scan_package.namespace, name=self.scan_package.name, version=self.scan_package.version @@ -199,7 +198,7 @@ def test_map_npm_package(self): package_count = packagedb.models.Package.objects.all().count() self.assertEqual(0, package_count) package_url = PackageURL.from_string(self.scan_package.purl) - npm_collector.map_npm_package(package_url) + npm.map_npm_package(package_url) package_count = packagedb.models.Package.objects.all().count() self.assertEqual(1, package_count) package = packagedb.models.Package.objects.all().first() diff --git a/minecode/visitors/maven.py b/minecode/visitors/maven.py index 0be85a70..d640034f 100644 --- a/minecode/visitors/maven.py +++ b/minecode/visitors/maven.py @@ -8,14 +8,20 @@ # from collections import namedtuple +from typing import Dict +from urllib.parse import urlparse import gzip +import hashlib import io import json import logging +import os +import re -import arrow from bs4 import BeautifulSoup from dateutil import tz +import arrow +import requests from jawa.util.utf import decode_modified_utf8 import javaproperties @@ -23,13 +29,20 @@ from packageurl import PackageURL from packagedcode.maven import build_filename from packagedcode.maven import build_url +from packagedcode.maven import get_urls +from packagedcode.maven import get_maven_pom +from packagedcode.maven import _parse +from minecode import priority_router from minecode import seed from minecode import visit_router from minecode.visitors import java_stream from minecode.visitors import HttpVisitor from minecode.visitors import NonPersistentHttpVisitor from minecode.visitors import URI +from packagedb.models import PackageContentType +from packagedb.models import PackageRelation +from packagedb.models import make_relationship """ This module handles the Maven repositories such as central and other @@ -96,6 +109,678 @@ def get_seeds(self): # also has a npm mirrors: https://maven-eu.nuxeo.org/nexus/#view-repositories;npmjs~browsestorage +def get_pom_text(namespace, name, version, qualifiers={}, base_url=MAVEN_BASE_URL): + """ + Return the contents of the POM file of the package described by the purl + field arguments in a string. + """ + # Create URLs using purl fields + if qualifiers and not isinstance(qualifiers, Dict): + return + urls = get_urls( + namespace=namespace, + name=name, + version=version, + qualifiers=qualifiers, + base_url=base_url, + ) + # Get and parse POM info + pom_url = urls["api_data_url"] + # TODO: manage different types of errors (404, etc.) + response = requests.get(pom_url) + if not response: + return + return response.text + + +def fetch_parent(pom_text, base_url=MAVEN_BASE_URL): + """ + Return the parent pom text of `pom_text`, or None if `pom_text` has no parent. + """ + if not pom_text: + return + pom = get_maven_pom(text=pom_text) + if ( + pom.parent + and pom.parent.group_id + and pom.parent.artifact_id + and pom.parent.version.version + ): + parent_namespace = pom.parent.group_id + parent_name = pom.parent.artifact_id + parent_version = str(pom.parent.version.version) + parent_pom_text = get_pom_text( + namespace=parent_namespace, + name=parent_name, + version=parent_version, + qualifiers={}, + base_url=base_url, + ) + return parent_pom_text + + +def get_ancestry(pom_text, base_url=MAVEN_BASE_URL): + """ + Return a list of pom text of the ancestors of `pom`. The list is ordered + from oldest ancestor to newest. The list is empty is there is no parent pom. + """ + ancestors = [] + has_parent = True + while has_parent: + parent_pom_text = fetch_parent(pom_text=pom_text, base_url=base_url) + if not parent_pom_text: + has_parent = False + else: + ancestors.append(parent_pom_text) + pom_text = parent_pom_text + return reversed(ancestors) + + +def get_merged_ancestor_package_from_maven_package(package, base_url=MAVEN_BASE_URL): + """ + Merge package details of a package with its ancestor pom + and return the merged package. + """ + if not package: + return + pom_text = get_pom_text( + name=package.name, + namespace=package.namespace, + version=package.version, + qualifiers=package.qualifiers, + base_url=base_url, + ) + merged_package = merge_ancestors( + ancestor_pom_texts=get_ancestry(pom_text), + package=package, + ) + return merged_package + + +def merge_parent(package, parent_package): + """ + Merge `parent_package` data into `package` and return `package. + """ + mergeable_fields = ( + "declared_license_expression", + "homepage_url", + "parties", + ) + for field in mergeable_fields: + # If `field` is empty on the package we're looking at, populate + # those fields with values from the parent package. + if not getattr(package, field): + value = getattr(parent_package, field) + setattr(package, field, value) + + msg = f"Field `{field}` has been updated using values obtained from the parent POM {parent_package.purl}" + history = package.extra_data.get("history") + if history: + package.extra_data["history"].append(msg) + else: + package.extra_data["history"] = [msg] + + return package + + +def merge_ancestors(ancestor_pom_texts, package): + """ + Merge metadata from `ancestor_pom_text` into `package`. + + The order of POM content in `ancestor_pom_texts` is expected to be in the + order of oldest ancestor to newest. + """ + for ancestor_pom_text in ancestor_pom_texts: + ancestor_package = _parse( + datasource_id="maven_pom", + package_type="maven", + primary_language="Java", + text=ancestor_pom_text, + ) + package = merge_parent(package, ancestor_package) + return package + + +def map_maven_package(package_url, package_content): + """ + Add a maven `package_url` to the PackageDB. + + Return an error string if errors have occured in the process. + """ + from minecode.model_utils import add_package_to_scan_queue, merge_or_create_package + + db_package = None + error = "" + + if "repository_url" in package_url.qualifiers: + base_url = package_url.qualifiers["repository_url"] + else: + base_url = MAVEN_BASE_URL + + pom_text = get_pom_text( + namespace=package_url.namespace, + name=package_url.name, + version=package_url.version, + qualifiers=package_url.qualifiers, + base_url=base_url, + ) + if not pom_text: + msg = f"Package does not exist on maven: {package_url}" + error += msg + "\n" + logger.error(msg) + return db_package, error + + package = _parse( + "maven_pom", + "maven", + "Java", + text=pom_text, + base_url=base_url, + ) + ancestor_pom_texts = get_ancestry(pom_text=pom_text, base_url=base_url) + package = merge_ancestors(ancestor_pom_texts=ancestor_pom_texts, package=package) + + urls = get_urls( + namespace=package_url.namespace, + name=package_url.name, + version=package_url.version, + qualifiers=package_url.qualifiers, + base_url=base_url, + ) + # In the case of looking up a maven package with qualifiers of + # `classifiers=sources`, the purl of the package created from the pom does + # not have the qualifiers, so we need to set them. Additionally, the download + # url is not properly generated since it would be missing the sources bit + # from the filename. + package.qualifiers = package_url.qualifiers + package.download_url = urls["repository_download_url"] + package.repository_download_url = urls["repository_download_url"] + + # Set package_content value + package.extra_data["package_content"] = package_content + + # If sha1 exists for a jar, we know we can create the package + # Use pom info as base and create packages for binary and source package + + # Check to see if binary is available + sha1 = get_package_sha1(package) + if sha1: + package.sha1 = sha1 + db_package, _, _, _ = merge_or_create_package(package, visit_level=50) + else: + msg = f"Failed to retrieve JAR: {package_url}" + error += msg + "\n" + logger.error(msg) + + # Submit package for scanning + if db_package: + add_package_to_scan_queue(db_package) + + return db_package, error + + +def map_maven_binary_and_source(package_url): + """ + Get metadata for the binary and source release of the Maven package + `package_url` and save it to the PackageDB. + + Return an error string for errors that occur, or empty string if there is no error. + """ + error = "" + package, emsg = map_maven_package(package_url, PackageContentType.BINARY) + if emsg: + error += emsg + + source_package_url = package_url + source_package_url.qualifiers["classifier"] = "sources" + source_package, emsg = map_maven_package( + source_package_url, PackageContentType.SOURCE_ARCHIVE + ) + if emsg: + error += emsg + + if package and source_package: + make_relationship( + from_package=source_package, + to_package=package, + relationship=PackageRelation.Relationship.SOURCE_PACKAGE, + ) + + return error + + +def map_maven_packages(package_url): + """ + Given a valid `package_url` with no version, get metadata for the binary and + source release for each version of the Maven package `package_url` and save + it to the PackageDB. + + Return an error string for errors that occur, or empty string if there is no error. + """ + error = "" + namespace = package_url.namespace + name = package_url.name + # Find all versions of this package + query_params = f"g:{namespace}+AND+a:{name}" + url = f"https://search.maven.org/solrsearch/select?q={query_params}&core=gav" + response = requests.get(url) + if response: + package_listings = response.json().get("response", {}).get("docs", []) + for listing in package_listings: + purl = PackageURL( + type="maven", + namespace=listing.get("g"), + name=listing.get("a"), + version=listing.get("v"), + ) + emsg = map_maven_binary_and_source(purl) + if emsg: + error += emsg + return error + + +def get_package_sha1(package): + """ + Return the sha1 value for `package` by checking if the sha1 file exists for + `package` on maven and returning the contents if it does. + + If the sha1 is invalid, we download the package's JAR and calculate the sha1 + from that. + """ + download_url = package.repository_download_url + sha1_download_url = f"{download_url}.sha1" + response = requests.get(sha1_download_url) + if response.ok: + sha1_contents = response.text.strip().split() + sha1 = sha1_contents[0] + sha1 = validate_sha1(sha1) + if not sha1: + # Download JAR and calculate sha1 if we cannot get it from the repo + response = requests.get(download_url) + if response: + sha1_hash = hashlib.new("sha1", response.content) + sha1 = sha1_hash.hexdigest() + return sha1 + + +@priority_router.route("pkg:maven/.*") +def process_request(purl_str): + """ + Process `priority_resource_uri` containing a maven Package URL (PURL) as a + URI. + + This involves obtaining Package information for the PURL from maven and + using it to create a new PackageDB entry. The package is then added to the + scan queue afterwards. We also get the Package information for the + accompanying source package and add it to the PackageDB and scan queue, if + available. + + Return an error string for errors that occur, or empty string if there is no error. + """ + try: + package_url = PackageURL.from_string(purl_str) + except ValueError as e: + error = f"error occured when parsing {purl_str}: {e}" + return error + + has_version = bool(package_url.version) + if has_version: + error = map_maven_binary_and_source(package_url) + else: + error = map_maven_packages(package_url) + + return error + + +collect_links = re.compile(r'href="([^"]+)"').findall +collect_links_and_artifact_timestamps = re.compile( + r'\s+(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}|-)' +).findall + + +def check_if_file_name_is_linked_on_page(file_name, links, **kwargs): + """ + Return True if `file_name` is in `links` + """ + return any(l.endswith(file_name) for l in links) + + +def check_if_page_has_pom_files(links, **kwargs): + """ + Return True of any entry in `links` ends with .pom. + """ + return any(l.endswith(".pom") for l in links) + + +def check_if_page_has_directories(links, **kwargs): + """ + Return True if any entry, excluding "../", ends with /. + """ + return any(l.endswith("/") for l in links if l != "../") + + +def check_if_package_version_page(links, **kwargs): + """ + Return True if `links` contains pom files and has no directories + """ + return check_if_page_has_pom_files( + links=links + ) and not check_if_page_has_directories(links=links) + + +def check_if_package_page(links, **kwargs): + return check_if_file_name_is_linked_on_page( + file_name="maven-metadata.xml", links=links + ) and not check_if_page_has_pom_files(links=links) + + +def check_if_maven_root(links, **kwargs): + """ + Return True if "archetype-catalog.xml" is in `links`, as the root of a Maven + repo contains "archetype-catalog.xml". + """ + return check_if_file_name_is_linked_on_page( + file_name="archetype-catalog.xml", links=links + ) + + +def check_on_page(url, checker): + """ + Return True if there is a link on `url` that is the same as `file_name`, + False otherwise. + """ + response = requests.get(url) + if response: + links = collect_links(response.text) + return checker(links=links) + return False + + +def is_maven_root(url): + """ + Return True if `url` is the root of a Maven repo, False otherwise. + """ + return check_on_page(url, check_if_maven_root) + + +def is_package_page(url): + """ + Return True if `url` is a package page on a Maven repo, False otherwise. + """ + return check_on_page(url, check_if_package_page) + + +def is_package_version_page(url): + """ + Return True if `url` is a package version page on a Maven repo, False otherwise. + """ + return check_on_page(url, check_if_package_version_page) + + +def url_parts(url): + parsed_url = urlparse(url) + scheme = parsed_url.scheme + netloc = parsed_url.netloc + path_segments = [p for p in parsed_url.path.split("/") if p] + return scheme, netloc, path_segments + + +def create_url(scheme, netloc, path_segments): + url_template = f"{scheme}://{netloc}" + path = "/".join(path_segments) + return f"{url_template}/{path}" + + +def get_maven_root(url): + """ + Given `url`, that is a URL to namespace, package, or artifact in a Maven + repo, return the URL to the root of that repo. If a Maven root cannot be + determined, return None. + + >>> get_maven_root('https://repo1.maven.org/maven2/net/shibboleth/parent/7.11.0/') + 'https://repo1.maven.org/maven2' + """ + scheme, netloc, path_segments = url_parts(url) + for i in range(len(path_segments)): + segments = path_segments[: i + 1] + url_segment = create_url(scheme, netloc, segments) + if is_maven_root(url_segment): + return url_segment + return None + + +def determine_namespace_name_version_from_url(url, root_url=None): + """ + Return a 3-tuple containing strings of a Package namespace, name, and + version, determined from `url`, where `url` points to namespace, package, + specific package version, or artifact on a Maven repo. + + Return None if a Maven root cannot be determined from `url`. + + >>> determine_namespace_name_version_from_url('https://repo1.maven.org/maven2/net/shibboleth/parent/7.11.0/') + ('net.shibboleth', 'parent', '7.11.0') + """ + if not root_url: + root_url = get_maven_root(url) + if not root_url: + raise Exception(f"Error: not a Maven repository: {url}") + + _, remaining_path_segments = url.split(root_url) + remaining_path_segments = remaining_path_segments.split("/") + remaining_path_segments = [p for p in remaining_path_segments if p] + + namespace_segments = [] + package_name = "" + package_version = "" + for i in range(len(remaining_path_segments)): + segment = remaining_path_segments[i] + segments = remaining_path_segments[: i + 1] + path = "/".join(segments) + url_segment = f"{root_url}/{path}" + if is_package_page(url_segment): + package_name = segment + elif is_package_version_page(url_segment): + package_version = segment + else: + namespace_segments.append(segment) + namespace = ".".join(namespace_segments) + return namespace, package_name, package_version + + +def add_to_import_queue(url, root_url): + """ + Create ImportableURI for the Maven repo package page at `url`. + """ + from minecode.models import ImportableURI + + data = None + response = requests.get(url) + if response: + data = response.text + namespace, name, _ = determine_namespace_name_version_from_url(url, root_url) + purl = PackageURL( + type="maven", + namespace=namespace, + name=name, + ) + importable_uri = ImportableURI.objects.insert(url, data, purl) + if importable_uri: + logger.info(f"Inserted {url} into ImportableURI queue") + + +def filter_only_directories(timestamps_by_links): + """ + Given a mapping of `timestamps_by_links`, where the links are directory names (which end with `/`), + """ + timestamps_by_links_filtered = {} + for link, timestamp in timestamps_by_links.items(): + if link != "../" and link.endswith("/"): + timestamps_by_links_filtered[link] = timestamp + return timestamps_by_links_filtered + + +valid_artifact_extensions = [ + "ejb3", + "ear", + "aar", + "apk", + "gem", + "jar", + "nar", + # 'pom', + "so", + "swc", + "tar", + "tar.gz", + "war", + "xar", + "zip", +] + + +def filter_for_artifacts(timestamps_by_links): + """ + Given a mapping of `timestamps_by_links`, where the links are the filenames + of Maven artifacts, return a mapping of filenames whose extension is in + `valid_artifact_extensions` and their timestamps. + """ + timestamps_by_links_filtered = {} + for link, timestamp in timestamps_by_links.items(): + for ext in valid_artifact_extensions: + if link.endswith(ext): + timestamps_by_links_filtered[link] = timestamp + return timestamps_by_links_filtered + + +def collect_links_from_text(text, filter): + """ + Return a mapping of link locations and their timestamps, given HTML `text` + content, that is filtered using `filter`. + """ + links_and_timestamps = collect_links_and_artifact_timestamps(text) + timestamps_by_links = {} + for link, timestamp in links_and_timestamps: + if timestamp == "-": + timestamp = "" + timestamps_by_links[link] = timestamp + + timestamps_by_links = filter(timestamps_by_links=timestamps_by_links) + return timestamps_by_links + + +def create_absolute_urls_for_links(text, url, filter): + """ + Given the `text` contents from `url`, return a mapping of absolute URLs to + links from `url` and their timestamps, that is then filtered by `filter`. + """ + timestamps_by_absolute_links = {} + url = url.rstrip("/") + timestamps_by_links = collect_links_from_text(text, filter) + for link, timestamp in timestamps_by_links.items(): + if not link.startswith(url): + link = f"{url}/{link}" + timestamps_by_absolute_links[link] = timestamp + return timestamps_by_absolute_links + + +def get_directory_links(url): + """ + Return a list of absolute directory URLs of the hyperlinks from `url` + """ + timestamps_by_directory_links = {} + response = requests.get(url) + if response: + timestamps_by_directory_links = create_absolute_urls_for_links( + response.text, url=url, filter=filter_only_directories + ) + return timestamps_by_directory_links + + +def get_artifact_links(url): + """ + Return a list of absolute directory URLs of the hyperlinks from `url` + """ + timestamps_by_artifact_links = [] + response = requests.get(url) + if response: + timestamps_by_artifact_links = create_absolute_urls_for_links( + response.text, url=url, filter=filter_for_artifacts + ) + return timestamps_by_artifact_links + + +def crawl_to_package(url, root_url): + """ + Given a maven repo `url`, + """ + if is_package_page(url): + add_to_import_queue(url, root_url) + return + + for link in get_directory_links(url): + crawl_to_package(link, root_url) + + +def crawl_maven_repo_from_root(root_url): + """ + Given the `url` to a maven root, traverse the repo depth-first and add + packages to the import queue. + """ + crawl_to_package(root_url, root_url) + + +def validate_sha1(sha1): + """ + Validate a `sha1` string. + + Return `sha1` if it is valid, None otherwise. + """ + if sha1 and len(sha1) != 40: + logger.warning(f'Invalid SHA1 length ({len(sha1)}): "{sha1}": SHA1 ignored!') + sha1 = None + return sha1 + + +def get_artifact_sha1(artifact_url): + """ + Return the SHA1 value of the Maven artifact located at `artifact_url`. + """ + sha1 = None + artifact_sha1_url = f"{artifact_url}.sha1" + response = requests.get(artifact_sha1_url) + if response: + sha1_contents = response.text.strip().split() + sha1 = sha1_contents[0] + sha1 = validate_sha1(sha1) + return sha1 + + +def get_classifier_from_artifact_url( + artifact_url, package_version_page_url, package_name, package_version +): + """ + Return the classifier from a Maven artifact URL `artifact_url`, otherwise + return None if a classifier cannot be determined from `artifact_url` + """ + classifier = None + # https://repo1.maven.org/maven2/net/alchim31/livereload-jvm/0.2.0 + package_version_page_url = package_version_page_url.rstrip("/") + # https://repo1.maven.org/maven2/net/alchim31/livereload-jvm/0.2.0/livereload-jvm-0.2.0 + leading_url_portion = f"{package_version_page_url}/{package_name}-{package_version}" + # artifact_url = 'https://repo1.maven.org/maven2/net/alchim31/livereload-jvm/0.2.0/livereload-jvm-0.2.0-onejar.jar' + # ['', '-onejar.jar'] + _, remaining_url_portion = artifact_url.split(leading_url_portion) + # ['-onejar', 'jar'] + remaining_url_portions = remaining_url_portion.split(".") + if remaining_url_portions and remaining_url_portions[0]: + # '-onejar' + classifier = remaining_url_portions[0] + if classifier.startswith("-"): + # 'onejar' + classifier = classifier[1:] + return classifier + + @visit_router.route('http://repo1\.maven\.org/maven2/\.index/nexus-maven-repository-index.properties') @visit_router.route('https://repo1\.maven\.org/maven2/\.index/nexus-maven-repository-index.properties') class MavenNexusPropertiesVisitor(NonPersistentHttpVisitor): diff --git a/minecode/visitors/npm.py b/minecode/visitors/npm.py index afcc342b..55f51e05 100644 --- a/minecode/visitors/npm.py +++ b/minecode/visitors/npm.py @@ -7,19 +7,22 @@ # See https://aboutcode.org for more information about nexB OSS projects. # - import logging import json -from packageurl import PackageURL - from packagedcode.npm import npm_api_url from packagedcode.npm import split_scoped_package_name +from packagedcode.npm import NpmPackageJsonHandler +from packagedcode.npm import npm_api_url +from packageurl import PackageURL +import requests from minecode import seed +from minecode import priority_router from minecode import visit_router from minecode.visitors import NonPersistentHttpVisitor from minecode.visitors import URI +from packagedb.models import PackageContentType """ @@ -103,3 +106,74 @@ def get_uris(self, content): data=json.dumps(doc, separators=(',', ':'), ensure_ascii=False), # note: visited is True since there nothing more to visit visited=True) + + +def get_package_json(namespace, name, version): + """ + Return the contents of the package.json file of the package described by the purl + field arguments in a string. + """ + # Create URLs using purl fields + url = npm_api_url( + namespace=namespace, + name=name, + version=version, + ) + + try: + response = requests.get(url) + response.raise_for_status() + return response.json() + except requests.exceptions.HTTPError as err: + logger.error(f"HTTP error occurred: {err}") + + +def map_npm_package(package_url): + """ + Add a npm `package_url` to the PackageDB. + + Return an error string if any errors are encountered during the process + """ + from minecode.model_utils import add_package_to_scan_queue, merge_or_create_package + + package_json = get_package_json( + namespace=package_url.namespace, + name=package_url.name, + version=package_url.version, + ) + + if not package_json: + error = f"Package does not exist on npmjs: {package_url}" + logger.error(error) + return error + + package = NpmPackageJsonHandler._parse(json_data=package_json) + package.extra_data["package_content"] = PackageContentType.SOURCE_ARCHIVE + + db_package, _, _, error = merge_or_create_package(package, visit_level=0) + + # Submit package for scanning + if db_package: + add_package_to_scan_queue(db_package) + + return error + + +@priority_router.route("pkg:npm/.*") +def process_request(purl_str): + """ + Process `priority_resource_uri` containing a npm Package URL (PURL) as a + URI. + + This involves obtaining Package information for the PURL from npm and + using it to create a new PackageDB entry. The package is then added to the + scan queue afterwards. + """ + package_url = PackageURL.from_string(purl_str) + if not package_url.version: + return + + error_msg = map_npm_package(package_url) + + if error_msg: + return error_msg diff --git a/packagedb/find_source_repo.py b/packagedb/find_source_repo.py index 2ec7fe18..e16de545 100644 --- a/packagedb/find_source_repo.py +++ b/packagedb/find_source_repo.py @@ -21,7 +21,7 @@ from scancode.api import get_urls as get_urls_from_location from minecode.model_utils import add_package_to_scan_queue -from minecode.collectors.maven import get_merged_ancestor_package_from_maven_package +from minecode.visitors.maven import get_merged_ancestor_package_from_maven_package from packagedb.models import Package from packagedb.models import PackageContentType from packagedb.models import PackageSet diff --git a/packagedb/tests/testfiles/api/resource-filter_by_checksums-expected.json b/packagedb/tests/testfiles/api/resource-filter_by_checksums-expected.json index 325beb1c..4582b369 100644 --- a/packagedb/tests/testfiles/api/resource-filter_by_checksums-expected.json +++ b/packagedb/tests/testfiles/api/resource-filter_by_checksums-expected.json @@ -11,9 +11,9 @@ "sha256":"testsha2561", "sha512":"testsha5121", "git_sha1":"testgit_sha11", - "mime_type":"", - "file_type":"", - "programming_language":"", + "mime_type":null, + "file_type":null, + "programming_language":null, "is_binary":false, "is_text":false, "is_archive":false, @@ -44,9 +44,9 @@ "sha256":"testsha2562", "sha512":"testsha5122", "git_sha1":"testgit_sha12", - "mime_type":"", - "file_type":"", - "programming_language":"", + "mime_type":null, + "file_type":null, + "programming_language":null, "is_binary":false, "is_text":false, "is_archive":false, From e3f46fd54531c5f20ca7d563e97ee76c9f5ae322 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Mon, 18 Mar 2024 18:13:13 -0700 Subject: [PATCH 26/31] Add tests to ensure proper API permissions for scan_queue #49 Signed-off-by: Jono Yang --- minecode/tests/test_api.py | 103 +++++++++++++++++++++++++++---------- 1 file changed, 76 insertions(+), 27 deletions(-) diff --git a/minecode/tests/test_api.py b/minecode/tests/test_api.py index f492191b..ea9aec36 100644 --- a/minecode/tests/test_api.py +++ b/minecode/tests/test_api.py @@ -24,16 +24,39 @@ class ScannableURIAPITestCase(JsonBasedTesting, TestCase): test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') def setUp(self): - self.user = User.objects.create_user( + self.scan_queue_worker_user = User.objects.create_user( username="username", email="e@mail.com", password="secret" ) scan_queue_workers_group, _ = Group.objects.get_or_create(name='scan_queue_workers') - scan_queue_workers_group.user_set.add(self.user) - self.auth = f"Token {self.user.auth_token.key}" - self.csrf_client = APIClient(enforce_csrf_checks=True) - self.csrf_client.credentials(HTTP_AUTHORIZATION=self.auth) + scan_queue_workers_group.user_set.add(self.scan_queue_worker_user) + self.scan_queue_worker_auth = f"Token {self.scan_queue_worker_user.auth_token.key}" + self.scan_queue_worker_client = APIClient(enforce_csrf_checks=True) + self.scan_queue_worker_client.credentials(HTTP_AUTHORIZATION=self.scan_queue_worker_auth) + + # create a staff user + self.staff_user = User.objects.create_user( + username="staff_username", + email="staff_e@mail.com", + password="secret", + is_staff=True + ) + self.staff_auth = f"Token {self.staff_user.auth_token.key}" + self.staff_client = APIClient(enforce_csrf_checks=True) + self.staff_client.credentials(HTTP_AUTHORIZATION=self.staff_auth) + + # create a regular user + self.regular_user = User.objects.create_user( + username="regular_username", + email="regular_e@mail.com", + password="secret", + ) + self.regular_auth = f"Token {self.regular_user.auth_token.key}" + self.regular_client = APIClient(enforce_csrf_checks=True) + self.regular_client.credentials(HTTP_AUTHORIZATION=self.regular_auth) + + self.anonymous_client = APIClient() self.package1 = Package.objects.create( download_url='https://test-url.com/package1.tar.gz', @@ -68,52 +91,68 @@ def setUp(self): package=self.package3 ) - self.client = APIClient() + def test_api_scannable_uri_permissions(self): + response = self.anonymous_client.get('/api/scan_queue/') + self.assertEqual(response.status_code, status.HTTP_401_UNAUTHORIZED) - def test_api_scannable_uri_list_endpoint(self): - response = self.client.get('/api/scan_queue/') + response = self.anonymous_client.get('/api/scan_queue/get_next_download_url/') + self.assertEqual(response.status_code, status.HTTP_401_UNAUTHORIZED) + + response = self.anonymous_client.post('/api/scan_queue/update_status/') self.assertEqual(response.status_code, status.HTTP_401_UNAUTHORIZED) - response = self.csrf_client.get('/api/scan_queue/') + response = self.regular_client.get('/api/scan_queue/') + self.assertEqual(response.status_code, status.HTTP_403_FORBIDDEN) + + response = self.regular_client.get('/api/scan_queue/get_next_download_url/') + self.assertEqual(response.status_code, status.HTTP_403_FORBIDDEN) + + response = self.regular_client.post('/api/scan_queue/update_status/') + self.assertEqual(response.status_code, status.HTTP_403_FORBIDDEN) + + def test_api_scannable_uri_list_endpoint(self): + response = self.scan_queue_worker_client.get('/api/scan_queue/') self.assertEqual(response.status_code, status.HTTP_200_OK) self.assertEqual(3, response.data.get('count')) - def test_api_scannable_uri_get_next_download_url(self): - response = self.client.get('/api/scan_queue/get_next_download_url/') - self.assertEqual(response.status_code, status.HTTP_401_UNAUTHORIZED) + response = self.staff_client.get('/api/scan_queue/') + self.assertEqual(response.status_code, status.HTTP_200_OK) + self.assertEqual(3, response.data.get('count')) - response = self.csrf_client.get('/api/scan_queue/get_next_download_url/') + def test_api_scannable_uri_get_next_download_url(self): + response = self.scan_queue_worker_client.get('/api/scan_queue/get_next_download_url/') self.assertEqual(response.status_code, status.HTTP_200_OK) self.assertEqual(response.data.get('scannable_uri_uuid'), self.scannable_uri1.uuid) self.assertEqual(response.data.get('download_url'), self.scannable_uri1.uri) - response = self.csrf_client.get('/api/scan_queue/get_next_download_url/') + response = self.scan_queue_worker_client.get('/api/scan_queue/get_next_download_url/') self.assertEqual(response.status_code, status.HTTP_200_OK) self.assertEqual(response.data.get('scannable_uri_uuid'), self.scannable_uri2.uuid) self.assertEqual(response.data.get('download_url'), self.scannable_uri2.uri) - response = self.csrf_client.get('/api/scan_queue/get_next_download_url/') + response = self.scan_queue_worker_client.get('/api/scan_queue/get_next_download_url/') self.assertEqual(response.status_code, status.HTTP_200_OK) self.assertEqual(response.data.get('scannable_uri_uuid'), self.scannable_uri3.uuid) self.assertEqual(response.data.get('download_url'), self.scannable_uri3.uri) - response = self.csrf_client.get('/api/scan_queue/get_next_download_url/') + response = self.scan_queue_worker_client.get('/api/scan_queue/get_next_download_url/') + self.assertEqual(response.status_code, status.HTTP_200_OK) + self.assertEqual(response.data.get('scannable_uri_uuid'), '') + self.assertEqual(response.data.get('download_url'), '') + + response = self.staff_client.get('/api/scan_queue/get_next_download_url/') self.assertEqual(response.status_code, status.HTTP_200_OK) self.assertEqual(response.data.get('scannable_uri_uuid'), '') self.assertEqual(response.data.get('download_url'), '') def test_api_scannable_uri_update_status(self): self.assertEqual(ScannableURI.SCAN_NEW, self.scannable_uri1.scan_status) - - response = self.client.post('/api/scan_queue/update_status/') - self.assertEqual(response.status_code, status.HTTP_401_UNAUTHORIZED) - data = { "scannable_uri_uuid": self.scannable_uri1.uuid, "scan_status": 'failed', 'scan_log': 'scan_log', } - response = self.csrf_client.post('/api/scan_queue/update_status/', data=data) + response = self.scan_queue_worker_client.post('/api/scan_queue/update_status/', data=data) self.assertEqual(response.status_code, status.HTTP_200_OK) self.scannable_uri1.refresh_from_db() self.assertEqual(ScannableURI.SCAN_FAILED, self.scannable_uri1.scan_status) @@ -145,7 +184,7 @@ def test_api_scannable_uri_update_status(self): 'scan_results_file': scan_file, 'scan_summary_file': summary_file, } - response = self.csrf_client.post('/api/scan_queue/update_status/', data=data) + response = self.scan_queue_worker_client.post('/api/scan_queue/update_status/', data=data) self.assertEqual(response.status_code, status.HTTP_200_OK) self.scannable_uri2.refresh_from_db() self.assertEqual(ScannableURI.SCAN_INDEXED, self.scannable_uri2.scan_status) @@ -161,7 +200,7 @@ def test_api_scannable_uri_update_status(self): self.assertEqual(64, Resource.objects.all().count()) data = {} - response = self.csrf_client.post('/api/scan_queue/update_status/', data=data) + response = self.scan_queue_worker_client.post('/api/scan_queue/update_status/', data=data) expected_response = {'error': 'missing scannable_uri_uuid'} self.assertEqual(expected_response, response.data) @@ -169,7 +208,7 @@ def test_api_scannable_uri_update_status(self): 'scannable_uri_uuid': self.scannable_uri2.uuid, 'scan_status': '' } - response = self.csrf_client.post('/api/scan_queue/update_status/', data=data) + response = self.scan_queue_worker_client.post('/api/scan_queue/update_status/', data=data) expected_response = {'error': 'missing scan_status'} self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) self.assertEqual(expected_response, response.data) @@ -178,7 +217,7 @@ def test_api_scannable_uri_update_status(self): 'scannable_uri_uuid': self.scannable_uri2.uuid, 'scan_status': 'invalid' } - response = self.csrf_client.post('/api/scan_queue/update_status/', data=data) + response = self.scan_queue_worker_client.post('/api/scan_queue/update_status/', data=data) expected_response = {'error': 'invalid scan_status: invalid'} self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) self.assertEqual(expected_response, response.data) @@ -187,7 +226,17 @@ def test_api_scannable_uri_update_status(self): 'scannable_uri_uuid': 'asdf', 'scan_status': 'scanned' } - response = self.csrf_client.post('/api/scan_queue/update_status/', data=data) + response = self.scan_queue_worker_client.post('/api/scan_queue/update_status/', data=data) + expected_response = {'error': 'invalid scannable_uri_uuid: asdf'} + self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) + self.assertEqual(expected_response, response.data) + + # Test that staff user can use endpoint + data = { + 'scannable_uri_uuid': 'asdf', + 'scan_status': 'scanned' + } + response = self.staff_client.post('/api/scan_queue/update_status/', data=data) expected_response = {'error': 'invalid scannable_uri_uuid: asdf'} self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) self.assertEqual(expected_response, response.data) @@ -206,7 +255,7 @@ def test_api_scannable_uri_update_status_update_finished_scannable_uri(self): 'scannable_uri_uuid': scannable_uri_uuid, 'scan_status': 'scanned' } - response = self.csrf_client.post('/api/scan_queue/update_status/', data=data) + response = self.scan_queue_worker_client.post('/api/scan_queue/update_status/', data=data) expected_response = { 'error': 'cannot update status for scannable_uri ' f'{self.scannable_uri3.uuid}: scannable_uri has finished ' From 093d94b4c7607f347f27f8f23e8a8f303a912574 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Mon, 18 Mar 2024 18:33:17 -0700 Subject: [PATCH 27/31] Remove previous scan queue Docker services #49 #285 Signed-off-by: Jono Yang --- Makefile | 6 ------ docker-compose_purldb.yml | 26 -------------------------- docker-compose_purldb_public.yml | 26 -------------------------- 3 files changed, 58 deletions(-) diff --git a/Makefile b/Makefile index 6ebc900c..e97c0294 100644 --- a/Makefile +++ b/Makefile @@ -122,12 +122,6 @@ run_visit: seed run_map: ${MANAGE} run_map -request_scans: - ${MANAGE} request_scans - -process_scans: - ${MANAGE} process_scans - test: @echo "-> Run the test suite" ${ACTIVATE} DJANGO_SETTINGS_MODULE=purldb_project.settings ${PYTHON_EXE} -m pytest -vvs --ignore matchcode_pipeline --ignore matchcode_project --ignore purldb-toolkit --ignore packagedb/tests/test_throttling.py diff --git a/docker-compose_purldb.yml b/docker-compose_purldb.yml index bb1eb3be..261fa04e 100644 --- a/docker-compose_purldb.yml +++ b/docker-compose_purldb.yml @@ -81,32 +81,6 @@ services: - db - web # Ensure that potential db migrations run first - request_scan: - build: . - command: wait-for-it web:8000 -- python manage_purldb.py request_scans - env_file: - - docker_purldb.env - volumes: - - /etc/purldb/:/etc/purldb/ - profiles: - - scan_queue - depends_on: - - db - - web - - process_scan: - build: . - command: wait-for-it web:8000 -- python manage_purldb.py process_scans - env_file: - - docker_purldb.env - volumes: - - /etc/purldb/:/etc/purldb/ - profiles: - - scan_queue - depends_on: - - db - - web - priority_queue: build: . command: wait-for-it web:8000 -- python manage_purldb.py priority_queue diff --git a/docker-compose_purldb_public.yml b/docker-compose_purldb_public.yml index 4ffbf261..242243d8 100644 --- a/docker-compose_purldb_public.yml +++ b/docker-compose_purldb_public.yml @@ -74,32 +74,6 @@ services: - db - web # Ensure that potential db migrations run first - request_scan: - build: . - command: wait-for-it web:8000 -- python manage_purldb_public.py request_scans - env_file: - - docker_purldb.env - volumes: - - /etc/purldb/:/etc/purldb/ - profiles: - - scan_queue - depends_on: - - db - - web - - process_scan: - build: . - command: wait-for-it web:8000 -- python manage_purldb_public.py process_scans - env_file: - - docker_purldb.env - volumes: - - /etc/purldb/:/etc/purldb/ - profiles: - - scan_queue - depends_on: - - db - - web - priority_queue: build: . command: wait-for-it web:8000 -- python manage_purldb_public.py priority_queue From 6fbd43b50d521299531e00af362f8f82be2a58c6 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Mon, 18 Mar 2024 18:40:24 -0700 Subject: [PATCH 28/31] Fix quotes #49 #285 Signed-off-by: Jono Yang --- minecode/api.py | 8 ++--- .../commands/create-scan-queue-worker-user.py | 35 ++++++------------- minecode/management/commands/create-user.py | 33 ++++++----------- minecode/visitors/npm.py | 13 ++++--- 4 files changed, 33 insertions(+), 56 deletions(-) diff --git a/minecode/api.py b/minecode/api.py index d287cc5a..8d5aa7a7 100644 --- a/minecode/api.py +++ b/minecode/api.py @@ -101,7 +101,7 @@ class ScannableURIViewSet(viewsets.ModelViewSet): serializer_class = ScannableURISerializer permission_classes = [IsScanQueueWorkerAPIUser|IsAdminUser] - @action(detail=False, methods=["get"]) + @action(detail=False, methods=['get']) def get_next_download_url(self, request, *args, **kwargs): """ Return download url for next Package on scan queue @@ -119,13 +119,13 @@ def get_next_download_url(self, request, *args, **kwargs): scannable_uri.save() else: response = { - 'scannable_uri_uuid': "", - 'download_url': "", + 'scannable_uri_uuid': '', + 'download_url': '', 'pipelines': [], } return Response(response) - @action(detail=False, methods=["post"]) + @action(detail=False, methods=['post']) def update_status(self, request, *args, **kwargs): """ Update the status of a ScannableURI with UUID of `scannable_uri_uuid` diff --git a/minecode/management/commands/create-scan-queue-worker-user.py b/minecode/management/commands/create-scan-queue-worker-user.py index e7409730..76e7b928 100644 --- a/minecode/management/commands/create-scan-queue-worker-user.py +++ b/minecode/management/commands/create-scan-queue-worker-user.py @@ -1,36 +1,23 @@ -# SPDX-License-Identifier: Apache-2.0 -# -# http://nexb.com and https://github.com/nexB/scancode.io -# The ScanCode.io software is licensed under the Apache License version 2.0. -# Data generated with ScanCode.io is provided as-is without warranties. -# ScanCode is a trademark of nexB Inc. # -# You may not use this software except in compliance with the License. -# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software distributed -# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -# CONDITIONS OF ANY KIND, either express or implied. See the License for the -# specific language governing permissions and limitations under the License. -# -# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES -# OR CONDITIONS OF ANY KIND, either express or implied. No content created from -# ScanCode.io should be considered or used as legal advice. Consult an Attorney -# for any legal advice. +# Copyright (c) nexB Inc. and others. All rights reserved. +# purldb is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/nexB/purldb for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. # -# ScanCode.io is a free software code scanning tool from nexB Inc. and others. -# Visit https://github.com/nexB/scancode.io for support and download. from django.contrib.auth.models import Group from minecode.management.user_creation import CreateUserCommand class Command(CreateUserCommand): - help = "Create a user and generate an API key for a scan queue worker" + help = 'Create a user and generate an API key for a scan queue worker' def handle(self, *args, **options): - username = options["username"] - interactive = options["interactive"] - verbosity = options["verbosity"] + username = options['username'] + interactive = options['interactive'] + verbosity = options['verbosity'] user = self.create_user( username=username, interactive=interactive, @@ -39,5 +26,5 @@ def handle(self, *args, **options): # Add user to `scan_queue_workers` group scan_queue_workers_group, _ = Group.objects.get_or_create(name='scan_queue_workers') scan_queue_workers_group.user_set.add(user) - msg = f"User {username} added to `scan_queue_workers` group" + msg = f'User {username} added to `scan_queue_workers` group' self.stdout.write(msg, self.style.SUCCESS) diff --git a/minecode/management/commands/create-user.py b/minecode/management/commands/create-user.py index 794b9b65..64b8b238 100644 --- a/minecode/management/commands/create-user.py +++ b/minecode/management/commands/create-user.py @@ -1,35 +1,22 @@ -# SPDX-License-Identifier: Apache-2.0 -# -# http://nexb.com and https://github.com/nexB/scancode.io -# The ScanCode.io software is licensed under the Apache License version 2.0. -# Data generated with ScanCode.io is provided as-is without warranties. -# ScanCode is a trademark of nexB Inc. # -# You may not use this software except in compliance with the License. -# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software distributed -# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -# CONDITIONS OF ANY KIND, either express or implied. See the License for the -# specific language governing permissions and limitations under the License. -# -# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES -# OR CONDITIONS OF ANY KIND, either express or implied. No content created from -# ScanCode.io should be considered or used as legal advice. Consult an Attorney -# for any legal advice. +# Copyright (c) nexB Inc. and others. All rights reserved. +# purldb is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/nexB/purldb for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. # -# ScanCode.io is a free software code scanning tool from nexB Inc. and others. -# Visit https://github.com/nexB/scancode.io for support and download. from minecode.management.user_creation import CreateUserCommand class Command(CreateUserCommand): - help = "Create a user and generate an API key for a scan queue worker" + help = 'Create a user and generate an API key for a scan queue worker' def handle(self, *args, **options): - username = options["username"] - interactive = options["interactive"] - verbosity = options["verbosity"] + username = options['username'] + interactive = options['interactive'] + verbosity = options['verbosity'] self.create_user( username=username, interactive=interactive, diff --git a/minecode/visitors/npm.py b/minecode/visitors/npm.py index 55f51e05..e45fe7e8 100644 --- a/minecode/visitors/npm.py +++ b/minecode/visitors/npm.py @@ -134,7 +134,8 @@ def map_npm_package(package_url): Return an error string if any errors are encountered during the process """ - from minecode.model_utils import add_package_to_scan_queue, merge_or_create_package + from minecode.model_utils import add_package_to_scan_queue + from minecode.model_utils import merge_or_create_package package_json = get_package_json( namespace=package_url.namespace, @@ -143,12 +144,14 @@ def map_npm_package(package_url): ) if not package_json: - error = f"Package does not exist on npmjs: {package_url}" + error = f'Package does not exist on npmjs: {package_url}' logger.error(error) return error - package = NpmPackageJsonHandler._parse(json_data=package_json) - package.extra_data["package_content"] = PackageContentType.SOURCE_ARCHIVE + package = NpmPackageJsonHandler._parse( + json_data=package_json + ) + package.extra_data['package_content'] = PackageContentType.SOURCE_ARCHIVE db_package, _, _, error = merge_or_create_package(package, visit_level=0) @@ -159,7 +162,7 @@ def map_npm_package(package_url): return error -@priority_router.route("pkg:npm/.*") +@priority_router.route('pkg:npm/.*') def process_request(purl_str): """ Process `priority_resource_uri` containing a npm Package URL (PURL) as a From 8dcc1a3f0321c6e03c5708699a4b1527733576b7 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Mon, 18 Mar 2024 18:57:56 -0700 Subject: [PATCH 29/31] Update default pipelines to be run for packages #49 #285 Signed-off-by: Jono Yang --- minecode/management/commands/priority_queue.py | 14 -------------- minecode/model_utils.py | 3 ++- 2 files changed, 2 insertions(+), 15 deletions(-) diff --git a/minecode/management/commands/priority_queue.py b/minecode/management/commands/priority_queue.py index 965a17b8..ca7d9382 100644 --- a/minecode/management/commands/priority_queue.py +++ b/minecode/management/commands/priority_queue.py @@ -105,20 +105,6 @@ def handle(self, *args, **options): return processed_counter -def add_package_to_scan_queue(package): - """ - Add a Package `package` to the scan queue - """ - uri = package.download_url - _, scannable_uri_created = ScannableURI.objects.get_or_create( - uri=uri, - package=package, - pipelines=['scan_and_fingerprint_package'], - ) - if scannable_uri_created: - logger.debug(' + Inserted ScannableURI\t: {}'.format(uri)) - - def process_request(priority_resource_uri, _priority_router=priority_router): purl_to_visit = priority_resource_uri.uri source_purl = priority_resource_uri.source_uri diff --git a/minecode/model_utils.py b/minecode/model_utils.py index caa542d2..f0c78b77 100644 --- a/minecode/model_utils.py +++ b/minecode/model_utils.py @@ -27,7 +27,8 @@ # These are the list of default pipelines to run when we scan a Package for # indexing DEFAULT_PIPELINES = ( - 'scan_and_fingerprint_package', + 'scan_single_package', + 'fingerprint_codebase', ) From abb6439953738bbaf963ee44f3985389ea9adaf7 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Mon, 18 Mar 2024 19:05:30 -0700 Subject: [PATCH 30/31] Use reindex_uri attribute when calling index_package #49 #285 * Update quotes Signed-off-by: Jono Yang --- minecode/api.py | 9 +- minecode/management/user_creation.py | 59 ++++------ minecode/visitors/maven.py | 160 +++++++++++++-------------- 3 files changed, 108 insertions(+), 120 deletions(-) diff --git a/minecode/api.py b/minecode/api.py index 8d5aa7a7..766f3e78 100644 --- a/minecode/api.py +++ b/minecode/api.py @@ -202,18 +202,19 @@ def update_status(self, request, *args, **kwargs): scan_results_file = request.data.get('scan_results_file') scan_summary_file = request.data.get('scan_summary_file') project_extra_data = request.data.get('project_extra_data') - scannable_uri.scan_status = ScannableURI.SCAN_COMPLETED - package = scannable_uri.package scan_data = json.load(scan_results_file) summary_data = json.load(scan_summary_file) project_extra_data = json.loads(project_extra_data) + + scannable_uri.scan_status = ScannableURI.SCAN_COMPLETED + indexing_errors = index_package( scannable_uri, - package, + scannable_uri.package, scan_data, summary_data, project_extra_data, - reindex=True + reindex=scannable_uri.reindex_uri, ) if indexing_errors: scannable_uri.scan_status = ScannableURI.SCAN_INDEX_FAILED diff --git a/minecode/management/user_creation.py b/minecode/management/user_creation.py index 7f63b75a..3a3fe6f2 100644 --- a/minecode/management/user_creation.py +++ b/minecode/management/user_creation.py @@ -1,24 +1,11 @@ -# SPDX-License-Identifier: Apache-2.0 -# -# http://nexb.com and https://github.com/nexB/scancode.io -# The ScanCode.io software is licensed under the Apache License version 2.0. -# Data generated with ScanCode.io is provided as-is without warranties. -# ScanCode is a trademark of nexB Inc. # -# You may not use this software except in compliance with the License. -# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software distributed -# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -# CONDITIONS OF ANY KIND, either express or implied. See the License for the -# specific language governing permissions and limitations under the License. -# -# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES -# OR CONDITIONS OF ANY KIND, either express or implied. No content created from -# ScanCode.io should be considered or used as legal advice. Consult an Attorney -# for any legal advice. +# Copyright (c) nexB Inc. and others. All rights reserved. +# purldb is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/nexB/purldb for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. # -# ScanCode.io is a free software code scanning tool from nexB Inc. and others. -# Visit https://github.com/nexB/scancode.io for support and download. import getpass @@ -32,7 +19,7 @@ class CreateUserCommand(BaseCommand): - help = "Create a user and generate an API key for authentication." + help = 'Create a user and generate an API key for authentication.' requires_migrations_checks = True def __init__(self, *args, **kwargs): @@ -43,18 +30,18 @@ def __init__(self, *args, **kwargs): ) def add_arguments(self, parser): - parser.add_argument("username", help="Specifies the username for the user.") + parser.add_argument('username', help='Specifies the username for the user.') parser.add_argument( - "--no-input", - action="store_false", - dest="interactive", - help="Do not prompt the user for input of any kind.", + '--no-input', + action='store_false', + dest='interactive', + help='Do not prompt the user for input of any kind.', ) def handle(self, *args, **options): - username = options["username"] - interactive = options["interactive"] - verbosity = options["verbosity"] + username = options['username'] + interactive = options['interactive'] + verbosity = options['verbosity'] self.create_user( username=username, interactive=interactive, @@ -74,7 +61,7 @@ def create_user(self, username, interactive, verbosity): token, _ = Token._default_manager.get_or_create(user=user) if verbosity >= 1: - msg = f"User {username} created with API key: {token.key}" + msg = f'User {username} created with API key: {token.key}' self.stdout.write(msg, self.style.SUCCESS) return user @@ -89,21 +76,21 @@ def get_password_from_stdin(self, username): password = None while password is None: password1 = getpass.getpass() - password2 = getpass.getpass("Password (again): ") + password2 = getpass.getpass('Password (again): ') if password1 != password2: self.stderr.write("Error: Your passwords didn't match.") continue - if password1.strip() == "": + if password1.strip() == '': self.stderr.write("Error: Blank passwords aren't allowed.") continue try: validate_password(password2, self.UserModel(**fake_user_data)) except exceptions.ValidationError as err: - self.stderr.write("\n".join(err.messages)) + self.stderr.write('\n'.join(err.messages)) response = input( - "Bypass password validation and create user anyway? [y/N]: " + 'Bypass password validation and create user anyway? [y/N]: ' ) - if response.lower() != "y": + if response.lower() != 'y': continue password = password1 @@ -117,9 +104,9 @@ def _validate_username(self, username): except self.UserModel.DoesNotExist: pass else: - return "Error: That username is already taken." + return 'Error: That username is already taken.' try: self.username_field.clean(username, None) except exceptions.ValidationError as e: - return "; ".join(e.messages) + return '; '.join(e.messages) diff --git a/minecode/visitors/maven.py b/minecode/visitors/maven.py index 67e7bf43..ba8a209b 100644 --- a/minecode/visitors/maven.py +++ b/minecode/visitors/maven.py @@ -127,7 +127,7 @@ def get_pom_text(namespace, name, version, qualifiers={}, base_url=MAVEN_BASE_UR base_url=base_url, ) # Get and parse POM info - pom_url = urls["api_data_url"] + pom_url = urls['api_data_url'] # TODO: manage different types of errors (404, etc.) response = requests.get(pom_url) if not response: @@ -204,9 +204,9 @@ def merge_parent(package, parent_package): Merge `parent_package` data into `package` and return `package. """ mergeable_fields = ( - "declared_license_expression", - "homepage_url", - "parties", + 'declared_license_expression', + 'homepage_url', + 'parties', ) for field in mergeable_fields: # If `field` is empty on the package we're looking at, populate @@ -215,12 +215,12 @@ def merge_parent(package, parent_package): value = getattr(parent_package, field) setattr(package, field, value) - msg = f"Field `{field}` has been updated using values obtained from the parent POM {parent_package.purl}" - history = package.extra_data.get("history") + msg = f'Field `{field}` has been updated using values obtained from the parent POM {parent_package.purl}' + history = package.extra_data.get('history') if history: - package.extra_data["history"].append(msg) + package.extra_data['history'].append(msg) else: - package.extra_data["history"] = [msg] + package.extra_data['history'] = [msg] return package @@ -234,9 +234,9 @@ def merge_ancestors(ancestor_pom_texts, package): """ for ancestor_pom_text in ancestor_pom_texts: ancestor_package = _parse( - datasource_id="maven_pom", - package_type="maven", - primary_language="Java", + datasource_id='maven_pom', + package_type='maven', + primary_language='Java', text=ancestor_pom_text, ) package = merge_parent(package, ancestor_package) @@ -252,10 +252,10 @@ def map_maven_package(package_url, package_content): from minecode.model_utils import add_package_to_scan_queue, merge_or_create_package db_package = None - error = "" + error = '' - if "repository_url" in package_url.qualifiers: - base_url = package_url.qualifiers["repository_url"] + if 'repository_url' in package_url.qualifiers: + base_url = package_url.qualifiers['repository_url'] else: base_url = MAVEN_BASE_URL @@ -267,15 +267,15 @@ def map_maven_package(package_url, package_content): base_url=base_url, ) if not pom_text: - msg = f"Package does not exist on maven: {package_url}" - error += msg + "\n" + msg = f'Package does not exist on maven: {package_url}' + error += msg + '\n' logger.error(msg) return db_package, error package = _parse( - "maven_pom", - "maven", - "Java", + 'maven_pom', + 'maven', + 'Java', text=pom_text, base_url=base_url, ) @@ -295,11 +295,11 @@ def map_maven_package(package_url, package_content): # url is not properly generated since it would be missing the sources bit # from the filename. package.qualifiers = package_url.qualifiers - package.download_url = urls["repository_download_url"] - package.repository_download_url = urls["repository_download_url"] + package.download_url = urls['repository_download_url'] + package.repository_download_url = urls['repository_download_url'] # Set package_content value - package.extra_data["package_content"] = package_content + package.extra_data['package_content'] = package_content # If sha1 exists for a jar, we know we can create the package # Use pom info as base and create packages for binary and source package @@ -310,8 +310,8 @@ def map_maven_package(package_url, package_content): package.sha1 = sha1 db_package, _, _, _ = merge_or_create_package(package, visit_level=50) else: - msg = f"Failed to retrieve JAR: {package_url}" - error += msg + "\n" + msg = f'Failed to retrieve JAR: {package_url}' + error += msg + '\n' logger.error(msg) # Submit package for scanning @@ -328,13 +328,13 @@ def map_maven_binary_and_source(package_url): Return an error string for errors that occur, or empty string if there is no error. """ - error = "" + error = '' package, emsg = map_maven_package(package_url, PackageContentType.BINARY) if emsg: error += emsg source_package_url = package_url - source_package_url.qualifiers["classifier"] = "sources" + source_package_url.qualifiers['classifier'] = 'sources' source_package, emsg = map_maven_package( source_package_url, PackageContentType.SOURCE_ARCHIVE ) @@ -359,21 +359,21 @@ def map_maven_packages(package_url): Return an error string for errors that occur, or empty string if there is no error. """ - error = "" + error = '' namespace = package_url.namespace name = package_url.name # Find all versions of this package - query_params = f"g:{namespace}+AND+a:{name}" - url = f"https://search.maven.org/solrsearch/select?q={query_params}&core=gav" + query_params = f'g:{namespace}+AND+a:{name}' + url = f'https://search.maven.org/solrsearch/select?q={query_params}&core=gav' response = requests.get(url) if response: - package_listings = response.json().get("response", {}).get("docs", []) + package_listings = response.json().get('response', {}).get('docs', []) for listing in package_listings: purl = PackageURL( - type="maven", - namespace=listing.get("g"), - name=listing.get("a"), - version=listing.get("v"), + type='maven', + namespace=listing.get('g'), + name=listing.get('a'), + version=listing.get('v'), ) emsg = map_maven_binary_and_source(purl) if emsg: @@ -421,7 +421,7 @@ def process_request(purl_str): try: package_url = PackageURL.from_string(purl_str) except ValueError as e: - error = f"error occured when parsing {purl_str}: {e}" + error = f'error occured when parsing {purl_str}: {e}' return error has_version = bool(package_url.version) @@ -450,14 +450,14 @@ def check_if_page_has_pom_files(links, **kwargs): """ Return True of any entry in `links` ends with .pom. """ - return any(l.endswith(".pom") for l in links) + return any(l.endswith('.pom') for l in links) def check_if_page_has_directories(links, **kwargs): """ Return True if any entry, excluding "../", ends with /. """ - return any(l.endswith("/") for l in links if l != "../") + return any(l.endswith('/') for l in links if l != '../') def check_if_package_version_page(links, **kwargs): @@ -471,7 +471,7 @@ def check_if_package_version_page(links, **kwargs): def check_if_package_page(links, **kwargs): return check_if_file_name_is_linked_on_page( - file_name="maven-metadata.xml", links=links + file_name='maven-metadata.xml', links=links ) and not check_if_page_has_pom_files(links=links) @@ -481,7 +481,7 @@ def check_if_maven_root(links, **kwargs): repo contains "archetype-catalog.xml". """ return check_if_file_name_is_linked_on_page( - file_name="archetype-catalog.xml", links=links + file_name='archetype-catalog.xml', links=links ) @@ -522,14 +522,14 @@ def url_parts(url): parsed_url = urlparse(url) scheme = parsed_url.scheme netloc = parsed_url.netloc - path_segments = [p for p in parsed_url.path.split("/") if p] + path_segments = [p for p in parsed_url.path.split('/') if p] return scheme, netloc, path_segments def create_url(scheme, netloc, path_segments): - url_template = f"{scheme}://{netloc}" - path = "/".join(path_segments) - return f"{url_template}/{path}" + url_template = f'{scheme}://{netloc}' + path = '/'.join(path_segments) + return f'{url_template}/{path}' def get_maven_root(url): @@ -564,27 +564,27 @@ def determine_namespace_name_version_from_url(url, root_url=None): if not root_url: root_url = get_maven_root(url) if not root_url: - raise Exception(f"Error: not a Maven repository: {url}") + raise Exception(f'Error: not a Maven repository: {url}') _, remaining_path_segments = url.split(root_url) - remaining_path_segments = remaining_path_segments.split("/") + remaining_path_segments = remaining_path_segments.split('/') remaining_path_segments = [p for p in remaining_path_segments if p] namespace_segments = [] - package_name = "" - package_version = "" + package_name = '' + package_version = '' for i in range(len(remaining_path_segments)): segment = remaining_path_segments[i] segments = remaining_path_segments[: i + 1] - path = "/".join(segments) - url_segment = f"{root_url}/{path}" + path = '/'.join(segments) + url_segment = f'{root_url}/{path}' if is_package_page(url_segment): package_name = segment elif is_package_version_page(url_segment): package_version = segment else: namespace_segments.append(segment) - namespace = ".".join(namespace_segments) + namespace = '.'.join(namespace_segments) return namespace, package_name, package_version @@ -600,13 +600,13 @@ def add_to_import_queue(url, root_url): data = response.text namespace, name, _ = determine_namespace_name_version_from_url(url, root_url) purl = PackageURL( - type="maven", + type='maven', namespace=namespace, name=name, ) importable_uri = ImportableURI.objects.insert(url, data, purl) if importable_uri: - logger.info(f"Inserted {url} into ImportableURI queue") + logger.info(f'Inserted {url} into ImportableURI queue') def filter_only_directories(timestamps_by_links): @@ -615,27 +615,27 @@ def filter_only_directories(timestamps_by_links): """ timestamps_by_links_filtered = {} for link, timestamp in timestamps_by_links.items(): - if link != "../" and link.endswith("/"): + if link != '../' and link.endswith('/'): timestamps_by_links_filtered[link] = timestamp return timestamps_by_links_filtered valid_artifact_extensions = [ - "ejb3", - "ear", - "aar", - "apk", - "gem", - "jar", - "nar", + 'ejb3', + 'ear', + 'aar', + 'apk', + 'gem', + 'jar', + 'nar', # 'pom', - "so", - "swc", - "tar", - "tar.gz", - "war", - "xar", - "zip", + 'so', + 'swc', + 'tar', + 'tar.gz', + 'war', + 'xar', + 'zip', ] @@ -661,8 +661,8 @@ def collect_links_from_text(text, filter): links_and_timestamps = collect_links_and_artifact_timestamps(text) timestamps_by_links = {} for link, timestamp in links_and_timestamps: - if timestamp == "-": - timestamp = "" + if timestamp == '-': + timestamp = '' timestamps_by_links[link] = timestamp timestamps_by_links = filter(timestamps_by_links=timestamps_by_links) @@ -675,11 +675,11 @@ def create_absolute_urls_for_links(text, url, filter): links from `url` and their timestamps, that is then filtered by `filter`. """ timestamps_by_absolute_links = {} - url = url.rstrip("/") + url = url.rstrip('/') timestamps_by_links = collect_links_from_text(text, filter) for link, timestamp in timestamps_by_links.items(): if not link.startswith(url): - link = f"{url}/{link}" + link = f'{url}/{link}' timestamps_by_absolute_links[link] = timestamp return timestamps_by_absolute_links @@ -735,7 +735,7 @@ def get_artifact_sha1(artifact_url): Return the SHA1 value of the Maven artifact located at `artifact_url`. """ sha1 = None - artifact_sha1_url = f"{artifact_url}.sha1" + artifact_sha1_url = f'{artifact_url}.sha1' response = requests.get(artifact_sha1_url) if response: sha1_contents = response.text.strip().split() @@ -753,18 +753,18 @@ def get_classifier_from_artifact_url( """ classifier = None # https://repo1.maven.org/maven2/net/alchim31/livereload-jvm/0.2.0 - package_version_page_url = package_version_page_url.rstrip("/") + package_version_page_url = package_version_page_url.rstrip('/') # https://repo1.maven.org/maven2/net/alchim31/livereload-jvm/0.2.0/livereload-jvm-0.2.0 - leading_url_portion = f"{package_version_page_url}/{package_name}-{package_version}" + leading_url_portion = f'{package_version_page_url}/{package_name}-{package_version}' # artifact_url = 'https://repo1.maven.org/maven2/net/alchim31/livereload-jvm/0.2.0/livereload-jvm-0.2.0-onejar.jar' # ['', '-onejar.jar'] _, remaining_url_portion = artifact_url.split(leading_url_portion) # ['-onejar', 'jar'] - remaining_url_portions = remaining_url_portion.split(".") + remaining_url_portions = remaining_url_portion.split('.') if remaining_url_portions and remaining_url_portions[0]: # '-onejar' classifier = remaining_url_portions[0] - if classifier.startswith("-"): + if classifier.startswith('-'): # 'onejar' classifier = classifier[1:] return classifier @@ -1546,19 +1546,19 @@ def _artifact_stats(location): print('Top packaging:') for n, c in pom_packs.most_common(): - print(n, ":", c) + print(n, ':', c) print('Top classifiers:') for n, c in pom_classifs.most_common(): - print(n, ":", c) + print(n, ':', c) print('Top extensions:') for n, c in pom_extensions.most_common(): - print(n, ":", c) + print(n, ':', c) print('Top Combos: packaging, classifier, extension') for n, c in combos.most_common(): - print(n, ":", c) + print(n, ':', c) """ Latest stats on 2017-08-07: From 767e8a995f6a4f99e3fdd8579dbabcd9066cda76 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Tue, 19 Mar 2024 10:02:08 -0700 Subject: [PATCH 31/31] Remove duplicate import #49 #285 Signed-off-by: Jono Yang --- minecode/visitors/npm.py | 1 - 1 file changed, 1 deletion(-) diff --git a/minecode/visitors/npm.py b/minecode/visitors/npm.py index e45fe7e8..3fdceddf 100644 --- a/minecode/visitors/npm.py +++ b/minecode/visitors/npm.py @@ -13,7 +13,6 @@ from packagedcode.npm import npm_api_url from packagedcode.npm import split_scoped_package_name from packagedcode.npm import NpmPackageJsonHandler -from packagedcode.npm import npm_api_url from packageurl import PackageURL import requests