From 61e6e44b7d37e1031b2462fc56dffa106e27b642 Mon Sep 17 00:00:00 2001 From: Keshav Priyadarshi Date: Thu, 4 Dec 2025 14:54:18 +0530 Subject: [PATCH 1/2] Refactor cargo mining pipeline for git deployment Resolves: https://github.com/aboutcode-org/purldb/issues/770 Signed-off-by: Keshav Priyadarshi --- minecode_pipelines/miners/cargo.py | 89 ---------------------- minecode_pipelines/pipelines/mine_cargo.py | 61 +++++++-------- minecode_pipelines/pipes/cargo.py | 62 ++++++++++++--- pyproject-minecode_pipelines.toml | 2 +- 4 files changed, 78 insertions(+), 136 deletions(-) delete mode 100644 minecode_pipelines/miners/cargo.py diff --git a/minecode_pipelines/miners/cargo.py b/minecode_pipelines/miners/cargo.py deleted file mode 100644 index 2760aff0..00000000 --- a/minecode_pipelines/miners/cargo.py +++ /dev/null @@ -1,89 +0,0 @@ -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# purldb is a trademark of nexB Inc. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/aboutcode-org/purldb for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. -# - -from minecode_pipelines.pipes.cargo import store_cargo_packages -from scanpipe.pipes.federatedcode import commit_and_push_changes -import json -from pathlib import Path -from django.conf import settings -from scancodeio import VERSION -from aboutcode.pipeline import LoopProgress - - -def cargo_commit_message(commit_batch, total_commit_batch="many"): - author_name = settings.FEDERATEDCODE_GIT_SERVICE_NAME - author_email = settings.FEDERATEDCODE_GIT_SERVICE_EMAIL - tool_name = "pkg:github/aboutcode-org/scancode.io" - - return f"""\ - Collect PackageURLs from crates.io index ({commit_batch}/{total_commit_batch}) - - Tool: {tool_name}@v{VERSION} - Reference: https://{settings.ALLOWED_HOSTS[0]} - - Signed-off-by: {author_name} <{author_email}> - """ - - -def process_cargo_packages(cargo_index_repo, cloned_data_repo, logger): - """Mine and publish Cargo PackageURLs from Crates.io package index.""" - - base_path = Path(cargo_index_repo.working_tree_dir) - batch_size = 4000 - file_counter = 0 - purl_files = [] - commit_count = 1 - - package_dir = [p for p in base_path.iterdir() if p.is_dir() and not p.name.startswith(".")] - package_paths = [f for dir in package_dir for f in dir.rglob("*") if f.is_file()] - package_count = len(package_paths) - - progress = LoopProgress( - total_iterations=package_count, - logger=logger, - ) - - logger(f"Mine PackageURL for {package_count:,d} Cargo packages.") - for path in progress.iter(package_paths): - packages = [] - - with open(path, encoding="utf-8") as f: - for line_number, line in enumerate(f, start=1): - line = line.strip() - if not line: - continue - try: - packages.append(json.loads(line)) - except json.JSONDecodeError as e: - logger(f"Skipping invalid JSON in {path} at line {line_number}: {e}") - - file_counter += 1 - result = store_cargo_packages(packages, cloned_data_repo) - if result: - purl_file, _ = result - purl_files.append(purl_file) - - if file_counter % batch_size == 0 and purl_files: - if commit_and_push_changes( - repo=cloned_data_repo, - files_to_commit=purl_files, - commit_message=cargo_commit_message(commit_count), - logger=logger, - ): - commit_count += 1 - purl_files.clear() - - commit_and_push_changes( - repo=cloned_data_repo, - files_to_commit=purl_files, - commit_message=cargo_commit_message(commit_count, commit_count), - logger=logger, - ) - logger(f"Processed PackageURL for {file_counter:,d} Cargo packages.") - logger(f"Pushed new PackageURL in {commit_count:,d} commits.") diff --git a/minecode_pipelines/pipelines/mine_cargo.py b/minecode_pipelines/pipelines/mine_cargo.py index eed7b793..4c93a08e 100644 --- a/minecode_pipelines/pipelines/mine_cargo.py +++ b/minecode_pipelines/pipelines/mine_cargo.py @@ -20,53 +20,44 @@ # ScanCode.io is a free software code scanning tool from nexB Inc. and others. # Visit https://github.com/aboutcode-org/scancode.io for support and download. -import os -from scanpipe.pipelines import Pipeline -from scanpipe.pipes import federatedcode -from minecode_pipelines.miners import cargo -from minecode_pipelines import pipes +from pathlib import Path -MINECODE_DATA_CARGO_REPO = os.environ.get( - "MINECODE_DATA_CARGO_REPO", "https://github.com/aboutcode-data/minecode-data-cargo-test" -) -MINECODE_CARGO_INDEX_REPO = "https://github.com/rust-lang/crates.io-index" +from minecode_pipelines.pipes import cargo +from minecode_pipelines.pipelines import MineCodeBasePipeline +from scanpipe.pipes import federatedcode -class MineCargo(Pipeline): +class MineCargo(MineCodeBasePipeline): """Pipeline to mine Cargo (crates.io) packages and publish them to FederatedCode.""" + MINECODE_CARGO_INDEX_REPO = "https://github.com/rust-lang/crates.io-index" + @classmethod def steps(cls): return ( cls.check_federatedcode_eligibility, - cls.clone_cargo_repos, - cls.mine_and_publish_cargo_packageurls, - cls.delete_cloned_repos, + cls.create_federatedcode_working_dir, + cls.clone_cargo_index, + cls.mine_and_publish_packageurls, + cls.delete_working_dir, ) - def check_federatedcode_eligibility(self): - """ - Check if the project fulfills the following criteria for - pushing the project result to FederatedCode. - """ - federatedcode.check_federatedcode_configured_and_available(logger=self.log) - - def clone_cargo_repos(self): - """ - Clone the Cargo-related repositories (index, data, and pipelines config) - and store their Repo objects in the corresponding instance variables. - """ - self.cargo_index_repo = federatedcode.clone_repository(MINECODE_CARGO_INDEX_REPO) - self.cloned_data_repo = federatedcode.clone_repository(MINECODE_DATA_CARGO_REPO) - - self.log(f"{MINECODE_CARGO_INDEX_REPO} repo cloned at: {self.cargo_index_repo.working_dir}") - self.log(f"{MINECODE_DATA_CARGO_REPO} repo cloned at: {self.cloned_data_repo.working_dir}") + def clone_cargo_index(self): + """Clone the Cargo index Repo.""" + self.cargo_index_repo = federatedcode.clone_repository( + repo_url=self.MINECODE_CARGO_INDEX_REPO, + clone_path=self.working_path / "crates.io-index", + logger=self.log, + ) - def mine_and_publish_cargo_packageurls(self): - cargo.process_cargo_packages(self.cargo_index_repo, self.cloned_data_repo, self.log) + def packages_count(self): + base_path = Path(self.cargo_index_repo.working_tree_dir) + package_dir = [p for p in base_path.iterdir() if p.is_dir() and not p.name.startswith(".")] + return sum(1 for dir in package_dir for f in dir.rglob("*") if f.is_file()) - def delete_cloned_repos(self): - pipes.delete_cloned_repos( - repos=[self.cargo_index_repo, self.cloned_data_repo], + def mine_packageurls(self): + """Yield PackageURLs from Cargo index.""" + return cargo.mine_cargo_packageurls( + cargo_index_repo=self.cargo_index_repo, logger=self.log, ) diff --git a/minecode_pipelines/pipes/cargo.py b/minecode_pipelines/pipes/cargo.py index 11e89a4a..c972bf37 100644 --- a/minecode_pipelines/pipes/cargo.py +++ b/minecode_pipelines/pipes/cargo.py @@ -1,14 +1,34 @@ -from pathlib import Path +# SPDX-License-Identifier: Apache-2.0 +# +# http://nexb.com and https://github.com/aboutcode-org/scancode.io +# The ScanCode.io software is licensed under the Apache License version 2.0. +# Data generated with ScanCode.io is provided as-is without warranties. +# ScanCode is a trademark of nexB Inc. +# +# You may not use this software except in compliance with the License. +# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. +# +# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, either express or implied. No content created from +# ScanCode.io should be considered or used as legal advice. Consult an Attorney +# for any legal advice. +# +# ScanCode.io is a free software code scanning tool from nexB Inc. and others. +# Visit https://github.com/aboutcode-org/scancode.io for support and download. -from aboutcode import hashid -from packageurl import PackageURL -from aboutcode.hashid import get_core_purl -from minecode_pipelines.pipes import write_data_to_yaml_file +import json +from pathlib import Path +from aboutcode.hashid import get_core_purl +from packageurl import PackageURL -def store_cargo_packages(packages, repo): - """Collect Cargo package versions into purls and write them to the repo.""" +def get_cargo_packages(packages): + """Return base_purl and list of PackageURLs from cargo packages.""" if not packages: return @@ -26,7 +46,27 @@ def store_cargo_packages(packages, repo): purl = PackageURL(type="cargo", name=name, version=version).to_string() updated_purls.append(purl) - ppath = hashid.get_package_purls_yml_file_path(base_purl) - purl_file_full_path = Path(repo.working_dir) / ppath - write_data_to_yaml_file(path=purl_file_full_path, data=sorted(updated_purls)) - return purl_file_full_path, base_purl + return base_purl, updated_purls + + +def mine_cargo_packageurls(cargo_index_repo, logger): + """Mine Cargo PackageURLs from Crates.io package index.""" + + base_path = Path(cargo_index_repo.working_tree_dir) + package_dir = [p for p in base_path.iterdir() if p.is_dir() and not p.name.startswith(".")] + package_paths = [f for dir in package_dir for f in dir.rglob("*") if f.is_file()] + + for path in package_paths: + packages = [] + + with open(path, encoding="utf-8") as f: + for line_number, line in enumerate(f, start=1): + line = line.strip() + if not line: + continue + try: + packages.append(json.loads(line)) + except json.JSONDecodeError as e: + logger(f"Skipping invalid JSON in {path} at line {line_number}: {e}") + + yield get_cargo_packages(packages) diff --git a/pyproject-minecode_pipelines.toml b/pyproject-minecode_pipelines.toml index 1d67d2f4..a7126b34 100644 --- a/pyproject-minecode_pipelines.toml +++ b/pyproject-minecode_pipelines.toml @@ -4,7 +4,7 @@ build-backend = "flot.buildapi" [project] name = "minecode_pipelines" -version = "0.0.1b26" +version = "0.0.1b27" description = "A library for mining packageURLs and package metadata from ecosystem repositories." readme = "minecode_pipelines/README.rst" license = { text = "Apache-2.0" } From fa0ffce77cc388141b20d3f5ba7c6c236f5e241f Mon Sep 17 00:00:00 2001 From: Keshav Priyadarshi Date: Thu, 4 Dec 2025 15:34:08 +0530 Subject: [PATCH 2/2] Update cargo tests Signed-off-by: Keshav Priyadarshi --- minecode_pipelines/pipelines/__init__.py | 3 + minecode_pipelines/tests/pipes/test_cargo.py | 62 +++----------------- 2 files changed, 11 insertions(+), 54 deletions(-) diff --git a/minecode_pipelines/pipelines/__init__.py b/minecode_pipelines/pipelines/__init__.py index 0ebd311c..db5cb6b9 100644 --- a/minecode_pipelines/pipelines/__init__.py +++ b/minecode_pipelines/pipelines/__init__.py @@ -91,6 +91,9 @@ def mine_and_publish_packageurls(self): self.log(f"Mine PackageURL for {package_count:,d} packages.") for base, purls in progress.iter(self.mine_packageurls()): + if not purls or not base: + continue + package_repo, datafile_path = data_cluster.get_datafile_repo_and_path(purl=base) if package_repo not in checked_out_repos: diff --git a/minecode_pipelines/tests/pipes/test_cargo.py b/minecode_pipelines/tests/pipes/test_cargo.py index 742f428a..76df7d7d 100644 --- a/minecode_pipelines/tests/pipes/test_cargo.py +++ b/minecode_pipelines/tests/pipes/test_cargo.py @@ -8,22 +8,19 @@ # import json -import tempfile + from pathlib import Path -from unittest import mock -from unittest.mock import Mock, patch + import saneyaml -from django.test import TestCase +from unittest import TestCase -from minecode_pipelines.pipes import write_data_to_yaml_file -from minecode_pipelines.pipes.cargo import store_cargo_packages +from minecode_pipelines.pipes.cargo import get_cargo_packages DATA_DIR = Path(__file__).parent.parent / "test_data" / "cargo" class CargoPipelineTests(TestCase): - @patch("minecode_pipelines.pipes.cargo.write_data_to_yaml_file") - def test_collect_packages_from_cargo_calls_write(self, mock_write): + def test_collect_packages_from_cargo_calls_write(self): packages_file = DATA_DIR / "c5store" expected_file = DATA_DIR / "c5store-expected.yaml" @@ -36,50 +33,7 @@ def test_collect_packages_from_cargo_calls_write(self, mock_write): with open(expected_file, encoding="utf-8") as f: expected = saneyaml.load(f) - with tempfile.TemporaryDirectory() as tmpdir: - repo = Mock() - repo.working_dir = tmpdir - - store_cargo_packages(packages, repo) - - mock_write.assert_called_once() - args, kwargs = mock_write.call_args - base_purl, written_packages = kwargs["path"], kwargs["data"] - - expected_base_purl = ( - Path(tmpdir) / "aboutcode-packages-cargo-0" / "cargo" / "c5store" / "purls.yml" - ) - - self.assertEqual(str(base_purl), str(expected_base_purl)) - self.assertEqual(written_packages, expected) - - def _assert_purls_written(self, purls): - with tempfile.TemporaryDirectory() as tmpdir: - repo_dir = Path(tmpdir) - - mock_repo = mock.MagicMock() - mock_repo.working_dir = str(repo_dir) - mock_repo.index.add = mock.MagicMock() - - purls_file = repo_dir / "purls.yaml" - - write_data_to_yaml_file(purls_file, purls) - - self.assertTrue(purls_file.exists()) - - with open(purls_file, encoding="utf-8") as f: - content = saneyaml.load(f) - - self.assertEqual(content, purls) - - def test_add_purl_result_with_mock_repo(self): - self._assert_purls_written( - [{"purl": "pkg:pypi/django@4.2.0"}, {"purl": "pkg:pypi/django@4.3.0"}] - ) - - def test_add_empty_purl_result_with_mock_repo(self): - self._assert_purls_written([]) + base, purls = get_cargo_packages(packages) - def test_add_invalid_purl_with_mock_repo(self): - # invalid but still written as empty file - self._assert_purls_written([{"purl": "pkg:pypi/django"}]) + self.assertEqual(str(base), "pkg:cargo/c5store") + self.assertEqual(purls, expected)