Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
89 changes: 0 additions & 89 deletions minecode_pipelines/miners/cargo.py

This file was deleted.

3 changes: 3 additions & 0 deletions minecode_pipelines/pipelines/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,9 @@ def mine_and_publish_packageurls(self):

self.log(f"Mine PackageURL for {package_count:,d} packages.")
for base, purls in progress.iter(self.mine_packageurls()):
if not purls or not base:
continue

package_repo, datafile_path = data_cluster.get_datafile_repo_and_path(purl=base)

if package_repo not in checked_out_repos:
Expand Down
61 changes: 26 additions & 35 deletions minecode_pipelines/pipelines/mine_cargo.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,53 +20,44 @@
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
# Visit https://github.com/aboutcode-org/scancode.io for support and download.

import os
from scanpipe.pipelines import Pipeline
from scanpipe.pipes import federatedcode
from minecode_pipelines.miners import cargo
from minecode_pipelines import pipes
from pathlib import Path

MINECODE_DATA_CARGO_REPO = os.environ.get(
"MINECODE_DATA_CARGO_REPO", "https://github.com/aboutcode-data/minecode-data-cargo-test"
)
MINECODE_CARGO_INDEX_REPO = "https://github.com/rust-lang/crates.io-index"
from minecode_pipelines.pipes import cargo
from minecode_pipelines.pipelines import MineCodeBasePipeline
from scanpipe.pipes import federatedcode


class MineCargo(Pipeline):
class MineCargo(MineCodeBasePipeline):
"""Pipeline to mine Cargo (crates.io) packages and publish them to FederatedCode."""

MINECODE_CARGO_INDEX_REPO = "https://github.com/rust-lang/crates.io-index"

@classmethod
def steps(cls):
return (
cls.check_federatedcode_eligibility,
cls.clone_cargo_repos,
cls.mine_and_publish_cargo_packageurls,
cls.delete_cloned_repos,
cls.create_federatedcode_working_dir,
cls.clone_cargo_index,
cls.mine_and_publish_packageurls,
cls.delete_working_dir,
)

def check_federatedcode_eligibility(self):
"""
Check if the project fulfills the following criteria for
pushing the project result to FederatedCode.
"""
federatedcode.check_federatedcode_configured_and_available(logger=self.log)

def clone_cargo_repos(self):
"""
Clone the Cargo-related repositories (index, data, and pipelines config)
and store their Repo objects in the corresponding instance variables.
"""
self.cargo_index_repo = federatedcode.clone_repository(MINECODE_CARGO_INDEX_REPO)
self.cloned_data_repo = federatedcode.clone_repository(MINECODE_DATA_CARGO_REPO)

self.log(f"{MINECODE_CARGO_INDEX_REPO} repo cloned at: {self.cargo_index_repo.working_dir}")
self.log(f"{MINECODE_DATA_CARGO_REPO} repo cloned at: {self.cloned_data_repo.working_dir}")
def clone_cargo_index(self):
    """
    Clone the Cargo index repository and keep the resulting repo object.

    The repository at ``MINECODE_CARGO_INDEX_REPO`` is cloned into the
    ``crates.io-index`` directory under ``self.working_path`` and the value
    returned by ``federatedcode.clone_repository`` is stored on
    ``self.cargo_index_repo`` for the later mining steps.
    """
    index_clone_path = self.working_path / "crates.io-index"
    index_repo = federatedcode.clone_repository(
        repo_url=self.MINECODE_CARGO_INDEX_REPO,
        clone_path=index_clone_path,
        logger=self.log,
    )
    self.cargo_index_repo = index_repo

def mine_and_publish_cargo_packageurls(self):
cargo.process_cargo_packages(self.cargo_index_repo, self.cloned_data_repo, self.log)
def packages_count(self):
    """
    Return the number of files found in the cloned Cargo index.

    Only non-hidden directories located directly under the index working
    tree are searched (recursively); files sitting at the root of the working
    tree and anything below a dot-prefixed directory are not counted.
    Reads ``self.cargo_index_repo.working_tree_dir``.
    """
    base_path = Path(self.cargo_index_repo.working_tree_dir)

    count = 0
    for index_dir in base_path.iterdir():
        # Skip root-level files and hidden directories such as ".git".
        if not index_dir.is_dir() or index_dir.name.startswith("."):
            continue
        count += sum(1 for path in index_dir.rglob("*") if path.is_file())
    return count

def delete_cloned_repos(self):
pipes.delete_cloned_repos(
repos=[self.cargo_index_repo, self.cloned_data_repo],
def mine_packageurls(self):
    """
    Return the PackageURLs mined from the cloned Cargo index.

    Delegates to ``cargo.mine_cargo_packageurls`` with the repo stored on
    ``self.cargo_index_repo`` and this pipeline's ``log`` as the logger.
    """
    index_repo = self.cargo_index_repo
    return cargo.mine_cargo_packageurls(cargo_index_repo=index_repo, logger=self.log)
62 changes: 51 additions & 11 deletions minecode_pipelines/pipes/cargo.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,34 @@
from pathlib import Path
# SPDX-License-Identifier: Apache-2.0
#
# http://nexb.com and https://github.com/aboutcode-org/scancode.io
# The ScanCode.io software is licensed under the Apache License version 2.0.
# Data generated with ScanCode.io is provided as-is without warranties.
# ScanCode is a trademark of nexB Inc.
#
# You may not use this software except in compliance with the License.
# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software distributed
# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
# CONDITIONS OF ANY KIND, either express or implied. See the License for the
# specific language governing permissions and limitations under the License.
#
# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
# ScanCode.io should be considered or used as legal advice. Consult an Attorney
# for any legal advice.
#
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
# Visit https://github.com/aboutcode-org/scancode.io for support and download.

from aboutcode import hashid
from packageurl import PackageURL
from aboutcode.hashid import get_core_purl

from minecode_pipelines.pipes import write_data_to_yaml_file
import json
from pathlib import Path
from aboutcode.hashid import get_core_purl
from packageurl import PackageURL


def store_cargo_packages(packages, repo):
"""Collect Cargo package versions into purls and write them to the repo."""
def get_cargo_packages(packages):
"""Return base_purl and list of PackageURLs from cargo packages."""

if not packages:
return
Expand All @@ -26,7 +46,27 @@ def store_cargo_packages(packages, repo):
purl = PackageURL(type="cargo", name=name, version=version).to_string()
updated_purls.append(purl)

ppath = hashid.get_package_purls_yml_file_path(base_purl)
purl_file_full_path = Path(repo.working_dir) / ppath
write_data_to_yaml_file(path=purl_file_full_path, data=sorted(updated_purls))
return purl_file_full_path, base_purl
return base_purl, updated_purls


def mine_cargo_packageurls(cargo_index_repo, logger):
    """
    Mine Cargo PackageURLs from a local clone of the crates.io package index.

    Every file below a non-hidden, top-level directory of the
    ``cargo_index_repo`` working tree is read as one JSON document per
    non-blank line. The documents parsed from one file are handed to
    ``get_cargo_packages`` and its result is yielded.

    ``cargo_index_repo`` must expose a ``working_tree_dir`` attribute and
    ``logger`` is called with a single message string.

    Lines that are not valid JSON are reported through ``logger`` and skipped.
    Files that produce no parsed document, or for which
    ``get_cargo_packages`` returns ``None``, are skipped so that callers
    unpacking each yielded item never receive ``None``.
    """
    base_path = Path(cargo_index_repo.working_tree_dir)

    for index_dir in base_path.iterdir():
        # Skip root-level files and hidden directories such as ".git".
        if not index_dir.is_dir() or index_dir.name.startswith("."):
            continue

        for path in index_dir.rglob("*"):
            if not path.is_file():
                continue

            packages = []
            with open(path, encoding="utf-8") as f:
                for line_number, line in enumerate(f, start=1):
                    line = line.strip()
                    if not line:
                        continue
                    try:
                        packages.append(json.loads(line))
                    except json.JSONDecodeError as e:
                        logger(f"Skipping invalid JSON in {path} at line {line_number}: {e}")

            # An empty or fully invalid file has nothing to mine.
            if not packages:
                continue

            result = get_cargo_packages(packages)
            if result is not None:
                yield result
62 changes: 8 additions & 54 deletions minecode_pipelines/tests/pipes/test_cargo.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,22 +8,19 @@
#

import json
import tempfile

from pathlib import Path
from unittest import mock
from unittest.mock import Mock, patch

import saneyaml
from django.test import TestCase
from unittest import TestCase

from minecode_pipelines.pipes import write_data_to_yaml_file
from minecode_pipelines.pipes.cargo import store_cargo_packages
from minecode_pipelines.pipes.cargo import get_cargo_packages

DATA_DIR = Path(__file__).parent.parent / "test_data" / "cargo"


class CargoPipelineTests(TestCase):
@patch("minecode_pipelines.pipes.cargo.write_data_to_yaml_file")
def test_collect_packages_from_cargo_calls_write(self, mock_write):
def test_collect_packages_from_cargo_calls_write(self):
packages_file = DATA_DIR / "c5store"
expected_file = DATA_DIR / "c5store-expected.yaml"

Expand All @@ -36,50 +33,7 @@ def test_collect_packages_from_cargo_calls_write(self, mock_write):
with open(expected_file, encoding="utf-8") as f:
expected = saneyaml.load(f)

with tempfile.TemporaryDirectory() as tmpdir:
repo = Mock()
repo.working_dir = tmpdir

store_cargo_packages(packages, repo)

mock_write.assert_called_once()
args, kwargs = mock_write.call_args
base_purl, written_packages = kwargs["path"], kwargs["data"]

expected_base_purl = (
Path(tmpdir) / "aboutcode-packages-cargo-0" / "cargo" / "c5store" / "purls.yml"
)

self.assertEqual(str(base_purl), str(expected_base_purl))
self.assertEqual(written_packages, expected)

def _assert_purls_written(self, purls):
with tempfile.TemporaryDirectory() as tmpdir:
repo_dir = Path(tmpdir)

mock_repo = mock.MagicMock()
mock_repo.working_dir = str(repo_dir)
mock_repo.index.add = mock.MagicMock()

purls_file = repo_dir / "purls.yaml"

write_data_to_yaml_file(purls_file, purls)

self.assertTrue(purls_file.exists())

with open(purls_file, encoding="utf-8") as f:
content = saneyaml.load(f)

self.assertEqual(content, purls)

def test_add_purl_result_with_mock_repo(self):
self._assert_purls_written(
[{"purl": "pkg:pypi/django@4.2.0"}, {"purl": "pkg:pypi/django@4.3.0"}]
)

def test_add_empty_purl_result_with_mock_repo(self):
self._assert_purls_written([])
base, purls = get_cargo_packages(packages)

def test_add_invalid_purl_with_mock_repo(self):
# invalid but still written as empty file
self._assert_purls_written([{"purl": "pkg:pypi/django"}])
self.assertEqual(str(base), "pkg:cargo/c5store")
self.assertEqual(purls, expected)
2 changes: 1 addition & 1 deletion pyproject-minecode_pipelines.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "flot.buildapi"

[project]
name = "minecode_pipelines"
version = "0.0.1b26"
version = "0.0.1b27"
description = "A library for mining packageURLs and package metadata from ecosystem repositories."
readme = "minecode_pipelines/README.rst"
license = { text = "Apache-2.0" }
Expand Down
Loading