Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@ v35.4.0 (unreleased)
- Display the optional steps in the Pipelines autodoc.
https://github.com/aboutcode-org/scancode.io/issues/1822

- Add new ``benchmark_purls`` pipeline.
https://github.com/aboutcode-org/scancode.io/issues/1804

v35.3.0 (2025-08-20)
--------------------

Expand Down
41 changes: 41 additions & 0 deletions docs/built-in-pipelines.rst
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,47 @@ Analyse Docker Windows Image
:members:
:member-order: bysource

.. _pipeline_benchmark_purls:

Benchmark PURLs (addon)
-----------------------

To check an **SBOM against a list of expected Package URLs (PURLs)**:

1. **Create a new project** and provide two inputs:

* The SBOM file you want to check.
* A list of expected PURLs in a ``*-purls.txt`` file with one PURL per line.

.. tip:: You may also flag any filename using the ``purls`` input tag.

2. **Run the pipelines**:

* Select and run the ``load_sbom`` pipeline to load the SBOM.
* Run the ``benchmark_purls`` pipeline to validate against the expected PURLs.

3. **Download the results** from the "output" section of the project.

The output file contains only the differences between the discovered PURLs and
the expected PURLs:

* Lines starting with ``-`` are PURLs discovered in the project but absent from the expected list.
* Lines starting with ``+`` are expected PURLs that were not found in the project.

.. note::
The ``load_sbom`` pipeline is provided as an example to benchmark external
tools using SBOMs as inputs. You can also run ``benchmark_purls`` directly
after any ScanCode.io pipeline to validate the discovered PURLs.

.. tip::
You can provide multiple expected PURLs files.


.. autoclass:: scanpipe.pipelines.benchmark_purls.BenchmarkPurls()
:members:
:member-order: bysource


.. _pipeline_collect_strings_gettext:

Collect string with Xgettext (addon)
Expand Down
5 changes: 5 additions & 0 deletions docs/scanpipe-pipes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,11 @@ Generic
.. automodule:: scanpipe.pipes
:members:

Benchmark
---------
.. automodule:: scanpipe.pipes.benchmark
:members:

ClamAV
------
.. automodule:: scanpipe.pipes.clamav
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,7 @@ run = "scancodeio:combined_run"
analyze_docker_image = "scanpipe.pipelines.analyze_docker:Docker"
analyze_root_filesystem_or_vm_image = "scanpipe.pipelines.analyze_root_filesystem:RootFS"
analyze_windows_docker_image = "scanpipe.pipelines.analyze_docker_windows:DockerWindows"
benchmark_purls = "scanpipe.pipelines.benchmark_purls:BenchmarkPurls"
collect_strings_gettext = "scanpipe.pipelines.collect_strings_gettext:CollectStringsGettext"
collect_symbols_ctags = "scanpipe.pipelines.collect_symbols_ctags:CollectSymbolsCtags"
collect_symbols_pygments = "scanpipe.pipelines.collect_symbols_pygments:CollectSymbolsPygments"
Expand Down
4 changes: 2 additions & 2 deletions scanpipe/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -1147,12 +1147,12 @@ def get_output_file_path(self, name, extension):
filename = f"{name}-{filename_now()}.{extension}"
return self.output_path / filename

def get_latest_output(self, filename):
def get_latest_output(self, filename, extension="json"):
"""
Return the latest output file with the "filename" prefix, for example
"scancode-<timestamp>.json".
"""
output_files = sorted(self.output_path.glob(f"*{filename}*.json"))
output_files = sorted(self.output_path.glob(f"*{filename}*.{extension}"))
if output_files:
return output_files[-1]

Expand Down
57 changes: 57 additions & 0 deletions scanpipe/pipelines/benchmark_purls.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
# SPDX-License-Identifier: Apache-2.0
#
# http://nexb.com and https://github.com/aboutcode-org/scancode.io
# The ScanCode.io software is licensed under the Apache License version 2.0.
# Data generated with ScanCode.io is provided as-is without warranties.
# ScanCode is a trademark of nexB Inc.
#
# You may not use this software except in compliance with the License.
# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software distributed
# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
# CONDITIONS OF ANY KIND, either express or implied. See the License for the
# specific language governing permissions and limitations under the License.
#
# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
# ScanCode.io should be considered or used as legal advice. Consult an Attorney
# for any legal advice.
#
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
# Visit https://github.com/aboutcode-org/scancode.io for support and download.

from scanpipe.pipelines import Pipeline
from scanpipe.pipes import benchmark


class BenchmarkPurls(Pipeline):
    """
    Validate discovered project packages against a reference list of expected PURLs.

    The expected PURLs must be provided as a .txt file with one PURL per line.
    Input files are recognized if:

    - They are tagged with "purls", or
    - Their filename ends with "purls.txt" (e.g., "expected_purls.txt").

    """

    # Pipeline configuration flags, read by the ``Pipeline`` base class.
    download_inputs = False
    is_addon = True

    @classmethod
    def steps(cls):
        # Steps are listed in execution order: the expected PURLs are loaded
        # first since the comparison step reads ``self.expected_purls``.
        return (cls.get_expected_purls, cls.compare_purls)

    def get_expected_purls(self):
        """Load the expected PURLs defined in the project inputs."""
        # Stored on the instance so that the next step can consume it.
        self.expected_purls = benchmark.get_expected_purls(self.project)

    def compare_purls(self):
        """Run the PURLs diff and write the results to a project output file."""
        report_lines = benchmark.compare_purls(self.project, self.expected_purls)
        report_path = self.project.get_output_file_path("benchmark_purls", "txt")
        # One diff entry per line, no trailing newline.
        report_path.write_text("\n".join(report_lines))
68 changes: 68 additions & 0 deletions scanpipe/pipes/benchmark.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
# SPDX-License-Identifier: Apache-2.0
#
# http://nexb.com and https://github.com/aboutcode-org/scancode.io
# The ScanCode.io software is licensed under the Apache License version 2.0.
# Data generated with ScanCode.io is provided as-is without warranties.
# ScanCode is a trademark of nexB Inc.
#
# You may not use this software except in compliance with the License.
# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software distributed
# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
# CONDITIONS OF ANY KIND, either express or implied. See the License for the
# specific language governing permissions and limitations under the License.
#
# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
# ScanCode.io should be considered or used as legal advice. Consult an Attorney
# for any legal advice.
#
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
# Visit https://github.com/aboutcode-org/scancode.io for support and download.

import difflib


def get_expected_purls(project):
    """
    Load the expected Package URLs (PURLs) from the project's input files.

    A file is considered an expected PURLs source if:
    - Its filename matches ``*purls.txt``, or
    - Its input source is tagged with "purls".

    Each line in the file should contain one PURL. Leading and trailing
    whitespace is stripped and blank lines are ignored, so that stray spacing
    or empty lines in a hand-edited file are never treated as PURLs.

    Returns a sorted, deduplicated list of PURLs.
    Raises an ``Exception`` if no PURL could be loaded from the inputs.
    """
    purls_files = list(project.inputs("*purls.txt"))
    purls_files.extend(
        source.path for source in project.inputsources.filter(tag="purls")
    )

    # A set handles the deduplication, including a file matched by both rules.
    expected_purls = set()
    for file_path in purls_files:
        for line in file_path.read_text().splitlines():
            purl = line.strip()
            if purl:
                expected_purls.add(purl)

    # Also reached when the files exist but only contain blank lines.
    if not expected_purls:
        raise Exception("Expected PURLs not provided.")

    return sorted(expected_purls)


def compare_purls(project, expected_purls):
    """
    Compare discovered project PURLs against the expected PURLs.

    The diff is computed from the project PURLs to the expected PURLs, so the
    returned list contains only the differences:
    - Lines starting with '-' are discovered in the project but are not part
      of the expected PURLs.
    - Lines starting with '+' are expected PURLs not found in the project.

    ``expected_purls`` may be any iterable of PURL strings. It is sorted and
    deduplicated before the comparison.
    """
    project_packages = project.discoveredpackages.only_package_url_fields()
    project_purls = sorted({package.purl for package in project_packages})

    # ndiff compares sequences positionally: both sides must be sorted and
    # free of duplicates, or shared PURLs would be reported as differences.
    reference_purls = sorted(set(expected_purls))

    diff_result = difflib.ndiff(project_purls, reference_purls)

    # Keep only lines that are diffs (- or +). This drops the unchanged lines
    # and the "?" intraline hint lines emitted by ndiff.
    return [line for line in diff_result if line.startswith(("-", "+"))]
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
- pkg:alpine/alpine-keys@2.5-r0?arch=x86_64
+ pkg:alpine/zlib@1.3.2-r2?arch=x86_64
+ pkg:deb/debian/alpine-keys@2.5-r0?arch=x86_64
17 changes: 17 additions & 0 deletions scanpipe/tests/data/benchmark/alpine-3.22.1-expected-purls.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
pkg:alpine/alpine-baselayout@3.7.0-r0?arch=x86_64
pkg:alpine/alpine-baselayout-data@3.7.0-r0?arch=x86_64
pkg:deb/debian/alpine-keys@2.5-r0?arch=x86_64
pkg:alpine/alpine-release@3.22.1-r0?arch=x86_64
pkg:alpine/apk-tools@2.14.9-r2?arch=x86_64
pkg:alpine/busybox@1.37.0-r18?arch=x86_64
pkg:alpine/busybox-binsh@1.37.0-r18?arch=x86_64
pkg:alpine/ca-certificates-bundle@20250619-r0?arch=x86_64
pkg:alpine/libapk2@2.14.9-r2?arch=x86_64
pkg:alpine/libcrypto3@3.5.1-r0?arch=x86_64
pkg:alpine/libssl3@3.5.1-r0?arch=x86_64
pkg:alpine/musl@1.2.5-r10?arch=x86_64
pkg:alpine/musl-utils@1.2.5-r10?arch=x86_64
pkg:alpine/scanelf@1.3.8-r1?arch=x86_64
pkg:alpine/ssl_client@1.37.0-r18?arch=x86_64
pkg:alpine/zlib@1.3.1-r2?arch=x86_64
pkg:alpine/zlib@1.3.2-r2?arch=x86_64
Loading