aboutcode-org · tdruez · Sep 2, 2025 · Sep 2, 2025 · Sep 2, 2025 · Sep 2, 2025
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -17,6 +17,9 @@ v35.4.0 (unreleased)
 - Display the optional steps in the Pipelines autodoc.
   https://github.com/aboutcode-org/scancode.io/issues/1822
 
+- Add new ``benchmark_purls`` pipeline.
+  https://github.com/aboutcode-org/scancode.io/issues/1804
+
 v35.3.0 (2025-08-20)
 --------------------
 

diff --git a/docs/built-in-pipelines.rst b/docs/built-in-pipelines.rst
@@ -46,6 +46,47 @@ Analyse Docker Windows Image
     :members:
     :member-order: bysource
 
+.. _pipeline_benchmark_purls:
+
+Benchmark PURLs (addon)
+-----------------------
+
+To check an **SBOM against a list of expected Package URLs (PURLs)**:
+
+1. **Create a new project** and provide two inputs:
+
+   * The SBOM file you want to check.
+   * A list of expected PURLs in a ``*-purls.txt`` file with one PURL per line.
+
+     .. tip:: You may also flag any filename using the ``purls`` input tag.
+
+2. **Run the pipelines**:
+
+   * Select and run the ``load_sbom`` pipeline to load the SBOM.
+   * Run the ``benchmark_purls`` pipeline to validate against the expected PURLs.
+
+3. **Download the results** from the "output" section of the project.
+
+The output file contains only the differences between the discovered PURLs and
+the expected PURLs:
+
+* Lines starting with ``-`` are in the project but not in the expected PURLs.
+* Lines starting with ``+`` are in the expected PURLs but missing from the project.
+
+.. note::
+  The ``load_sbom`` pipeline is provided as an example to benchmark external
+  tools using SBOMs as inputs. You can also run ``benchmark_purls`` directly
+  after any ScanCode.io pipeline to validate the discovered PURLs.
+
+.. tip::
+  You can provide multiple expected PURLs files.
+
+
+.. autoclass:: scanpipe.pipelines.benchmark_purls.BenchmarkPurls()
+    :members:
+    :member-order: bysource
+
+
 .. _pipeline_collect_strings_gettext:
 
 Collect string with Xgettext (addon)

diff --git a/docs/scanpipe-pipes.rst b/docs/scanpipe-pipes.rst
@@ -8,6 +8,11 @@ Generic
 .. automodule:: scanpipe.pipes
     :members:
 
+Benchmark
+---------
+.. automodule:: scanpipe.pipes.benchmark
+    :members:
+
 ClamAV
 ------
 .. automodule:: scanpipe.pipes.clamav

diff --git a/pyproject.toml b/pyproject.toml
@@ -135,6 +135,7 @@ run = "scancodeio:combined_run"
 analyze_docker_image = "scanpipe.pipelines.analyze_docker:Docker"
 analyze_root_filesystem_or_vm_image = "scanpipe.pipelines.analyze_root_filesystem:RootFS"
 analyze_windows_docker_image = "scanpipe.pipelines.analyze_docker_windows:DockerWindows"
+benchmark_purls = "scanpipe.pipelines.benchmark_purls:BenchmarkPurls"
 collect_strings_gettext = "scanpipe.pipelines.collect_strings_gettext:CollectStringsGettext"
 collect_symbols_ctags = "scanpipe.pipelines.collect_symbols_ctags:CollectSymbolsCtags"
 collect_symbols_pygments = "scanpipe.pipelines.collect_symbols_pygments:CollectSymbolsPygments"

diff --git a/scanpipe/models.py b/scanpipe/models.py
@@ -1147,12 +1147,12 @@ def get_output_file_path(self, name, extension):
         filename = f"{name}-{filename_now()}.{extension}"
         return self.output_path / filename
 
-    def get_latest_output(self, filename):
+    def get_latest_output(self, filename, extension="json"):
         """
         Return the latest output file with the "filename" prefix, for example
         "scancode-<timestamp>.json".
         """
-        output_files = sorted(self.output_path.glob(f"*{filename}*.json"))
+        output_files = sorted(self.output_path.glob(f"*{filename}*.{extension}"))
         if output_files:
             return output_files[-1]
 

diff --git a/scanpipe/pipelines/benchmark_purls.py b/scanpipe/pipelines/benchmark_purls.py
@@ -0,0 +1,57 @@
+# SPDX-License-Identifier: Apache-2.0
+#
+# http://nexb.com and https://github.com/aboutcode-org/scancode.io
+# The ScanCode.io software is licensed under the Apache License version 2.0.
+# Data generated with ScanCode.io is provided as-is without warranties.
+# ScanCode is a trademark of nexB Inc.
+#
+# You may not use this software except in compliance with the License.
+# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software distributed
+# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+#
+# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
+# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
+# ScanCode.io should be considered or used as legal advice. Consult an Attorney
+# for any legal advice.
+#
+# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
+# Visit https://github.com/aboutcode-org/scancode.io for support and download.
+
+from scanpipe.pipelines import Pipeline
+from scanpipe.pipes import benchmark
+
+
+class BenchmarkPurls(Pipeline):
+    """
+    Benchmark the project's discovered packages against a list of expected PURLs.
+
+    Provide the expected PURLs in a .txt file containing one PURL per line.
+    An input file is picked up when either:
+
+    - it carries the "purls" tag, or
+    - its filename ends with "purls.txt" (for example "expected_purls.txt").
+
+    """
+
+    download_inputs = False
+    is_addon = True
+
+    @classmethod
+    def steps(cls):
+        # compare_purls reads self.expected_purls, which get_expected_purls sets.
+        return cls.get_expected_purls, cls.compare_purls
+
+    def get_expected_purls(self):
+        """Read the expected PURLs from the project inputs and keep them on self."""
+        expected = benchmark.get_expected_purls(self.project)
+        self.expected_purls = expected
+
+    def compare_purls(self):
+        """Diff the project PURLs with the expected ones and save a .txt output."""
+        diff_lines = benchmark.compare_purls(self.project, self.expected_purls)
+        report_path = self.project.get_output_file_path("benchmark_purls", "txt")
+        report_content = "\n".join(diff_lines)
+        report_path.write_text(report_content)
diff --git a/scanpipe/pipes/benchmark.py b/scanpipe/pipes/benchmark.py
@@ -0,0 +1,68 @@
+# SPDX-License-Identifier: Apache-2.0
+#
+# http://nexb.com and https://github.com/aboutcode-org/scancode.io
+# The ScanCode.io software is licensed under the Apache License version 2.0.
+# Data generated with ScanCode.io is provided as-is without warranties.
+# ScanCode is a trademark of nexB Inc.
+#
+# You may not use this software except in compliance with the License.
+# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software distributed
+# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+#
+# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
+# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
+# ScanCode.io should be considered or used as legal advice. Consult an Attorney
+# for any legal advice.
+#
+# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
+# Visit https://github.com/aboutcode-org/scancode.io for support and download.
+
+import difflib
+
+
+def get_expected_purls(project):
+    """
+    Load the expected Package URLs (PURLs) from the project's input files.
+
+    A file is an expected PURLs source when its filename matches
+    ``*purls.txt`` or when its input source carries the "purls" tag.
+
+    Each line holds one PURL; surrounding whitespace is stripped and blank
+    lines are skipped so they are never reported as an expected PURL.
+    Return a sorted, deduplicated list of PURLs.
+    Raise an ``Exception`` when no PURL could be loaded from the inputs.
+    """
+    purls_files = list(project.inputs("*purls.txt"))
+    tagged_sources = project.inputsources.filter(tag="purls")
+    purls_files.extend(input_source.path for input_source in tagged_sources)
+
+    expected_purls = set()
+    for file_path in purls_files:
+        lines = file_path.read_text().splitlines()
+        expected_purls.update(filter(None, map(str.strip, lines)))
+
+    if not expected_purls:
+        raise Exception("Expected PURLs not provided.")
+    return sorted(expected_purls)
+
+
+def compare_purls(project, expected_purls):
+    """
+    Compare discovered project PURLs against the expected PURLs.
+
+    Return only the differences, as a list of ``difflib.ndiff`` lines:
+    - Lines starting with '-' are in the project but not expected.
+    - Lines starting with '+' are expected but missing from the project.
+    """
+    project_packages = project.discoveredpackages.only_package_url_fields()
+    project_purls = sorted({package.purl for package in project_packages})
+    # ndiff is order-sensitive: normalize the expected side the same way so an
+    # unsorted or duplicated input cannot produce spurious "-"/"+" pairs.
+    expected_purls = sorted(set(expected_purls))
+    # ndiff(a, b) marks "-" lines as only in a and "+" lines as only in b;
+    # the "  " (common) and "? " (hint) lines are dropped.
+    diff_result = difflib.ndiff(project_purls, expected_purls)
+    return [line for line in diff_result if line.startswith(("-", "+"))]
diff --git a/scanpipe/tests/data/benchmark/alpine-3.22.1-expected-benchmark.txt b/scanpipe/tests/data/benchmark/alpine-3.22.1-expected-benchmark.txt
@@ -0,0 +1,3 @@
+- pkg:alpine/alpine-keys@2.5-r0?arch=x86_64
++ pkg:alpine/zlib@1.3.2-r2?arch=x86_64
++ pkg:deb/debian/alpine-keys@2.5-r0?arch=x86_64
diff --git a/scanpipe/tests/data/benchmark/alpine-3.22.1-expected-purls.txt b/scanpipe/tests/data/benchmark/alpine-3.22.1-expected-purls.txt
@@ -0,0 +1,17 @@
+pkg:alpine/alpine-baselayout@3.7.0-r0?arch=x86_64
+pkg:alpine/alpine-baselayout-data@3.7.0-r0?arch=x86_64
+pkg:deb/debian/alpine-keys@2.5-r0?arch=x86_64
+pkg:alpine/alpine-release@3.22.1-r0?arch=x86_64
+pkg:alpine/apk-tools@2.14.9-r2?arch=x86_64
+pkg:alpine/busybox@1.37.0-r18?arch=x86_64
+pkg:alpine/busybox-binsh@1.37.0-r18?arch=x86_64
+pkg:alpine/ca-certificates-bundle@20250619-r0?arch=x86_64
+pkg:alpine/libapk2@2.14.9-r2?arch=x86_64
+pkg:alpine/libcrypto3@3.5.1-r0?arch=x86_64
+pkg:alpine/libssl3@3.5.1-r0?arch=x86_64
+pkg:alpine/musl@1.2.5-r10?arch=x86_64
+pkg:alpine/musl-utils@1.2.5-r10?arch=x86_64
+pkg:alpine/scanelf@1.3.8-r1?arch=x86_64
+pkg:alpine/ssl_client@1.37.0-r18?arch=x86_64
+pkg:alpine/zlib@1.3.1-r2?arch=x86_64
+pkg:alpine/zlib@1.3.2-r2?arch=x86_64