Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@ Changelog
Unreleased
----------

- Include layers in docker image data.
https://github.com/nexB/scancode.io/issues/175

- Fix a server error on resource details view when the compliance alert is "missing".
https://github.com/nexB/scancode.io/issues/344

Expand Down
71 changes: 58 additions & 13 deletions scanpipe/pipes/docker.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,48 +36,93 @@

def extract_images_from_inputs(project):
"""
Collects all the tarballs from the `project` input/ work directory, extracts each
tarball to the tmp/ work directory and collects the images.
Returns the `images` and `errors` that may have happened during the extraction.
Collects all the tarballs from the `project` input/ work directory, extracts
each tarball to the tmp/ work directory and collects the images.

Returns the `images` and an `errors` list of error messages that may have
happened during the extraction.
"""
target_path = project.tmp_path
images = []
errors = []

for input_tarball in project.inputs(pattern="*.tar*"):
extract_target = target_path / f"{input_tarball.name}-extract"
extract_errors = extract_archive(input_tarball, extract_target)
images.extend(Image.get_images_from_dir(extract_target))
errors.extend(extract_errors)
imgs, errs = extract_image_from_tarball(input_tarball, extract_target)
images.extend(imgs)
errors.extend(errs)

return images, errors


def extract_image_from_tarball(input_tarball, extract_target, verify=True):
    """
    Extract the ``input_tarball`` archive into the ``extract_target`` directory
    Path object, then collect the Docker images found there.

    The ``verify`` flag is forwarded to ``Image.get_images_from_dir``
    unchanged — presumably it toggles image integrity checks in
    container_inspector; confirm against that library's documentation.

    Returns a two-tuple of the collected `images` and an `errors` list of
    error messages that may have happened during the extraction.
    """
    extraction_errors = [
        error for error in extract_archive(location=input_tarball, target=extract_target)
    ]
    collected_images = Image.get_images_from_dir(
        extracted_location=str(extract_target),
        verify=verify,
    )
    return collected_images, extraction_errors


def extract_layers_from_images(project, images):
"""
Extracts all layers from the provided `images` into the `project` codebase/ work
Extracts all layers from the provided `images` into the `project` codebase
work directory.

Returns an `errors` list of error messages that may occur during the
extraction.
"""
return extract_layers_from_images_to_base_path(
base_path=project.codebase_path,
images=images,
)


def extract_layers_from_images_to_base_path(base_path, images):
"""
Extracts all layers from the provided `images` into the `base_path` work
directory.
Returns the `errors` that may happen during the extraction.

Returns an `errors` list of error messages that may occur during the
extraction.
"""
errors = []
base_path = Path(base_path)

for image in images:
image_dirname = Path(image.extracted_location).name
target_path = project.codebase_path / image_dirname
target_path = base_path / image_dirname

for layer in image.layers:
extract_target = target_path / layer.layer_id
extract_errors = extract_archive(layer.archive_location, extract_target)
extract_errors = extract_archive(
location=layer.archive_location,
target=extract_target,
)
errors.extend(extract_errors)
layer.extracted_location = str(extract_target)

return errors


def get_image_data(image):
def get_image_data(image, layer_path_segments=2):
"""
Returns a mapping of image-related data given an `image`.
Keep only ``layer_path_segments`` trailing layer location segments (or keep
the locations unmodified if ``layer_path_segments`` is 0).
"""
exclude = ["extracted_location", "archive_location", "layers"]
exclude_from_img = ["extracted_location", "archive_location"]
image_data = {
key: value for key, value in image.to_dict().items() if key not in exclude
key: value
for key, value in image.to_dict(layer_path_segments=layer_path_segments).items()
if key not in exclude_from_img
}
return image_data

Expand Down
Binary file added scanpipe/tests/data/docker-images.tar.gz
Binary file not shown.
394 changes: 394 additions & 0 deletions scanpipe/tests/data/docker-images.tar.gz-expected-data-1.json

Large diffs are not rendered by default.

394 changes: 394 additions & 0 deletions scanpipe/tests/data/docker-images.tar.gz-expected-data-2.json

Large diffs are not rendered by default.

12 changes: 0 additions & 12 deletions scanpipe/tests/test_pipes.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@
from scanpipe.models import DiscoveredPackage
from scanpipe.models import Project
from scanpipe.pipes import codebase
from scanpipe.pipes import docker
from scanpipe.pipes import fetch
from scanpipe.pipes import filename_now
from scanpipe.pipes import make_codebase_resource
Expand Down Expand Up @@ -744,17 +743,6 @@ def test_scanpipe_pipes_fetch_fetch_urls(self, mock_get):
self.assertEqual(2, len(errors))
self.assertEqual(urls, errors)

def test_scanpipe_pipes_docker_tag_whiteout_codebase_resources(self):
p1 = Project.objects.create(name="Analysis")
resource1 = CodebaseResource.objects.create(project=p1, path="filename.ext")
resource2 = CodebaseResource.objects.create(project=p1, name=".wh.filename2")

docker.tag_whiteout_codebase_resources(p1)
resource1.refresh_from_db()
resource2.refresh_from_db()
self.assertEqual("", resource1.status)
self.assertEqual("ignored-whiteout", resource2.status)

def test_scanpipe_pipes_rootfs_from_project_codebase_class_method(self):
p1 = Project.objects.create(name="Analysis")
root_filesystems = list(rootfs.RootFs.from_project_codebase(p1))
Expand Down
88 changes: 88 additions & 0 deletions scanpipe/tests/test_pipes_docker.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
# SPDX-License-Identifier: Apache-2.0
#
# http://nexb.com and https://github.com/nexB/scancode.io
# The ScanCode.io software is licensed under the Apache License version 2.0.
# Data generated with ScanCode.io is provided as-is without warranties.
# ScanCode is a trademark of nexB Inc.
#
# You may not use this software except in compliance with the License.
# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software distributed
# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
# CONDITIONS OF ANY KIND, either express or implied. See the License for the
# specific language governing permissions and limitations under the License.
#
# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
# ScanCode.io should be considered or used as legal advice. Consult an Attorney
# for any legal advice.
#
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
# Visit https://github.com/nexB/scancode.io for support and download.

import json
import tempfile
from pathlib import Path

from django.apps import apps
from django.test import TestCase

from scanpipe.models import CodebaseResource
from scanpipe.models import Project
from scanpipe.pipes import docker

scanpipe_app = apps.get_app_config("scanpipe")


class ScanPipeDockerPipesTest(TestCase):
    # Directory holding the test fixtures: the input image tarball and the
    # expected JSON result files.
    data_path = Path(__file__).parent / "data"

    def assertResultsEqual(self, expected_file, results, regen=False):
        """
        Assert that the ``results`` string is equal to the content of the
        ``expected_file`` Path.

        Set `regen` to True to regenerate the expected results.
        """
        if regen:
            expected_file.write_text(results)

        expected_data = expected_file.read_text()
        self.assertEqual(expected_data, results)

    def test_pipes_docker_get_image_data_contains_layers_with_relative_paths(self):
        """
        Check get_image_data() output against JSON fixtures, both before the
        layers are extracted (expected-data-1) and after (expected-data-2).
        """
        extract_target = str(Path(tempfile.mkdtemp()) / "tempdir")
        input_tarball = str(self.data_path / "docker-images.tar.gz")

        # Extract the image first
        # verify=False for the test fixture tarball — presumably this skips
        # container_inspector's image verification; confirm against its docs.
        images, errors = docker.extract_image_from_tarball(
            input_tarball,
            extract_target,
            verify=False,
        )
        self.assertEqual([], errors)

        images_data = [docker.get_image_data(i) for i in images]
        results = json.dumps(images_data, indent=2)
        expected_location = self.data_path / "docker-images.tar.gz-expected-data-1.json"
        self.assertResultsEqual(expected_location, results, regen=False)

        # Extract the layers second
        errors = docker.extract_layers_from_images_to_base_path(
            base_path=extract_target,
            images=images,
        )
        self.assertEqual([], errors)

        images_data = [docker.get_image_data(i) for i in images]
        results = json.dumps(images_data, indent=2)
        expected_location = self.data_path / "docker-images.tar.gz-expected-data-2.json"
        self.assertResultsEqual(expected_location, results, regen=False)

    def test_pipes_docker_tag_whiteout_codebase_resources(self):
        """
        Resources whose name carries the Docker whiteout prefix ".wh." get the
        "ignored-whiteout" status; other resources keep an empty status.
        """
        p1 = Project.objects.create(name="Analysis")
        resource1 = CodebaseResource.objects.create(project=p1, path="filename.ext")
        resource2 = CodebaseResource.objects.create(project=p1, name=".wh.filename2")

        docker.tag_whiteout_codebase_resources(p1)
        resource1.refresh_from_db()
        resource2.refresh_from_db()
        self.assertEqual("", resource1.status)
        self.assertEqual("ignored-whiteout", resource2.status)
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ install_requires =
# WSGI server
gunicorn==20.1.0
# Docker
container_inspector==21.6.10
container_inspector==30.0.0
# ScanCode-toolkit
scancode-toolkit[packages]==30.1.0
extractcode[full]==30.0.0
Expand Down