aspect-build · thesayyn · Oct 4, 2024 · Oct 3, 2024 · Oct 3, 2024 · Oct 4, 2024
diff --git a/MODULE.bazel b/MODULE.bazel
@@ -8,7 +8,7 @@ module(
 
 # Lower-bound versions of direct dependencies.
 # When bumping, add a comment explaining what's required from the newer release.
-bazel_dep(name = "aspect_bazel_lib", version = "1.40.0")
+bazel_dep(name = "aspect_bazel_lib", version = "2.9.1") # py_image_layer requires 2.x for the `tar` rule.
 bazel_dep(name = "bazel_skylib", version = "1.4.2")
 bazel_dep(name = "rules_python", version = "0.29.0")
 bazel_dep(name = "platforms", version = "0.0.7")

diff --git a/docs/BUILD.bazel b/docs/BUILD.bazel
@@ -31,6 +31,11 @@ stardoc_with_diff_test(
     bzl_library_target = "//py/private:py_pex_binary",
 )
 
+stardoc_with_diff_test(
+    name = "py_image_layer",
+    bzl_library_target = "//py/private:py_image_layer",
+)
+
 stardoc_with_diff_test(
     name = "venv",
     bzl_library_target = "//py/private:py_venv",

diff --git a/docs/py_image_layer.md b/docs/py_image_layer.md
diff --git a/py/BUILD.bazel b/py/BUILD.bazel
@@ -38,6 +38,7 @@ bzl_library(
         "//py/private:py_wheel",
         "//py/private:virtual",
         "//py/private:py_pex_binary",
+        "//py/private:py_image_layer",
         "@aspect_bazel_lib//lib:utils",
     ],
 )
diff --git a/py/defs.bzl b/py/defs.bzl
@@ -38,12 +38,13 @@ python.toolchain(python_version = "3.9", is_default = True)
 load("@aspect_bazel_lib//lib:utils.bzl", "propagate_common_rule_attributes")
 load("//py/private:py_binary.bzl", _py_binary = "py_binary", _py_test = "py_test")
 load("//py/private:py_executable.bzl", "determine_main")
+load("//py/private:py_image_layer.bzl", _py_image_layer = "py_image_layer")
 load("//py/private:py_library.bzl", _py_library = "py_library")
 load("//py/private:py_pex_binary.bzl", _py_pex_binary = "py_pex_binary")
 load("//py/private:py_pytest_main.bzl", _py_pytest_main = "py_pytest_main")
 load("//py/private:py_unpacked_wheel.bzl", _py_unpacked_wheel = "py_unpacked_wheel")
-load("//py/private:virtual.bzl", _resolutions = "resolutions")
 load("//py/private:py_venv.bzl", _py_venv = "py_venv")
+load("//py/private:virtual.bzl", _resolutions = "resolutions")
 
 py_pex_binary = _py_pex_binary
 py_pytest_main = _py_pytest_main
@@ -54,6 +55,8 @@ py_test_rule = _py_test
 py_library = _py_library
 py_unpacked_wheel = _py_unpacked_wheel
 
+py_image_layer = _py_image_layer
+
 resolutions = _resolutions
 
 def _py_binary_or_test(name, rule, srcs, main, deps = [], resolutions = {}, **kwargs):

diff --git a/py/private/BUILD.bazel b/py/private/BUILD.bazel
@@ -22,6 +22,14 @@ exports_files(
     visibility = ["//docs:__pkg__"],
 )
 
+bzl_library(
+    name = "py_image_layer",
+    srcs = ["py_image_layer.bzl"],
+    deps = [
+        "@aspect_bazel_lib//lib:tar",
+    ],
+)
+
 bzl_library(
     name = "py_binary",
     srcs = ["py_binary.bzl"],

diff --git a/py/private/py_image_layer.bzl b/py/private/py_image_layer.bzl
@@ -0,0 +1,156 @@
+"""py_image_layer macro for creating multiple layers from a py_binary
+
+> [!WARNING]
+> This macro is EXPERIMENTAL and is not subject to our SemVer guarantees.
+
+A py_binary that uses `torch` and `numpy` can use the following layer groups:
+
+```
+load("@rules_oci//oci:defs.bzl", "oci_image")
+load("@aspect_rules_py//py:defs.bzl", "py_image_layer", "py_binary")
+
+py_binary(
+    name = "my_app_bin",
+    deps = [
+        "@pip_deps//numpy",
+        "@pip_deps//torch"
+    ]
+)
+
+oci_image(
+    tars = py_image_layer(
+        name = "my_app",
+        py_binary = ":my_app_bin",
+        layer_groups = {
+            "torch": "pip_deps_torch.*",
+            "numpy": "pip_deps_numpy.*",
+        }
+    )
+)
+```
+"""
+
+load("@aspect_bazel_lib//lib:tar.bzl", "mtree_spec", "tar")
+
+# Default layer groups documented by `py_image_layer`: maps a layer name to a
+# regular expression that is matched against the path of each file in the image.
+default_layer_groups = {
+    # match *only* external pip-like repositories that contain the string "site-packages"
+    "packages": "\\.runfiles/.*/site-packages",
+    # match *only* external repositories whose name contains the string "python"
+    # followed by a "-" (as in the platform triple of a hermetic interpreter repository).
+    # The repository name may carry a prefix, e.g. the bzlmod canonical name
+    # `rules_python~0.21.0~python~python3_9_...`, so "python" is not required to
+    # come directly after `.runfiles/`.
+    # e.g. this will match
+    #   `/hello_world/hello_world_bin.runfiles/rules_python~0.21.0~python~python3_9_aarch64-unknown-linux-gnu/bin/python3`
+    # but not match
+    #   `/hello_world/hello_world_bin.runfiles/_main/python_app`
+    "interpreter": "\\.runfiles/.*python.*-.*/",
+}
+
+def _split_mtree_into_layer_groups(name, root, groups, group_names, **kwargs):
+    """Declare a genrule that splits `<name>.manifest` into one mtree spec per layer group.
+
+    The genrule runs `awk` over the manifest. Each entry is prefixed with `.<root>` and
+    appended to the spec of the first group whose regex matches its path; entries that
+    match no group are appended to the `default` spec. Entries matching `.whl` are dropped.
+
+    Args:
+        name: base name; the genrule reads `<name>.manifest` and is named `_<name>_manifests`
+        root: string inserted after the leading `.` of every entry path
+        groups: dict of group name to regex, tested in iteration order
+        group_names: names of every spec file to produce; one output is declared per name
+        **kwargs: forwarded to the underlying genrule
+    """
+
+    # Every output spec starts with the "#mtree" signature line, written from awk's BEGIN block.
+    header_stmts = []
+    for group_name in group_names:
+        header_stmts.append(
+            'print "#mtree" >> "$(RULEDIR)/%s.%s.manifest.spec";' % (name, group_name),
+        )
+    begin_body = "\n".join(header_stmts)
+
+    # One awk `if` per layer group: on a match the entry is appended to that
+    # group's spec and `next` skips the remaining checks for that entry.
+    matcher = """\
+if ($$1 ~ "%s") {
+    print $$0 >> "$(RULEDIR)/%s.%s.manifest.spec";
+    next
+}"""
+    dispatch = "\n".join([
+        matcher % (pattern, name, group_name)
+        for (group_name, pattern) in groups.items()
+    ])
+
+    awk_script = """\
+awk < $< 'BEGIN {
+    %s
+}
+{
+    # Exclude .whl files from container images
+    if ($$1 ~ ".whl") {
+        next
+    }
+    # Move everything under the specified root
+    sub(/^/, ".%s")
+    # Match by regexes and write to the destination.
+    %s
+    # Every line that did not match the layer groups will go into the default layer.
+    print $$0 >> "$(RULEDIR)/%s.default.manifest.spec"
+}'
+"""
+
+    spec_files = []
+    for group_name in group_names:
+        spec_files.append("%s.%s.manifest.spec" % (name, group_name))
+
+    native.genrule(
+        name = "_%s_manifests" % name,
+        srcs = [name + ".manifest"],
+        outs = spec_files,
+        cmd = awk_script % (begin_body, root, dispatch, name),
+        **kwargs
+    )
+
+
+def py_image_layer(name, py_binary, root = None, layer_groups = {}, compress = "gzip", tar_args = ["--options", "gzip:!timestamp"], **kwargs):
+    """Produce a separate tar output for each layer of a python app
+
+    > Requires `awk` to be installed on the host machine/rbe runner.
+
+    For better performance, it is recommended to split the output of a py_binary into multiple layers.
+    This can be done by grouping files into layers based on their path by using the `layer_groups` attribute.
+
+    The matching order for layer groups is as follows:
+        1. `layer_groups` are checked first.
+        2. If no match is found for `layer_groups`, the `default layer groups` are checked.
+        3. Any remaining files are placed into the default layer.
+
+    The default layer groups are defined by `default_layer_groups` in this file:
+        - `packages`: contains third-party deps (paths under `site-packages`)
+        - `interpreter`: contains the python interpreter
+
+    An entry in `layer_groups` that reuses one of these names replaces the default regex for that group.
+    The group name `default` is reserved for the catch-all layer.
+
+    Args:
+        name: base name for targets
+        py_binary: a py_binary target
+        root: Path to where the layers should be rooted. Must start with `/`. If not specified, `/` is used.
+        layer_groups: Additional layer groups to create. They are used to group files into layers based on their path. In the form of: ```{"<name>": "regex_to_match_against_file_paths"}```
+        compress: Compression algorithm to use. Default is gzip. See: https://github.com/bazel-contrib/bazel-lib/blob/main/docs/tar.md#tar_rule
+        tar_args: Additional arguments to pass to the tar rule. Default is `["--options", "gzip:!timestamp"]`. See: https://github.com/bazel-contrib/bazel-lib/blob/main/docs/tar.md#tar_rule
+        **kwargs: attribute that apply to all targets expanded by the macro
+
+    Returns:
+        A list of target names, one for each layer.
+    """
+    if root == None:
+        # The root is formatted into the awk script; leaving it as None would
+        # prefix every path in the manifest with the literal string ".None".
+        root = "/"
+    elif not root.startswith("/"):
+        fail("root path must start with '/' but got '{root}', expected '/{root}'".format(root = root))
+
+    # Produce the manifest for a tar file of our py_binary, but don't tar it up yet, so we can split
+    # into fine-grained layers for better pull, push and remote cache performance.
+    mtree_spec(
+        name = name + ".manifest",
+        srcs = [py_binary],
+        **kwargs
+    )
+
+    # User-provided groups come first so they are matched first; the default
+    # groups follow, unless the user already defined a group with the same name.
+    groups = dict(layer_groups)
+    for default_name, default_regex in default_layer_groups.items():
+        groups.setdefault(default_name, default_regex)
+
+    # "default" is appended below as the catch-all layer; a user group with the
+    # same name would declare the same output file twice.
+    if "default" in groups:
+        fail("layer group name 'default' is reserved for files that match no other layer group")
+
+    group_names = groups.keys() + ["default"]
+
+    _split_mtree_into_layer_groups(name, root, groups, group_names, **kwargs)
+
+    # Finally create layers using the tar rule
+    result = []
+    for group_name in group_names:
+        tar_target = "_{}_{}".format(name, group_name)
+        tar(
+            name = tar_target,
+            srcs = [py_binary],
+            mtree = "{}.{}.manifest.spec".format(name, group_name),
+            compress = compress,
+            args = tar_args,
+            **kwargs
+        )
+        result.append(tar_target)
+
+    return result
diff --git a/py/repositories.bzl b/py/repositories.bzl
@@ -30,11 +30,12 @@ def rules_py_dependencies():
         url = "https://github.com/bazelbuild/bazel-skylib/archive/refs/tags/1.5.0.tar.gz",
     )
 
+    # py_image_layer requires 2.x for the `tar` rule.
     http_archive(
         name = "aspect_bazel_lib",
-        sha256 = "6e6f8ac3c601d6df25810cd51e51d85831e3437e873b152c5c4ecd3b96964bc8",
-        strip_prefix = "bazel-lib-1.42.3",
-        url = "https://github.com/aspect-build/bazel-lib/archive/refs/tags/v1.42.3.tar.gz",
+        sha256 = "f93d386d8d0b0149031175e81df42a488be4267c3ca2249ba5321c23c60bc1f0",
+        strip_prefix = "bazel-lib-2.9.1",
+        url = "https://github.com/bazel-contrib/bazel-lib/releases/download/v2.9.1/bazel-lib-v2.9.1.tar.gz",
     )
 
     http_archive(

diff --git a/py/toolchains.bzl b/py/toolchains.bzl
@@ -1,12 +1,12 @@
 """Declare toolchains"""
 
+load("@aspect_bazel_lib//lib:repositories.bzl", "register_tar_toolchains")
 load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_file")
 load("//py/private/toolchain:autodetecting.bzl", _register_autodetecting_python_toolchain = "register_autodetecting_python_toolchain")
 load("//py/private/toolchain:repo.bzl", "prerelease_toolchains_repo", "toolchains_repo")
 load("//py/private/toolchain:tools.bzl", "TOOLCHAIN_PLATFORMS", "prebuilt_tool_repo")
 load("//tools:version.bzl", "IS_PRERELEASE")
 
-
 register_autodetecting_python_toolchain = _register_autodetecting_python_toolchain
 
 DEFAULT_TOOLS_REPOSITORY = "rules_py_tools"
@@ -19,6 +19,9 @@ def rules_py_toolchains(name = DEFAULT_TOOLS_REPOSITORY, register = True, is_pre
         register: whether to call the register_toolchains, should be True for WORKSPACE and False for bzlmod.
         is_prerelease: True iff there are no pre-built tool binaries for this version of rules_py
     """
+
+    register_tar_toolchains(register = register)
+
     if is_prerelease:
         prerelease_toolchains_repo(name = name)
         if register: