Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -21,4 +21,6 @@ curator.results

# Ignore all .tar files
# Manually running of some example files may create local .tar files
*.tar
*.tar

.claude
2 changes: 1 addition & 1 deletion README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1023,7 +1023,7 @@ The configuration may also specify additional tags to add to the image:
# See docs/global-configuration.rst for more information on these attributes.
#enabled: false
#scanner: "trivy"
#version: "latest"
#version: "0.69.3"
# NOTE: Any configuration provided here will be merged with global/command line config
#config:
# optional-param: val1
Expand Down
112 changes: 112 additions & 0 deletions buildrunner/cleanup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
"""
Copyright 2026 Adobe
All Rights Reserved.

NOTICE: Adobe permits you to use, modify, and distribute this file in accordance
with the terms of the Adobe license agreement accompanying it.
"""

import atexit
import os
import signal
import sys

# Global registry of container IDs started by this buildrunner process.
# Used by the signal handler to force-remove containers on abort.
#
# Design notes:
# - No threading.Lock: signal handlers in Python run in the main thread and
# must not acquire locks (deadlock risk). Python's GIL makes list.append()
# and list snapshot (list(...)) atomic enough for this use case.
# - No logging in signal handler: logging acquires internal locks.
# - _cleanup_done is a simple bool flag; only the main thread writes it.

_registered_containers: list[str] = []
_docker_client = None
_cleanup_done = False


def set_docker_client(client) -> None:
"""Set the Docker client used for cleanup. Call from main thread after client init."""
global _docker_client
_docker_client = client


def register_container(container_id: str) -> None:
"""Register a container for cleanup on signal/exit."""
if container_id and container_id not in _registered_containers:
_registered_containers.append(container_id)


def unregister_container(container_id: str) -> None:
"""Unregister a container after successful removal."""
try:
_registered_containers.remove(container_id)
except ValueError:
pass


def _force_cleanup_all() -> None:
"""Force-remove all registered containers.

Called from:
- atexit hook (normal exit, unhandled exception)
- signal handler (SIGTERM/SIGINT from CI system aborting the build)

Safe to call multiple times; only the first call does work.
"""
global _cleanup_done
if _cleanup_done:
return
_cleanup_done = True

# Snapshot the list — no lock needed, GIL protects list copy
containers = list(_registered_containers)
if not containers:
return

client = _docker_client
if not client:
try:
import docker as docker_module

client = docker_module.from_env()
except Exception:
# No way to reach Docker — print to stderr (signal-safe)
print(
f"buildrunner cleanup: cannot create Docker client, "
f"{len(containers)} container(s) may be orphaned",
file=sys.stderr,
)
return

print(
f"buildrunner cleanup: removing {len(containers)} container(s)",
file=sys.stderr,
)

for cid in containers:
try:
client.remove_container(cid, force=True, v=True)
except Exception:
# Container may already be removed by normal cleanup — that's fine
pass


def _signal_handler(signum, _frame):
"""Handle SIGTERM/SIGINT by cleaning up containers then exiting.

Avoids logging, locks, and complex operations to remain signal-safe.
Uses os._exit() instead of sys.exit() to prevent finally blocks from
running (which would race with the cleanup we just did).
"""
_force_cleanup_all()
# os._exit() skips finally blocks and atexit — we already cleaned up
os._exit(128 + signum)


def install_signal_handlers() -> None:
"""Install SIGTERM/SIGINT handlers and atexit hook for container cleanup."""
signal.signal(signal.SIGTERM, _signal_handler)
signal.signal(signal.SIGINT, _signal_handler)
atexit.register(_force_cleanup_all)
12 changes: 11 additions & 1 deletion buildrunner/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@
BuildRunner,
BuildRunnerConfigurationError,
)
from buildrunner import config, loggers
from buildrunner import config, docker as br_docker, loggers
from buildrunner.cleanup import install_signal_handlers, set_docker_client
from buildrunner.config import BuildRunnerConfig
from buildrunner.utils import epoch_time

Expand Down Expand Up @@ -417,8 +418,17 @@ def main():
print(__version__)
return os.EX_OK

# Install signal handlers so that SIGTERM/SIGINT (e.g. from CI system aborting
# a build) triggers cleanup of all Docker containers started by this process.
install_signal_handlers()

try:
build_runner = initialize_br(args)
# Give the cleanup module access to the Docker client for signal-time cleanup
try:
set_docker_client(br_docker.new_client())
except Exception:
pass # Cleanup will create its own client if needed
if not args.print_generated_files:
build_runner.run()
if build_runner.exit_code:
Expand Down
2 changes: 1 addition & 1 deletion buildrunner/config/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ class GlobalSecurityScanConfig(BaseModel, extra="forbid"):

enabled: bool = False
scanner: str = "trivy"
version: str = "latest"
version: str = "0.69.3"
# The local cache directory for the scanner (used if supported by the scanner)
cache_dir: Optional[str] = Field(None, alias="cache-dir")
config: dict = {
Expand Down
12 changes: 8 additions & 4 deletions buildrunner/docker/daemon.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import os


from buildrunner.cleanup import register_container, unregister_container
from buildrunner.docker import DOCKER_DEFAULT_DOCKERD_URL

DAEMON_IMAGE_NAME = "busybox:latest"
Expand Down Expand Up @@ -92,6 +93,7 @@ def start(self):
else None,
)["Id"]
self.docker_client.start(self._daemon_container)
register_container(self._daemon_container)
self.log.write(
f"Created Docker daemon container {self._daemon_container:.10}\n"
)
Expand All @@ -104,12 +106,14 @@ def stop(self):
self.log.write(
f"Destroying Docker daemon container {self._daemon_container:.10}\n"
)
try:
if self._daemon_container:
if self._daemon_container:
try:
self.docker_client.remove_container(
self._daemon_container,
force=True,
v=True,
)
except Exception as _ex:
self.log.write(f"Failed to remove Docker daemon container: {_ex}\n")
except Exception as _ex:
self.log.write(f"Failed to remove Docker daemon container: {_ex}\n")
finally:
unregister_container(self._daemon_container)
4 changes: 4 additions & 0 deletions buildrunner/docker/multiplatform_image_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from python_on_whales import docker
from retry import retry

from buildrunner.cleanup import register_container, unregister_container
from buildrunner.config import BuildRunnerConfig
from buildrunner.config.models import MP_LOCAL_REGISTRY
from buildrunner.docker import get_dockerfile
Expand Down Expand Up @@ -166,6 +167,7 @@ def _start_local_registry(self):
if self._docker_registry:
image = f"{self._docker_registry}/{image}"
container = docker.run(image, detach=True, publish_all=True)
register_container(container.name)
ports = container.network_settings.ports

# If any assert fails something changed in the registry image and we need to update this code
Expand Down Expand Up @@ -199,6 +201,8 @@ def _stop_local_registry(self):
LOGGER.error(
f"Failed to stop and remove local registry {self._mp_registry_info.name}: {err}"
)
finally:
unregister_container(self._mp_registry_info.name)
self._local_registry_is_running = False
else:
LOGGER.warning("Local registry is not running when attempting to stop it")
Expand Down
4 changes: 4 additions & 0 deletions buildrunner/docker/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
import timeout_decorator

from buildrunner import BuildRunnerConfig
from buildrunner.cleanup import register_container, unregister_container
from buildrunner.docker import (
new_client,
force_remove_container,
Expand Down Expand Up @@ -296,6 +297,7 @@ def start(
# start the container
self.container = self.docker_client.create_container(self.image_name, **kwargs)
self.docker_client.start(self.container["Id"])
register_container(self.container["Id"])

# run any supplied provisioners
if provisioners:
Expand Down Expand Up @@ -361,6 +363,8 @@ def cleanup(self):
print(
f'Unable to delete docker container with id "{self.container["Id"]}"'
)
finally:
unregister_container(self.container["Id"])

self.container = None

Expand Down
7 changes: 5 additions & 2 deletions buildrunner/sshagent/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,10 @@
)
from paramiko.agent import AgentSSH
from paramiko.common import io_sleep
from paramiko.util import asbytes
from paramiko.message import Message

import buildrunner.docker.builder
from buildrunner.cleanup import register_container, unregister_container
from buildrunner.errors import (
BuildRunnerConfigurationError,
BuildRunnerProcessingError,
Expand Down Expand Up @@ -152,6 +152,7 @@ def start(self, keys):
self.docker_client.start(
self._ssh_agent_container,
)
register_container(self._ssh_agent_container)
self.log.write(f"Created ssh-agent container {self._ssh_agent_container:.10}\n")

_ssh_host = "localhost"
Expand Down Expand Up @@ -268,6 +269,8 @@ def stop(self):
)
except Exception as _ex:
self.log.write(f"Error destroying ssh-agent container: {_ex}\n")
finally:
unregister_container(self._ssh_agent_container)

def get_ssh_agent_image(self):
"""
Expand Down Expand Up @@ -420,7 +423,7 @@ def _send_reply(self, msg):
"""
Send a reply back to the upstream agent.
"""
raw_msg = asbytes(msg)
raw_msg = msg.asbytes()
length = struct.pack(">I", len(raw_msg))
self._remote_channel.send(length + raw_msg)

Expand Down
17 changes: 12 additions & 5 deletions buildrunner/steprunner/tasks/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import python_on_whales

import buildrunner.docker
from buildrunner.cleanup import register_container, unregister_container
from buildrunner.config import BuildRunnerConfig
from buildrunner.config.models_step import RunAndServicesBase, StepRun, Service
from buildrunner.docker.daemon import DockerDaemonProxy
Expand Down Expand Up @@ -116,6 +117,7 @@ def _get_source_container(self):
self._docker_client.start(
self._source_container,
)
register_container(self._source_container)
self.step_runner.log.info(
f"Created source container {self._source_container:.10}"
)
Expand Down Expand Up @@ -1206,11 +1208,16 @@ def cleanup(self, context): # pylint: disable=unused-argument
self.step_runner.log.info(
f"Destroying source container {self._source_container:.10}"
)
self._docker_client.remove_container(
self._source_container,
force=True,
v=True,
)
try:
self._docker_client.remove_container(
self._source_container,
force=True,
v=True,
)
except Exception as _ex:
self.step_runner.log.info(f"Error removing source container: {_ex}")
finally:
unregister_container(self._source_container)

for image in self.images_to_remove:
python_on_whales.docker.image.remove(image, force=True)
Expand Down
54 changes: 54 additions & 0 deletions docs/common-issues.rst
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,60 @@ A couple of potential solutions are:

Neither of these solutions are perfect, but both significantly shrink the chance of encountering the race condition.

Container cleanup on build abort
#################################

When a build is interrupted (e.g. a CI system sends ``SIGTERM`` to abort a superseded PR build),
buildrunner needs to clean up the Docker containers it started. Without proper cleanup, containers
running ``/usr/sbin/init`` (systemd) or ``/run.sh`` (sshd) will remain running indefinitely,
accumulating over time and consuming disk, memory, and network resources on the build agent.

How cleanup works
=================

Buildrunner tracks every Docker container it starts in a global registry. Cleanup happens in two ways:

1. **Normal exit**: Each build step has a ``finally`` block that calls ``cleanup()`` on all containers
it created (the build container, service containers, SSH agent, Docker daemon proxy, and source
container). These ``finally`` blocks run on normal completion, exceptions, and ``sys.exit()``.

2. **Signal-based cleanup**: A ``SIGTERM`` or ``SIGINT`` handler is installed at startup. When the
signal is received, the handler force-removes all registered containers via
``docker remove --force`` and then calls ``os._exit()`` to terminate immediately. This covers
the case where the process is killed externally before ``finally`` blocks can run.

The signal handler uses ``os._exit()`` (not ``sys.exit()``) to avoid triggering ``finally`` blocks
after cleanup has already been performed, which would cause race conditions and double-removal
attempts.

Limitations
===========

- ``SIGKILL`` (``kill -9``) cannot be caught by any handler. If the process is killed with
``SIGKILL``, containers will be orphaned. CI systems should always send ``SIGTERM`` first and
allow time for cleanup before escalating to ``SIGKILL``.

- Containers started *inside* a buildrunner container (e.g. via testcontainers or direct
``docker run`` commands within a build step) are not tracked by the cleanup registry. These
child containers must be cleaned up by the code that started them.

- If the Docker daemon is unreachable when the signal handler runs, cleanup will fail silently
and a warning will be printed to stderr.

Diagnosing orphaned containers
==============================

Containers started by buildrunner are labeled with the labels passed via ``--container-labels``.
To find orphaned buildrunner containers on an agent:

.. code:: bash

# List all containers with buildrunner labels (adjust label key to match your setup)
docker ps -a --filter label=com.example.build.source=buildrunner

# Force-remove all orphaned buildrunner containers
docker ps -q --filter label=com.example.build.source=buildrunner | xargs -r docker rm -f

Utilizing multi-platform base images
####################################

Expand Down
Loading
Loading