From 7c764d45997721705e43ac66fbb5a56747d00d40 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Tue, 16 May 2023 14:57:31 +0200 Subject: [PATCH] switch-root: always use MS_BIND to move api vfs over MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We previously would use MS_MOVE to move the old procfs, sysfs, /dev/ and /run/ to the new root in some cases, and MS_BIND in others. The logic for when to use MS_MOVE and when to use MS_BIND was pretty arbitrary so far: we'd use MS_MOVE during the initrd → host transition and MS_BIND when transitioning from host into the exitrd during shutdown. Traditionally, using MS_MOVE was preferable, because we didn't bother with unmounting the old mount hierarchy before the switch root, and thus using MS_MOVE did some clean-up as a side effect (because the old mounts went away this way). But since we nowadays properly unmount all remaining mount points when transitioning (since 268d1244e87a35ff8dff56c92ef375ebf69d462e), that side effect is no longer needed. Let's just use MS_BIND always. Let's tweak it though: let's use MS_BIND|MS_REC for the kernel API VFS, and MS_BIND without MS_REC for /run/. The latter reflects the fact that the submounts of /run/ are usually not so much about accessing kernel APIs as about auxiliary user resources. Hence let's only move the main mount over for that. While we are at it, also set up the base filesystem *before* we move the mounts from the old to the new root, since the base filesystem setup logic creates various needed inodes for us, which we really should make use of instead of creating them on our own. 
--- src/core/main.c | 2 -- src/shared/switch-root.c | 61 +++++++++++++++++++++++++--------------- src/shared/switch-root.h | 2 +- src/shutdown/shutdown.c | 6 +--- 4 files changed, 40 insertions(+), 31 deletions(-) diff --git a/src/core/main.c b/src/core/main.c index dc7e4a9767403..28cfca25b3eff 100644 --- a/src/core/main.c +++ b/src/core/main.c @@ -1838,10 +1838,8 @@ static int do_reexecute( } if (switch_root_dir) { - /* And switch root with MS_MOVE, because we remove the old directory afterwards and detach it. */ r = switch_root(/* new_root= */ switch_root_dir, /* old_root_after= */ NULL, - MS_MOVE, /* destroy_old_root= */ objective == MANAGER_SWITCH_ROOT); if (r < 0) log_error_errno(r, "Failed to switch root, trying to continue: %m"); diff --git a/src/shared/switch-root.c b/src/shared/switch-root.c index b1cbcc5e91715..2ea65eafaf1dd 100644 --- a/src/shared/switch-root.c +++ b/src/shared/switch-root.c @@ -10,6 +10,7 @@ #include "base-filesystem.h" #include "chase.h" +#include "creds-util.h" #include "fd-util.h" #include "initrd-util.h" #include "log.h" @@ -27,15 +28,26 @@ int switch_root(const char *new_root, const char *old_root_after, /* path below the new root, where to place the old root after the transition; may be NULL to unmount it */ - unsigned long mount_flags, /* MS_MOVE or MS_BIND used for /proc/, /dev/, /run/, /sys/ */ bool destroy_old_root) { + struct { + const char *path; + unsigned long mount_flags; + } transfer_table[] = { + { "/dev", MS_BIND|MS_REC }, /* Recursive, because we want to save the original /dev/shm + /dev/pts and similar */ + { "/sys", MS_BIND|MS_REC }, /* Similar, we want to retain various API VFS, or the cgroupv1 /sys/fs/cgroup/ tree */ + { "/proc", MS_BIND|MS_REC }, /* Similar */ + { "/run", MS_BIND }, /* Stuff mounted below this we don't save, as it might have lost its relevance, i.e. 
credentials, removable media and such, we rather want that the new boot mounts this fresh */ + { SYSTEM_CREDENTIALS_DIRECTORY, MS_BIND }, /* Credentials passed into the system should survive */ + { ENCRYPTED_SYSTEM_CREDENTIALS_DIRECTORY, MS_BIND }, /* Similar */ + { "/run/host", MS_BIND|MS_REC }, /* Host supplied hierarchy should also survive */ + }; + _cleanup_close_ int old_root_fd = -EBADF, new_root_fd = -EBADF; _cleanup_free_ char *resolved_old_root_after = NULL; int r, istmp; assert(new_root); - assert(IN_SET(mount_flags, MS_MOVE, MS_BIND)); /* Check if we shall remove the contents of the old root */ old_root_fd = open("/", O_DIRECTORY|O_CLOEXEC); @@ -83,32 +95,35 @@ int switch_root(const char *new_root, if (mount(NULL, "/", NULL, MS_REC|MS_PRIVATE, NULL) < 0) return log_error_errno(errno, "Failed to set \"/\" mount propagation to private: %m"); - FOREACH_STRING(path, "/sys", "/dev", "/run", "/proc") { - _cleanup_free_ char *chased = NULL; - - r = chase(path, new_root, CHASE_PREFIX_ROOT|CHASE_NONEXISTENT, &chased, NULL); - if (r < 0) - return log_error_errno(r, "Failed to resolve %s/%s: %m", new_root, path); - if (r > 0) { - /* Already exists. Let's see if it is a mount point already. */ - r = path_is_mount_point(chased, NULL, 0); - if (r < 0) - return log_error_errno(r, "Failed to determine whether %s is a mount point: %m", chased); - if (r > 0) /* If it is already mounted, then do nothing */ - continue; - } else - /* Doesn't exist yet? */ - (void) mkdir_p_label(chased, 0755); - - if (mount(path, chased, NULL, mount_flags, NULL) < 0) - return log_error_errno(errno, "Failed to mount %s to %s: %m", path, chased); - } - /* Do not fail if base_filesystem_create() fails. Not all switch roots are like base_filesystem_create() wants * them to look like. They might even boot, if they are RO and don't have the FS layout. Just ignore the error * and switch_root() nevertheless. 
*/ (void) base_filesystem_create_fd(new_root_fd, new_root, UID_INVALID, GID_INVALID); + FOREACH_ARRAY(transfer, transfer_table, ELEMENTSOF(transfer_table)) { + _cleanup_free_ char *chased = NULL; + + if (access(transfer->path, F_OK) < 0) { + log_debug_errno(errno, "Path '%s' to move to target root directory, not found, ignoring: %m", transfer->path); + continue; + } + + r = chase(transfer->path, new_root, CHASE_PREFIX_ROOT, &chased, NULL); + if (r < 0) + return log_error_errno(r, "Failed to resolve %s/%s: %m", new_root, transfer->path); + + /* Let's see if it is a mount point already. */ + r = path_is_mount_point(chased, NULL, 0); + if (r < 0) + return log_error_errno(r, "Failed to determine whether %s is a mount point: %m", chased); + if (r > 0) /* If it is already mounted, then do nothing */ + continue; + + r = mount_nofollow_verbose(LOG_ERR, transfer->path, chased, NULL, transfer->mount_flags, NULL); + if (r < 0) + return r; + } + if (fchdir(new_root_fd) < 0) return log_error_errno(errno, "Failed to change directory to %s: %m", new_root); diff --git a/src/shared/switch-root.h b/src/shared/switch-root.h index cba84e9b01b63..9882d2ef91ef3 100644 --- a/src/shared/switch-root.h +++ b/src/shared/switch-root.h @@ -3,4 +3,4 @@ #include -int switch_root(const char *new_root, const char *old_root_after, unsigned long mount_flags, bool destroy_old_root); +int switch_root(const char *new_root, const char *old_root_after, bool destroy_old_root); diff --git a/src/shutdown/shutdown.c b/src/shutdown/shutdown.c index dc713e529f2ed..bd6ded5529ca4 100644 --- a/src/shutdown/shutdown.c +++ b/src/shutdown/shutdown.c @@ -165,14 +165,10 @@ static int switch_root_initramfs(void) { if (mount(NULL, "/run/initramfs", NULL, MS_PRIVATE, NULL) < 0) return log_error_errno(errno, "Failed to make /run/initramfs private mount: %m"); - /* switch_root with MS_BIND, because there might still be processes lurking around, which have open file descriptors. 
- * /run/initramfs/shutdown will take care of these. - * Also do not detach the old root, because /run/initramfs/shutdown needs to access it. - */ + /* Do not detach the old root, because /run/initramfs/shutdown needs to access it. */ return switch_root( /* new_root= */ "/run/initramfs", /* old_root_after= */ "/oldroot", - MS_BIND, /* destroy_old_root= */ false); }