Skip to content

Commit

Permalink
init: make bst-init an external executable
Browse files Browse the repository at this point in the history
This commit splits out bst-init into its own executable. This has a
bunch of interesting properties for bst:

First, it allows us to rewrite the process cmdline without doing
anything too crazy (*cough* PR_SET_MM_MAP), which helps tools
distinguish bst from its init process.

Second, we don't need to explicitly mark the init as dumpable to let
inner processes with root privileges look at /proc/1/*.

Third, this lets us implement --init, which allows users to specify an
init process of their choosing, should the behaviour of *bst-init* not
suit the situation they're in.

This supersedes #12.
  • Loading branch information
Snaipe committed Aug 4, 2020
1 parent b5e91a3 commit 9b80e65
Show file tree
Hide file tree
Showing 7 changed files with 126 additions and 50 deletions.
20 changes: 13 additions & 7 deletions Makefile
@@ -1,15 +1,16 @@
PREFIX ?= /usr
BINDIR ?= $(PREFIX)/bin
DATADIR ?= $(PREFIX)/share
LIBEXECDIR ?= $(PREFIX)/libexec
MANDIR ?= $(DATADIR)/man

CFLAGS ?= -O2
CFLAGS += -std=c99 -Wall -Wextra -Wno-unused-parameter -fno-strict-aliasing
CPPFLAGS += -D_GNU_SOURCE -D_FILE_OFFSET_BITS=64
CPPFLAGS += -D_GNU_SOURCE -D_FILE_OFFSET_BITS=64 -DLIBEXECDIR=\"$(LIBEXECDIR)\"

SRCS := main.c enter.c outer.c mount.c cp.c setarch.c usage.c sig.c timens.c path.c kvlist.c init.c net.c capable.c
SRCS := main.c enter.c outer.c mount.c cp.c setarch.c usage.c sig.c timens.c path.c kvlist.c net.c capable.c
OBJS := $(subst .c,.o,$(SRCS))
BINS := bst bst-unpersist
BINS := bst bst-unpersist bst-init

ifeq ($(shell id -u),0)
SUDO =
Expand Down Expand Up @@ -44,6 +45,9 @@ bst: $(OBJS)
$(SETCAP) cap_setuid,cap_setgid,cap_dac_override,cap_sys_admin,cap_sys_ptrace,cap_sys_chroot+p $@ \
|| ($(CHOWN) root $@ && $(CHMOD) u+s $@)

bst-init: init.o
$(LINK.o) -o $@ $^

bst-unpersist: unpersist.o capable.o
$(LINK.o) -o $@ $^
$(SETCAP) cap_sys_admin+p $@ \
Expand All @@ -52,18 +56,20 @@ bst-unpersist: unpersist.o capable.o
%.gz: %.scd
scdoc <$< | gzip -c >$@

man: bst.1.gz bst-unpersist.1.gz
man: bst.1.gz bst-unpersist.1.gz bst-init.1.gz

install: BST_INSTALLPATH = $(DESTDIR)$(BINDIR)/bst
install: $(BINS) man
install -m 755 -D bst $(BST_INSTALLPATH)
install -m 755 -D bst-unpersist $(BST_INSTALLPATH)-unpersist
install -m 755 -D bst-init $(DESTDIR)$(LIBEXECDIR)/bst-init
install -m 644 -D bst.1.gz $(DESTDIR)$(MANDIR)/man1/bst.1.gz
install -m 644 -D bst-unpersist.1.gz $(DESTDIR)$(MANDIR)/man1/bst-unpersist.1.gz
install -m 644 -D bst-init.1.gz $(DESTDIR)$(MANDIR)/man1/bst-init.1.gz
$(SETCAP) cap_setuid,cap_setgid,cap_dac_override,cap_sys_admin,cap_sys_ptrace,cap_sys_chroot+p $(BST_INSTALLPATH) \
|| ($(CHOWN) root $(BST_INSTALLPATH) && $(CHMOD) u+s $(BST_INSTALLPATH))
install -m 755 -D bst-unpersist $(BST_INSTALLPATH)-unpersist
$(SETCAP) cap_sys_admin+p $(BST_INSTALLPATH)-unpersist \
|| ($(CHOWN) root $(BST_INSTALLPATH)-unpersist && $(CHMOD) u+s $(BST_INSTALLPATH)-unpersist)
install -m 644 -D bst.1.gz $(DESTDIR)$(MANDIR)/man1/bst.1.gz
install -m 644 -D bst-unpersist.1.gz $(DESTDIR)$(MANDIR)/man1/bst-unpersist.1.gz

check: export PATH := $(DESTDIR)$(BINDIR):${PATH}
check: $(BINS)
Expand Down
23 changes: 13 additions & 10 deletions bst.1.scd
Expand Up @@ -145,6 +145,18 @@ Users of bst may choose to opt-out of some of the isolation.

You cannot use this option with _--share=time_.

\--init <argv>
Set the init process to be used as parent of *<executable>*. *<argv>* is
a space-delimited argv array, and _argv[0]_ must be an absolute path to a
valid executable in the current filesystem root (in other words, the init
executable does not need to exist in the root specified by _--root_).

If an empty *<argv>* is passed to _--init_, no init process will be
spawned by bst, and *<executable>* will be executed directly.

If bst unshares the pid namespace and no _--init_ is specified, it uses
by default *bst-init*(1).

\--no-fake-devtmpfs
Do not replace devtmpfs mounts with a fake devtmpfs.

Expand All @@ -167,15 +179,6 @@ Users of bst may choose to opt-out of some of the isolation.
By default, *bst* automatically tries to mount a new procfs on top of _/proc_
if it detects it to be on another filesystem than _/_.

\--no-init
Do not use *bst*'s minimal init process.

By default, *bst* automatically spawns a bare-bones init process in a PID
namespace, which only reaps zombies, and immediately terminates when the
spawned process of _<executable>_ exits.

This does nothing when used with _--share=pid_.

\--no-loopback-setup
Do not bring up the _lo_ interface.

Expand All @@ -186,4 +189,4 @@ Users of bst may choose to opt-out of some of the isolation.

# SEE ALSO

*bst-unpersist*(1), *namespaces*(7), *mount*(1), *setarch*(1)
*bst-unpersist*(1), *bst-init*(1), *namespaces*(7), *mount*(1), *setarch*(1)
76 changes: 51 additions & 25 deletions enter.c
Expand Up @@ -95,6 +95,19 @@ static void opts_to_nsactions(const struct entry_settings *opts, int *nsactions)
}
}

/* Fallback used when the system headers do not provide ARG_MAX. */
#ifndef ARG_MAX
# define ARG_MAX 4096
#endif

/* Store `arg` in slot `argc` of `argv` and return the updated element count.

   The process is terminated through errx(1, ...) when `argc` is already
   ARG_MAX or larger; the check uses ARG_MAX as the slot limit, so the caller
   must provide an array with at least that many slots. */
static inline size_t append_argv(char **argv, size_t argc, char *arg)
{
	const size_t capacity = ARG_MAX;

	if (!(argc < capacity)) {
		errx(1, "argv too large, a maximum of %zu arguments is supported", capacity);
	}

	*(argv + argc) = arg;
	return ++argc;
}

int enter(struct entry_settings *opts)
{
int timens_offsets = -1;
Expand Down Expand Up @@ -394,6 +407,17 @@ int enter(struct entry_settings *opts)
mount_mutables(root, opts->mutables, opts->nmutables);
}

int initfd = -1;
if (opts->init_argv != NULL && opts->init_argv[0] != NULL) {
if (nsactions[SHARE_PID] >= 0) {
errx(1, "cannot specify init process when entering an arbitrary pid namespace");
}
if (!pid_unshare) {
errx(1, "cannot specify init process when not in a pid namespace");
}
initfd = open(opts->init_argv[0], O_PATH);
}

/* Don't chroot if root is "/". This is a better default since it
allows us to run commands that unshare nothing unprivileged. */
if (strcmp(root, "/") != 0) {
Expand Down Expand Up @@ -444,29 +468,6 @@ int enter(struct entry_settings *opts)
umask(opts->umask);
}

if (pid_unshare && !opts->no_init) {
pid_t child = fork();

if (child == -1) {
err(1, "fork");
} else if (child) {

/* bst is effectively a setuid binary. This means that by default,
it has its dumpability set to the value of
/proc/sys/fs/suid_dumpable, which likely changes the ownership
of its own /proc/pid/ directory. This means that we can't use
nsenter and friends to probe this init's /proc/pid/ns.
Setting the dumpable flag fixes this. */
if (prctl(PR_SET_DUMPABLE, 1) == -1) {
err(1, "prctl(PR_SET_DUMPABLE)");
}

init(child);
__builtin_unreachable();
}
}

/* Beyond this point, all capabilities are dropped by the uid/gid change.
Only operations that make sense to be privileged in the context of
the specified credentials (and not the userns root) should be placed
Expand All @@ -487,6 +488,31 @@ int enter(struct entry_settings *opts)
warnx("falling back work directory to /.");
}

execvpe(opts->pathname, opts->argv, opts->envp);
err(1, "execvpe");
if (initfd != -1) {
/* This size estimation is an overkill upper bound, but oh well... */
char *argv[ARG_MAX];
size_t argc = 0;

char *argv0 = opts->init_argv[0] + strlen(opts->init_argv[0]);
for (; argv0 != opts->init_argv[0] && *argv0 != '/'; --argv0) {
continue;
}
++argv0;

argc = append_argv(argv, argc, argv0);
for (char *const *arg = opts->init_argv + 1; *arg; ++arg) {
argc = append_argv(argv, argc, *arg);
}
argc = append_argv(argv, argc, (char *) opts->pathname);
for (char *const *arg = opts->argv + 1; *arg; ++arg) {
argc = append_argv(argv, argc, *arg);
}
append_argv(argv, argc, NULL);

syscall(SYS_execveat, initfd, "", argv, opts->envp, AT_EMPTY_PATH);
err(1, "execveat");
} else {
execvpe(opts->pathname, opts->argv, opts->envp);
err(1, "execvpe");
}
}
1 change: 1 addition & 0 deletions enter.h
Expand Up @@ -46,6 +46,7 @@ struct entry_settings {
const char *pathname;
char *const *argv;
char *const *envp;
char *const *init_argv;
char *root;
char *workdir;

Expand Down
23 changes: 20 additions & 3 deletions init.c
Expand Up @@ -7,15 +7,32 @@
#include <err.h>
#include <errno.h>
#include <signal.h>
#include <stdio.h>
#include <sys/prctl.h>
#include <sys/wait.h>
#include <unistd.h>

#include "init.h"

noreturn void init(pid_t main_child_pid)
int main(int argc, char *argv[], char *envp[])
{
for (int sig = 1; sig <= SIGRTMAX; ++sig) {
signal(sig, SIG_DFL);
if (argc == 1) {
printf("usage: %s <program> [args...]\n", argv[0]);
return 2;
}

if (prctl(PR_SET_NAME, "bst-init") == -1) {
err(1, "prctl(PR_SET_NAME)");
}

pid_t main_child_pid;
if ((main_child_pid = fork()) == -1) {
err(1, "fork");
}

if (!main_child_pid) {
execvpe(argv[1], argv + 1, envp);
err(1, "execvpe");
}

for (;;) {
Expand Down
31 changes: 27 additions & 4 deletions main.c
Expand Up @@ -43,10 +43,10 @@ enum {
OPTION_TIME,
OPTION_PERSIST,
OPTION_UMASK,
OPTION_INIT,
OPTION_NO_FAKE_DEVTMPFS,
OPTION_NO_DERANDOMIZE,
OPTION_NO_PROC_REMOUNT,
OPTION_NO_INIT,
OPTION_NO_LOOPBACK_SETUP,
OPTION_SHARE_DEPRECATED,
};
Expand Down Expand Up @@ -131,12 +131,12 @@ int main(int argc, char *argv[], char *envp[])
{ "time", required_argument, NULL, OPTION_TIME },
{ "persist", required_argument, NULL, OPTION_PERSIST },
{ "umask", required_argument, NULL, OPTION_UMASK },
{ "init", required_argument, NULL, OPTION_INIT },

/* Opt-out feature flags */
{ "no-fake-devtmpfs", no_argument, NULL, OPTION_NO_FAKE_DEVTMPFS },
{ "no-derandomize", no_argument, NULL, OPTION_NO_DERANDOMIZE },
{ "no-proc-remount", no_argument, NULL, OPTION_NO_PROC_REMOUNT },
{ "no-init", no_argument, NULL, OPTION_NO_INIT },
{ "no-loopback-setup", no_argument, NULL, OPTION_NO_LOOPBACK_SETUP },

/* Deprecated flags */
Expand All @@ -150,6 +150,9 @@ int main(int argc, char *argv[], char *envp[])
[CLOCK_BOOTTIME] = "boottime",
};

char *init[512];
size_t init_argc = 0;

char *argv0 = NULL;

int error = 0;
Expand Down Expand Up @@ -340,8 +343,15 @@ int main(int argc, char *argv[], char *envp[])
opts.no_proc_remount = 1;
break;

case OPTION_NO_INIT:
opts.no_init = 1;
case OPTION_INIT:
for (char *arg = strtok(optarg, " "); arg; arg = strtok(NULL, " ")) {
size_t max_args = sizeof (init) / sizeof (*init);
if (init_argc >= max_args-1) {
errx(1, "max number of arguments in --init is %zu", max_args);
}
init[init_argc++] = arg;
}
init[init_argc++] = NULL;
break;

case OPTION_NO_LOOPBACK_SETUP:
Expand Down Expand Up @@ -370,6 +380,19 @@ int main(int argc, char *argv[], char *envp[])
}
}

static char *default_init[] = {
LIBEXECDIR "/bst-init",
NULL,
};

/* Use our own default init if we unshare the pid namespace, and no
--init has been specified. */
if (opts.shares[SHARE_PID] == NULL && init_argc == 0) {
opts.init_argv = default_init;
} else {
opts.init_argv = init;
}

char *default_argv[] = {
"bst"
"sh",
Expand Down
2 changes: 1 addition & 1 deletion usage.txt
Expand Up @@ -30,9 +30,9 @@ Options:
--hostname <host>: Set the host name.
--domainname <domain>: Set the domain name.
--time <name>=<s>[.ns]: Set the time of a specific clock.
--init=<init-argv>: Use the specified outer path as init process.

--no-fake-devtmpfs: Don't replace devtmpfs mounts with fake ones.
--no-derandomize: Don't attempt to reduce randomness sources.
--no-proc-remount: Don't remount the existing /proc in pid namespaces.
--no-init: Don't use the builtin init process.
--no-loopback-setup: Don't bring the lo interface up in network namespaces.

0 comments on commit 9b80e65

Please sign in to comment.