Skip to content

Commit

Permalink
stage1: getPID: move the logic back to stage1
Browse files Browse the repository at this point in the history
Move the pid logic back to stage1/rootfs/enter/enter.c (revert part of
rkt#826), so the pid logic is an implementation detail of stage1,
as it does not appear in Documentation/devel/stage1-implementors-guide.md
Some stage1 might not have pid: stage1 based on virtual machines.

See rkt#910 and rkt#826

Tests: there are no functional tests to check that the race in rkt#826 is
correctly fixed. However, I tested manually in the following way:

 * Test rkt#1:
     1. Start a pod:
        sudo bin/rkt  --debug --insecure-skip-verify run --interactive docker://busybox
     2. Pretend that rkt is slow and the ppid file is not yet written:
        cd /var/lib/rkt/pods/run/$uuid && mv ppid ppid-disabled
     3. Enter the pod:
        sudo bin/rkt enter abbc9f3d /bin/sh
     4. Notice that the enter command is waiting (without taking much cpu)
     5. Pretend that rkt finished its job:
        cd /var/lib/rkt/pods/run/$uuid && mv ppid-disabled ppid
     6. Notice that the enter command succeeds

 * Test rkt#2:
     1. Repeat steps 1-4 from test rkt#1
     2. Stop the pod: ^]^]^]
     3. Notice that the enter command stops too
        FIXME: does not work yet

 * Test rkt#3:
     1. patch stage1/init/init.go:
        +       ioutil.WriteFile("/tmp/rkt-waiting", []byte(""), 0644)
        +       for {
        +               if _, err := os.Stat("/tmp/rkt-waiting"); os.IsNotExist(err) {
        +                       break
        +               }
        +               time.Sleep(100 * time.Millisecond)
        +       }
     2. Start a pod
     3. Enter the pod
     4. Notice that the enter command is waiting (without taking much cpu)
     5. Let nspawn start:
        rm /tmp/rkt-waiting
     6. Notice that the enter command succeeds
  • Loading branch information
alban committed May 21, 2015
1 parent 8871bd6 commit a39337b
Show file tree
Hide file tree
Showing 5 changed files with 120 additions and 71 deletions.
8 changes: 1 addition & 7 deletions rkt/enter.go
Original file line number Diff line number Diff line change
Expand Up @@ -76,12 +76,6 @@ func runEnter(args []string) (exit int) {
return 1
}

podPID, err := p.getPID()
if err != nil {
stderr("Unable to determine pid for pod %q: %v", pid, err)
return 1
}

imageID, err := getAppImageID(p)
if err != nil {
stderr("Unable to determine image id: %v", err)
Expand All @@ -108,7 +102,7 @@ func runEnter(args []string) (exit int) {

stage1RootFS := s.GetTreeStoreRootFS(stage1ID.String())

if err = stage0.Enter(p.path(), podPID, imageID, stage1RootFS, argv); err != nil {
if err = stage0.Enter(p.path(), imageID, stage1RootFS, argv); err != nil {
stderr("Enter failed: %v", err)
return 1
}
Expand Down
51 changes: 0 additions & 51 deletions rkt/pods.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ import (
"os"
"path/filepath"
"sort"
"strings"
"syscall"
"time"

Expand Down Expand Up @@ -732,56 +731,6 @@ func (p *pod) getState() string {
return state
}

// getPID returns the pid of the pod.
func (p *pod) getPID() (pid int, err error) {
// check the "pid" (process ID) file if it exists, and on "ppid"
// (parent process ID). Recent stage1 uses "ppid" but we still
// support older rkt versions or alternative stage1 implementations.

for {
var ppid int

pid, err = p.readIntFromFile("pid")
if err == nil {
return
}

ppid, err = p.readIntFromFile("ppid")
if err == nil {
_, err = os.Stat("/proc/1/task/1/children")
if os.IsNotExist(err) {
return -1, fmt.Errorf("Unable to read /proc/1/task/1/children. Does your kernel have CONFIG_CHECKPOINT_RESTORE?")
}

var b []byte
b, err = ioutil.ReadFile(fmt.Sprintf("/proc/%d/task/%d/children", ppid, ppid))
if err == nil {
children := strings.SplitN(string(b), " ", 2)
if len(children) == 2 && children[1] != "" {
return -1, fmt.Errorf("Too many children of pid %d", ppid)
}
_, err = fmt.Sscanf(children[0], "%d ", &pid)
if err == nil {
return pid, nil
}
}
}

// There's a window between a pod transitioning to run and the pid file being created by stage1.
// The window shouldn't be large so we just delay and retry here. If stage1 fails to reach the
// point of pid file creation, it will exit and p.isRunning() becomes false since we refreshState below.
time.Sleep(time.Millisecond * 100)

if err := p.refreshState(); err != nil {
return -1, err
}

if !os.IsNotExist(err) || !p.isRunning() {
return -1, err
}
}
}

// getStage1Hash returns the hash of the stage1 image used in this pod
func (p *pod) getStage1Hash() (*types.Hash, error) {
s1IDb, err := p.readFile(common.Stage1IDFilename)
Expand Down
6 changes: 2 additions & 4 deletions rkt/status.go
Original file line number Diff line number Diff line change
Expand Up @@ -86,10 +86,8 @@ func printStatus(p *pod) error {
}

if !p.isEmbryo && !p.isPreparing && !p.isPrepared && !p.isAbortedPrepare && !p.isGarbage && !p.isGone {
pid, err := p.getPID()
if err != nil {
return err
}
// TODO(alban): FIXME
pid := -1

stats, err := p.getExitStatuses()
if err != nil {
Expand Down
4 changes: 1 addition & 3 deletions stage0/enter.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ import (
"fmt"
"os"
"path/filepath"
"strconv"
"syscall"

"github.com/coreos/rkt/Godeps/_workspace/src/github.com/appc/spec/schema/types"
Expand All @@ -30,7 +29,7 @@ import (
// /enter can expect to have its CWD set to the app root.
// imageID and command are supplied to /enter on argv followed by any arguments.
// stage1Path is the path of the stage1 rootfs
func Enter(cdir string, podPID int, imageID *types.Hash, stage1Path string, cmdline []string) error {
func Enter(cdir string, imageID *types.Hash, stage1Path string, cmdline []string) error {
if err := os.Chdir(cdir); err != nil {
return fmt.Errorf("error changing to dir: %v", err)
}
Expand All @@ -43,7 +42,6 @@ func Enter(cdir string, podPID int, imageID *types.Hash, stage1Path string, cmdl
}

argv := []string{filepath.Join(stage1Path, ep)}
argv = append(argv, strconv.Itoa(podPID))
argv = append(argv, id)
argv = append(argv, cmdline...)
if err := syscall.Exec(argv[0], argv, os.Environ()); err != nil {
Expand Down
122 changes: 116 additions & 6 deletions stage1/rootfs/enter/enter.c
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,112 @@ static int openpidfd(int pid, char *which) {
return fd;
}

static int get_ppid() {
FILE *fp;
int ppid;
char proc_pid[PATH_MAX];

/* We start in the pod root, where "ppid" should be. */

pexit_if((fp = fopen("ppid", "r")) == NULL && errno != ENOENT,
"Unable to open ppid file");

/* The ppid file might not be written yet. The error is not fatal: we
* can try a bit later. */
if (fp == NULL)
return 0;

pexit_if(fscanf(fp, "%i", &ppid) != 1,
"Unable to read ppid");
fclose(fp);

/* Check if ppid terminated. It's ok if it terminates just after the
* check: it will be detected after. But in the common case, we get a
* better error message. */
(void) snprintf(proc_pid, PATH_MAX, "/proc/%i", ppid);
pexit_if(access(proc_pid, F_OK) != 0,
"The pod has terminated (ppid=%i)", ppid);

return ppid;
}

static int pod_is_running() {
/* TODO(alban): check if the lock on the directory is taken... */
return 1;
}

static int get_pid(void) {
FILE *fp;
int ppid = 0, pid = 0;
int ret;
char proc_children[PATH_MAX];
char proc_exe1[PATH_MAX];
char proc_exe2[PATH_MAX];
char link1[PATH_MAX];
char link2[PATH_MAX];

do {
pexit_if((ppid = get_ppid()) == -1,
"Unable to get ppid");

if (ppid > 0)
break;

if (ppid == 0 && pod_is_running()) {
usleep(100 * 1000);
continue;
}
} while (pod_is_running());

pexit_if(access("/proc/1/task/1/children", F_OK) != 0,
"Unable to read /proc/1/task/1/children. Does your kernel have CONFIG_CHECKPOINT_RESTORE?");

(void) snprintf(proc_children, PATH_MAX, "/proc/%i/task/%i/children", ppid, ppid);

do {
pexit_if((fp = fopen(proc_children, "r")) == NULL,
"Unable to open '%s'", proc_children);

errno = 0;
ret = fscanf(fp, "%i ", &pid);
pexit_if(errno != 0,
"Unable to find children of process %i", ppid);
fclose(fp);

if (ret == 1)
break;

if (ret == 0) {
usleep(100 * 1000);
continue;
}
} while (pod_is_running());

if (pid <= 0)
return pid;

/* Ok, we have the correct ppid and pid.
*
* But /sbin/init in the pod might not have been exec()ed yet and so it
* might not have done the chroot() yet. Wait until the pod is ready
* to be entered, otherwise we might chroot() to the wrong directory!
*/
(void) snprintf(proc_exe1, PATH_MAX, "/proc/%i/exe", ppid);
(void) snprintf(proc_exe2, PATH_MAX, "/proc/%i/exe", pid);
do {
pexit_if(readlink(proc_exe1, link1, PATH_MAX) == -1,
"Cannot read link '%s'", proc_exe1);
pexit_if(readlink(proc_exe2, link2, PATH_MAX) == -1,
"Cannot read link '%s'", proc_exe1);
if (strcmp(link1, link2) == 0) {
usleep(100 * 1000);
continue;
}
} while (0);

return pid;
}

int main(int argc, char *argv[])
{
int fd;
Expand All @@ -55,10 +161,14 @@ int main(int argc, char *argv[])
int status;
int root_fd;

exit_if(argc < 4,
"Usage: %s pid imageid cmd [args...]", argv[0])
/* The parameters list is part of stage1 ABI and is specified in
* Documentation/devel/stage1-implementors-guide.md */
exit_if(argc < 3,
"Usage: %s imageid cmd [args...]", argv[0])

pexit_if((pid = get_pid()) == -1,
"Unable to get the pod process leader");

pid = atoi(argv[1]);
root_fd = openpidfd(pid, "root");

#define ns(_typ, _nam) \
Expand Down Expand Up @@ -87,7 +197,7 @@ int main(int argc, char *argv[])
"Unable to fork");

/* some stuff make the argv->args copy less cryptic */
#define ENTER_ARGV_FWD_OFFSET 3
#define ENTER_ARGV_FWD_OFFSET 2
#define DIAGEXEC_ARGV_FWD_OFFSET 6
#define args_fwd_idx(_idx) \
((_idx - ENTER_ARGV_FWD_OFFSET) + DIAGEXEC_ARGV_FWD_OFFSET)
Expand All @@ -101,11 +211,11 @@ int main(int argc, char *argv[])
/* Child goes on to execute /diagexec */

exit_if(snprintf(root, sizeof(root),
"/opt/stage2/%s/rootfs", argv[2]) == sizeof(root),
"/opt/stage2/%s/rootfs", argv[1]) == sizeof(root),
"Root path overflow");

exit_if(snprintf(env, sizeof(env),
"/rkt/env/%s", argv[2]) == sizeof(env),
"/rkt/env/%s", argv[1]) == sizeof(env),
"Env path overflow");

args[0] = "/diagexec";
Expand Down

0 comments on commit a39337b

Please sign in to comment.