From 728b282d0b46fd63c49a91d1c4c184ffb46b9df9 Mon Sep 17 00:00:00 2001 From: Yolean k8s-qa Date: Mon, 4 May 2026 10:14:10 +0000 Subject: [PATCH 1/3] fix(cluster/lookup): qemu sshPort from state, not hardcoded 2222 Lookup hardcoded SSHPort="2222" for the qemu backend, ignoring the sshPort recorded in the provisioner's state sidecar at <cacheDir>/<name>.json. Any qemu cluster provisioned with a non-default port (e.g. APP_SSH_PORT=2229 in the appliance build scripts) was unreachable via `y-cluster images load`, `y-cluster ctr`, etc. -- the SSH handshake hit the wrong port and bailed with "no supported methods remain". qemuRunning now reads sshPort from the sidecar alongside the pidfile + key path. Lookup uses it; "" still falls back to 2222 so caches written before sshPort landed in the schema keep working. We do not import the qemu package's full state struct -- that would create a cycle since qemu imports cluster -- so the sidecar reader inlines a one-field anonymous struct against the same JSON shape pkg/provision/qemu/state.go writes. Tests cover the round-trip (state -> qemuRunning -> port), the missing-state fallback path, and field-name drift via TestReadQemuStateSSHPort which pins the "sshPort" JSON tag. Verified live against an appliance-gcp-build cluster running on port 2229: `y-cluster images load - --context=local` streams an OCI archive into containerd successfully.
Co-Authored-By: Claude Opus 4.7 (1M context) --- pkg/cluster/lookup.go | 72 ++++++++++++++++++++++------ pkg/cluster/lookup_test.go | 96 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 153 insertions(+), 15 deletions(-) diff --git a/pkg/cluster/lookup.go b/pkg/cluster/lookup.go index bf42410..2abf686 100644 --- a/pkg/cluster/lookup.go +++ b/pkg/cluster/lookup.go @@ -25,6 +25,7 @@ package cluster import ( "context" + "encoding/json" "errors" "fmt" "os" @@ -123,19 +124,29 @@ func Lookup(ctx context.Context, kubeconfigPath, contextName string) (*LookupRes }, nil } - if alive, sshKey := qemuRunning(clusterName); alive { - // SSH port and user track the qemu provisioner's defaults - // (pkg/provision/config.QEMUConfig.SSHPort default 2222; - // cloud-init creates user `ystack`). A user who set a - // non-default SSHPort needs to fall back to ssh directly - // — detect-via-config-file is a follow-up. + if alive, sshKey, sshPort := qemuRunning(clusterName); alive { + // sshPort comes from the provisioner-written state JSON + // (/.json) so a cluster that was provisioned + // with a non-default sshPort is still reachable. Falling + // back to "2222" -- the qemu provisioner's hardcoded + // default in pkg/provision/config.QEMUConfig -- only + // matters for old caches written before sshPort landed + // in the sidecar; current provisions always include it. + // SSHUser is hardcoded because cloud-init's user-data + // template (pkg/provision/qemu/qemu.go renderCloudInitUserData) + // only ever creates `ystack`. SSHHost is hardcoded + // because qemu always binds host-side port forwards to + // 127.0.0.1. 
+ if sshPort == "" { + sshPort = "2222" + } return &LookupResult{ Backend: BackendQEMU, Context: contextName, ClusterName: clusterName, SSHKey: sshKey, SSHHost: "127.0.0.1", - SSHPort: "2222", + SSHPort: sshPort, SSHUser: "ystack", }, nil } @@ -221,30 +232,61 @@ func dockerContainerRunning(ctx context.Context, name string) (bool, error) { // $Y_CLUSTER_QEMU_CACHE_DIR when set, else ~/.cache/y-cluster-qemu -- // matching qemu.FromConfig's default. The env override exists so // e2e tests can run an isolated cluster under t.TempDir() and -// still have detect/ctr/crictl find it. Returns (true, sshKeyPath) -// on a hit, (false, "") otherwise. -func qemuRunning(name string) (bool, string) { +// still have detect/ctr/crictl find it. +// +// Returns (true, sshKeyPath, sshPort) on a hit; sshPort is read +// from the provisioner-written sidecar /.json so +// callers can reach a cluster that was provisioned with a +// non-default port. sshPort is "" if the sidecar is missing or +// has no sshPort field -- caller falls back to the qemu +// provisioner's default. +// +// Returns (false, "", "") on no-hit. 
+func qemuRunning(name string) (bool, string, string) { cacheDir := os.Getenv("Y_CLUSTER_QEMU_CACHE_DIR") if cacheDir == "" { home, err := os.UserHomeDir() if err != nil { - return false, "" + return false, "", "" } cacheDir = filepath.Join(home, ".cache", "y-cluster-qemu") } pidPath := filepath.Join(cacheDir, name+".pid") data, err := os.ReadFile(pidPath) if err != nil { - return false, "" + return false, "", "" } var pid int if _, err := fmt.Sscanf(strings.TrimSpace(string(data)), "%d", &pid); err != nil { - return false, "" + return false, "", "" } if !pidAlive(pid) { - return false, "" + return false, "", "" + } + sshKey := filepath.Join(cacheDir, name+"-ssh") + sshPort := readQemuStateSSHPort(filepath.Join(cacheDir, name+".json")) + return true, sshKey, sshPort +} + +// readQemuStateSSHPort reads the `sshPort` field out of the qemu +// provisioner's state sidecar at the given path. Returns "" on +// any failure (missing file, bad JSON, no field) -- the caller +// is expected to fall back to a hardcoded default in that case. +// We only care about one field, so we don't import the qemu +// package's full state struct (which would risk an import cycle +// since qemu imports cluster). +func readQemuStateSSHPort(path string) string { + data, err := os.ReadFile(path) + if err != nil { + return "" + } + var s struct { + SSHPort string `json:"sshPort"` + } + if err := json.Unmarshal(data, &s); err != nil { + return "" } - return true, filepath.Join(cacheDir, name+"-ssh") + return s.SSHPort } // pidAlive is the stdlib equivalent of `kill -0 `. 
diff --git a/pkg/cluster/lookup_test.go b/pkg/cluster/lookup_test.go index 6201134..5cfae70 100644 --- a/pkg/cluster/lookup_test.go +++ b/pkg/cluster/lookup_test.go @@ -3,6 +3,7 @@ package cluster import ( "context" "errors" + "fmt" "os" "os/exec" "path/filepath" @@ -90,3 +91,98 @@ func TestLookup_NoBackendMatchesIsErrNotFound(t *testing.T) { t.Fatalf("want ErrNotFound, got %v", err) } } + +// TestReadQemuStateSSHPort pins the JSON shape: the qemu state +// sidecar (pkg/provision/qemu/state.go) marshals SSHPort as +// "sshPort". A field rename without updating the lookup-side +// reader here would silently fall back to the default qemu +// provisioner port and break any cluster provisioned on a +// non-default port -- exactly the regression that motivated +// this code path. +func TestReadQemuStateSSHPort(t *testing.T) { + dir := t.TempDir() + good := filepath.Join(dir, "good.json") + if err := os.WriteFile(good, []byte(`{"version":1,"name":"x","sshPort":"2229"}`), 0o644); err != nil { + t.Fatal(err) + } + if got := readQemuStateSSHPort(good); got != "2229" { + t.Errorf("good: got %q, want %q", got, "2229") + } + + noField := filepath.Join(dir, "no-field.json") + if err := os.WriteFile(noField, []byte(`{"version":1,"name":"x"}`), 0o644); err != nil { + t.Fatal(err) + } + if got := readQemuStateSSHPort(noField); got != "" { + t.Errorf("no-field: got %q, want empty", got) + } + + if got := readQemuStateSSHPort(filepath.Join(dir, "missing.json")); got != "" { + t.Errorf("missing: got %q, want empty", got) + } + + bad := filepath.Join(dir, "bad.json") + if err := os.WriteFile(bad, []byte("not json"), 0o644); err != nil { + t.Fatal(err) + } + if got := readQemuStateSSHPort(bad); got != "" { + t.Errorf("bad-json: got %q, want empty", got) + } +} + +// TestQemuRunning_PortFromState round-trips the discovery: write +// a fake pidfile + state, verify qemuRunning returns the port +// the state encodes (not the hardcoded fallback). 
Uses a live +// PID (the test process itself) since pidAlive requires one. +func TestQemuRunning_PortFromState(t *testing.T) { + dir := t.TempDir() + t.Setenv("Y_CLUSTER_QEMU_CACHE_DIR", dir) + + name := "y-cluster-test-portfromstate" + pid := os.Getpid() + if err := os.WriteFile(filepath.Join(dir, name+".pid"), + []byte(fmt.Sprintf("%d\n", pid)), 0o644); err != nil { + t.Fatal(err) + } + if err := os.WriteFile(filepath.Join(dir, name+".json"), + []byte(`{"version":1,"name":"`+name+`","sshPort":"33445"}`), 0o644); err != nil { + t.Fatal(err) + } + + alive, sshKey, sshPort := qemuRunning(name) + if !alive { + t.Fatalf("expected alive=true (pid %d is this test process)", pid) + } + if sshPort != "33445" { + t.Errorf("sshPort: got %q, want %q", sshPort, "33445") + } + wantKey := filepath.Join(dir, name+"-ssh") + if sshKey != wantKey { + t.Errorf("sshKey: got %q, want %q", sshKey, wantKey) + } +} + +// TestQemuRunning_PortFallbackWhenStateMissing pins the +// graceful-degrade behaviour: when the pidfile is alive but the +// state JSON isn't there (e.g., a really old cache), qemuRunning +// reports running but returns "" for sshPort. Lookup then +// substitutes the qemu provisioner's default. 
+func TestQemuRunning_PortFallbackWhenStateMissing(t *testing.T) { + dir := t.TempDir() + t.Setenv("Y_CLUSTER_QEMU_CACHE_DIR", dir) + + name := "y-cluster-test-portfallback" + if err := os.WriteFile(filepath.Join(dir, name+".pid"), + []byte(fmt.Sprintf("%d\n", os.Getpid())), 0o644); err != nil { + t.Fatal(err) + } + // no .json on purpose + + alive, _, sshPort := qemuRunning(name) + if !alive { + t.Fatal("expected alive=true") + } + if sshPort != "" { + t.Errorf("sshPort: got %q, want empty (so caller falls back to default)", sshPort) + } +} From f84fbed2073e913dced944c4ac7d8a7124bbf5fa Mon Sep 17 00:00:00 2001 From: Yolean k8s-qa Date: Mon, 4 May 2026 11:38:20 +0000 Subject: [PATCH 2/3] feat(images/load): create name@sha256:digest alias post-import Closes specs/y-cluster/FEATURE_REQUEST_IMAGES_LOAD_DIGEST_ALIAS.md. After `ctr image import` writes the `:tag` ref, `Load` now creates a digest-keyed alias via `ctr image tag @sha256:`. Without this, deployments pinned in `name:tag@sha256:digest` form (the y-release-image-list-local shape, used by every checkit contain-built workload) fall through to a registry pull -- which fails on cluster-local because the build registry isn't reachable from kubelet outside the cluster. The alias is created via a snapshot diff: list refs before import, list again after, alias the new tag-form refs. We don't parse `ctr image import`'s stdout for the (ref, digest) tuple because that output abbreviates / reformats refs in ways that don't round-trip back to the stored ref name. The post-import `ctr image list` row IS authoritative. The parser accepts any whitespace as a column separator and locates the digest by sha256: prefix rather than column index, so the parse survives ctr column reorderings. Snapshot failures (RunCtr returning an error) surface a warning and skip the alias step rather than failing the whole import. 
stripTag handles the hostport edge case (`host:port/path:tag`) so refs from a registry running on a non-default port get the colon-separated tag stripped, not the port. Unit-tested. Verified live: a fresh import on the appliance-gcp-build cluster produces both refs in `ctr image list`. Re-importing the same archive logs "no new digest aliases needed" (idempotent). Co-Authored-By: Claude Opus 4.7 (1M context) --- pkg/images/load.go | 168 +++++++++++++++++++++++++++++++++++++++- pkg/images/load_test.go | 91 ++++++++++++++++++++++ 2 files changed, 256 insertions(+), 3 deletions(-) create mode 100644 pkg/images/load_test.go diff --git a/pkg/images/load.go b/pkg/images/load.go index d56b81f..efe23ec 100644 --- a/pkg/images/load.go +++ b/pkg/images/load.go @@ -5,6 +5,7 @@ import ( "context" "fmt" "io" + "strings" "go.uber.org/zap" @@ -19,6 +20,23 @@ import ( // touched: callers driving from local build artefacts (e.g. a // `contain` tarball under /tmp) can purge those independently. // +// After import, Load creates a digest-keyed alias for every +// imported `:tag` ref via `ctr image tag +// @sha256:`. Without this alias, deployments that +// pin images in `name:tag@sha256:digest` form (the shape +// y-release-image-list-local emits for reproducibility) fall +// through to a registry pull because kubelet looks the digest +// ref up in containerd's image store, which only knew about +// the tag ref. See +// specs/y-cluster/FEATURE_REQUEST_IMAGES_LOAD_DIGEST_ALIAS.md. +// +// Aliasing uses a snapshot diff (`ctr image list` before vs +// after the import) rather than parsing import stdout: ctr's +// import-progress output abbreviates / reformats refs in ways +// that don't round-trip back to the stored ref name. The +// post-import `ctr image list` row IS the authoritative +// (ref, digest) tuple containerd actually stored. 
+// // Routing per backend matches the rest of pkg/cluster: // - docker: dockerexec.Exec into the k3s container // - qemu: sshexec.ExecStream over SSH @@ -34,6 +52,13 @@ func Load(ctx context.Context, lr *cluster.LookupResult, archive io.Reader, logg if archive == nil { return fmt.Errorf("nil archive reader") } + + // Snapshot the existing image set so we can identify the + // refs that landed in this import. listRefs returns nil + // on any error -- if the snapshot fails we still run the + // import; we just skip the alias step. + before := listRefSet(ctx, lr) + args := []string{"-n", "k8s.io", "image", "import", "-"} logger.Info("loading image archive", zap.String("backend", string(lr.Backend)), @@ -41,9 +66,6 @@ func Load(ctx context.Context, lr *cluster.LookupResult, archive io.Reader, logg ) var stdout, stderr bytes.Buffer if err := cluster.RunCtr(ctx, lr, args, archive, &stdout, &stderr); err != nil { - // Surface whatever the remote `ctr import` printed — - // "ctr: image-related error: ..." is the most common - // case and worth showing verbatim. return fmt.Errorf("ctr image import: %s%s: %w", stdout.String(), stderr.String(), err) } @@ -52,5 +74,145 @@ func Load(ctx context.Context, lr *cluster.LookupResult, archive io.Reader, logg zap.String("output", stdout.String()), ) } + + if before == nil { + // Pre-import snapshot failed; we can't reliably tell + // what's new. Surface and bail. 
+ logger.Warn("pre-import snapshot failed; skipping digest-alias step") + return nil + } + after := listRefsWithDigests(ctx, lr) + if after == nil { + logger.Warn("post-import snapshot failed; skipping digest-alias step") + return nil + } + aliased := 0 + for _, p := range after { + if before[p.ref] { + continue // existed before this import + } + if strings.Contains(p.ref, "@") { + continue // already digest-form; nothing to alias + } + nameOnly := stripTag(p.ref) + alias := nameOnly + "@" + p.digest + if before[alias] { + continue // alias already exists somehow; skip + } + var tagOut, tagErr bytes.Buffer + tagArgs := []string{"-n", "k8s.io", "image", "tag", p.ref, alias} + if err := cluster.RunCtr(ctx, lr, tagArgs, nil, &tagOut, &tagErr); err != nil { + // Don't fail the whole Load -- the import already + // landed. Log so the operator sees what didn't + // alias and can do it manually if needed. + logger.Warn("ctr image tag (digest alias) failed", + zap.String("ref", p.ref), + zap.String("alias", alias), + zap.String("stderr", tagErr.String()), + zap.Error(err)) + continue + } + aliased++ + logger.Info("digest alias created", + zap.String("ref", p.ref), + zap.String("alias", alias)) + } + if aliased == 0 { + logger.Info("no new digest aliases needed (re-import or already aliased)") + } return nil } + +// importedRef is one (ref, digest) row from `ctr image list`. +// "imported" is a slight misnomer here -- listRefsWithDigests +// returns the post-import snapshot of EVERY image ref, and the +// caller filters by the before-set diff to find the +// just-imported subset. +type importedRef struct { + ref string + digest string +} + +// listRefSet returns the set of all ref names containerd +// has indexed in the k8s.io namespace. Used as the "before" +// snapshot in Load. 
Returns nil on any error so the caller +// can distinguish "list ran but produced zero rows" (impossible +// in practice -- k3s always has system pause images) from +// "list failed and we have no snapshot". +func listRefSet(ctx context.Context, lr *cluster.LookupResult) map[string]bool { + pairs := listRefsWithDigests(ctx, lr) + if pairs == nil { + return nil + } + set := make(map[string]bool, len(pairs)) + for _, p := range pairs { + set[p.ref] = true + } + return set +} + +// listRefsWithDigests runs `ctr -n k8s.io image list` on the +// cluster node and parses the (ref, digest) tuple per row. +// ctr's tabwriter output is: +// +// REF TYPE DIGEST SIZE PLATFORMS LABELS +// +// We accept any whitespace as a column separator and locate +// the digest by `sha256:`-prefix rather than column index, so +// the parse is resilient to ctr column-reorderings. +// +// Returns nil on any RunCtr failure -- the caller treats that +// as "skip the alias step" rather than failing the whole Load. +func listRefsWithDigests(ctx context.Context, lr *cluster.LookupResult) []importedRef { + var stdout, stderr bytes.Buffer + if err := cluster.RunCtr(ctx, lr, []string{"-n", "k8s.io", "image", "list"}, nil, &stdout, &stderr); err != nil { + return nil + } + return parseImageList(stdout.String()) +} + +// parseImageList is the pure parser exposed for unit tests. +// Skips the header row, blank lines, and any line we can't +// pair to a sha256: digest token. 
+func parseImageList(output string) []importedRef { + var pairs []importedRef + for _, line := range strings.Split(output, "\n") { + fields := strings.Fields(line) + if len(fields) == 0 || fields[0] == "REF" { + continue + } + ref := fields[0] + var digest string + for _, f := range fields[1:] { + if strings.HasPrefix(f, "sha256:") && len(f) == 7+64 { + digest = f + break + } + } + if digest == "" { + continue + } + pairs = append(pairs, importedRef{ref: ref, digest: digest}) + } + return pairs +} + +// stripTag returns the ref with its `:tag` suffix removed. +// Refs can include a hostport like `host:port/path:tag`, so +// the tag is the colon-separated segment after the LAST slash. +// Refs with no tag pass through unchanged. +func stripTag(ref string) string { + slash := strings.LastIndex(ref, "/") + tail := ref + if slash >= 0 { + tail = ref[slash:] + } + colon := strings.LastIndex(tail, ":") + if colon < 0 { + return ref + } + if slash >= 0 { + return ref[:slash+colon] + } + return ref[:colon] +} diff --git a/pkg/images/load_test.go b/pkg/images/load_test.go new file mode 100644 index 0000000..109faed --- /dev/null +++ b/pkg/images/load_test.go @@ -0,0 +1,91 @@ +package images + +import ( + "testing" +) + +// TestParseImageList pins the (ref, digest) extraction +// against a real `ctr -n k8s.io image list` capture from a +// k3s v1.35 cluster. We accept any whitespace separator and +// locate the digest by sha256: prefix rather than column +// index so the parse survives ctr column reorderings. 
+func TestParseImageList(t *testing.T) { + cases := []struct { + name string + in string + want []importedRef + }{ + { + name: "header + single row", + in: "REF TYPE DIGEST SIZE PLATFORMS LABELS\n" + + "builds-registry.ystack.svc.cluster.local/yolean/keycloak-v3:local-dev application/vnd.oci.image.index.v1+json sha256:fafacfe13375f62fc0a8303c6c6b6186e755d44f479a161476f4129009eb730b 263.7 MiB linux/amd64,linux/arm64 io.cri-containerd.image=managed\n", + want: []importedRef{{ + ref: "builds-registry.ystack.svc.cluster.local/yolean/keycloak-v3:local-dev", + digest: "sha256:fafacfe13375f62fc0a8303c6c6b6186e755d44f479a161476f4129009eb730b", + }}, + }, + { + name: "two rows including a digest-form ref", + in: "REF\tTYPE\tDIGEST\tSIZE\tPLATFORMS\tLABELS\n" + + "docker.io/yolean/echo:1.0\tapplication/vnd.docker.distribution.manifest.v2+json\tsha256:1111111111111111111111111111111111111111111111111111111111111111\t10 MiB\tlinux/amd64\t-\n" + + "docker.io/yolean/echo@sha256:1111111111111111111111111111111111111111111111111111111111111111\tapplication/vnd.docker.distribution.manifest.v2+json\tsha256:1111111111111111111111111111111111111111111111111111111111111111\t10 MiB\tlinux/amd64\t-\n", + want: []importedRef{ + {ref: "docker.io/yolean/echo:1.0", digest: "sha256:1111111111111111111111111111111111111111111111111111111111111111"}, + {ref: "docker.io/yolean/echo@sha256:1111111111111111111111111111111111111111111111111111111111111111", digest: "sha256:1111111111111111111111111111111111111111111111111111111111111111"}, + }, + }, + { + name: "empty", + in: "", + want: nil, + }, + { + name: "only header", + in: "REF\tTYPE\tDIGEST\n", + want: nil, + }, + { + name: "row missing digest token", + in: "REF\tTYPE\tDIGEST\nfoo\tbar\tbaz\n", + want: nil, + }, + } + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + got := parseImageList(c.in) + if len(got) != len(c.want) { + t.Fatalf("got %d rows, want %d: %#v", len(got), len(c.want), got) + } + for i := range c.want { + if 
got[i] != c.want[i] { + t.Errorf("row %d: got %+v, want %+v", i, got[i], c.want[i]) + } + } + }) + } +} + +// TestStripTag covers the hostport edge case +// (`host:port/path:tag`) that a naive `strings.LastIndex(ref, +// ":")` mis-handles by stripping the port instead of the tag. +func TestStripTag(t *testing.T) { + cases := []struct{ in, want string }{ + // Plain ref with tag. + {"foo:bar", "foo"}, + // Hostport-prefixed ref WITH tag. + {"registry.example:5000/path:tag", "registry.example:5000/path"}, + {"localhost:5000/yolean/echo:v1", "localhost:5000/yolean/echo"}, + // Hostport-prefixed ref WITHOUT tag. + {"registry.example:5000/path", "registry.example:5000/path"}, + // Standard cluster-local registry path. + {"builds-registry.ystack.svc.cluster.local/yolean/keycloak-v3:local-dev", "builds-registry.ystack.svc.cluster.local/yolean/keycloak-v3"}, + // No tag at all. + {"plain", "plain"}, + } + for _, c := range cases { + got := stripTag(c.in) + if got != c.want { + t.Errorf("stripTag(%q) = %q, want %q", c.in, got, c.want) + } + } +} From a0df4ef2871d408f7f5e71fb72cd807834158199 Mon Sep 17 00:00:00 2001 From: Yolean k8s-qa Date: Mon, 4 May 2026 12:10:57 +0000 Subject: [PATCH 3/3] feat(serve): surface config sha + version on noop and in daemon log When `y-cluster serve ensure -c ` returns `noop`, the operator can't tell whether the running daemon is on the expected config or on stale state from a previous invocation without grepping the on-disk snapshot. Same for `serve logs` which doesn't say which y-cluster build / which `-c` paths the daemon was launched with. Three small UX fixes: - EnsureResult carries Digest (full sha256 of the running daemon's normalized config). CLI prints a 12-char short SHA after the action: y-cluster serve noop on :8944 (sha abcd1234ef56) Same config -> same SHA across runs; matching SHA confirms the daemon really is on what the operator just typed. 
- Daemon startup log line now includes version + pid + configDirs + digest, so `serve logs` answers "what version is this?" and "which -c paths is it on?" without any extra subcommand. - serve.DaemonVersion package variable lets cmd/y-cluster surface its full versionString() (release tag + VCS rev) to pkg/serve. The daemon process is a re-exec of the same binary so main() runs again on the daemon side and the assignment carries through. This is the diagnostic gap that surfaced when an operator suspected a real bug after seeing `noop` from a `serve ensure` that they expected to switch y-kustomize bases. The actual config was unchanged so noop was correct; the absent SHA in the output made it impossible to verify that without inspecting state files. Co-Authored-By: Claude Opus 4.7 (1M context) --- cmd/y-cluster/main.go | 8 ++++++++ cmd/y-cluster/serve.go | 23 ++++++++++++++++++++--- pkg/serve/serve.go | 41 +++++++++++++++++++++++++++++++++++++---- pkg/serve/serve_test.go | 13 +++++++++++++ 4 files changed, 78 insertions(+), 7 deletions(-) diff --git a/cmd/y-cluster/main.go b/cmd/y-cluster/main.go index 82cd1e1..2e24aad 100644 --- a/cmd/y-cluster/main.go +++ b/cmd/y-cluster/main.go @@ -19,6 +19,7 @@ import ( "github.com/Yolean/y-cluster/pkg/provision/docker" "github.com/Yolean/y-cluster/pkg/provision/multipass" "github.com/Yolean/y-cluster/pkg/provision/qemu" + "github.com/Yolean/y-cluster/pkg/serve" "github.com/Yolean/y-cluster/pkg/yconverge" ) @@ -86,6 +87,13 @@ func versionString() string { } func main() { + // Surface the binary's version to pkg/serve so the daemon + // startup log reports it. The daemon process is a re-exec + // of THIS binary, so main() runs again on the daemon side + // and this assignment is what makes `serve logs` answer + // "which y-cluster build is running". 
+ serve.DaemonVersion = versionString() + ctx, cancel := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM) defer cancel() diff --git a/cmd/y-cluster/serve.go b/cmd/y-cluster/serve.go index 66edbd9..68c541c 100644 --- a/cmd/y-cluster/serve.go +++ b/cmd/y-cluster/serve.go @@ -101,10 +101,13 @@ func serveEnsureCmd() *cobra.Command { // Success status to stdout: makes the line scriptable // (`if y-cluster serve ensure ... | grep -q started`) // and keeps stderr free for warnings the daemon may - // emit later. + // emit later. The short SHA tells the operator WHICH + // config the running daemon is on, which matters most + // when the answer is `noop` and they suspected the + // daemon was on stale state. fmt.Fprintf(cmd.OutOrStdout(), - "y-cluster serve %s on %s\n", - res.Action, formatPorts(res.Ports)) + "y-cluster serve %s on %s (sha %s)\n", + res.Action, formatPorts(res.Ports), shortSHA(res.Digest)) return nil }, } @@ -119,6 +122,20 @@ func serveEnsureCmd() *cobra.Command { // formatPorts renders ports as ":N" for one port or ":N, :M, ..." // for several. Single-port is the common case so we optimise the // reading. +// shortSHA truncates a hex sha256 digest to 12 chars (git +// short-rev convention) for the ensure output. Empty input +// returns "unknown" so the line stays parseable when a +// future Ensure path forgets to populate Digest. +func shortSHA(s string) string { + if s == "" { + return "unknown" + } + if len(s) > 12 { + return s[:12] + } + return s +} + func formatPorts(ports []int) string { if len(ports) == 1 { return fmt.Sprintf(":%d", ports[0]) diff --git a/pkg/serve/serve.go b/pkg/serve/serve.go index dd3fe06..aa4ed2e 100644 --- a/pkg/serve/serve.go +++ b/pkg/serve/serve.go @@ -67,6 +67,14 @@ func Run(ctx context.Context, opts Options) error { return startBackground(ctx, cfgs, paths, opts) } +// DaemonVersion is logged at daemon startup so `serve logs` can +// report which y-cluster build is running. 
Set by main.go from +// versionString() before any serve subcommand executes; pkg/serve +// itself can't compute the user-facing release tag because that +// lives in cmd/y-cluster's `var version` (overridable via ldflags +// at release time). +var DaemonVersion = "unknown" + // EnsureAction describes what Ensure had to do. type EnsureAction int @@ -102,7 +110,8 @@ func (a EnsureAction) String() string { // actually changed. type EnsureResult struct { Action EnsureAction - Ports []int // every port the daemon now listens on + Ports []int // every port the daemon now listens on + Digest string // sha256 hex of the running daemon's normalized config set } // Ensure launches or restarts the daemon so that the configured @@ -128,7 +137,13 @@ func Ensure(ctx context.Context, opts Options) (EnsureResult, error) { want := Digest(cfgs) have, healthy := inspectRunning(paths, cfgs) if healthy && have == want { - return EnsureResult{Action: EnsureNoop, Ports: ports}, nil + // Surface the digest so an operator who suspects the + // running daemon is on stale config can compare against + // what they expect (the digest of the current `-c` + // directories' YAML state). `noop` was the right answer; + // the digest tells them WHY -- the new request was + // byte-identical to the running config. + return EnsureResult{Action: EnsureNoop, Ports: ports, Digest: want}, nil } // Anything we have to clean up before launching counts as a // restart from the operator's view. That includes a stale @@ -145,7 +160,7 @@ func Ensure(ctx context.Context, opts Options) (EnsureResult, error) { if err := startBackground(ctx, cfgs, paths, opts); err != nil { return EnsureResult{}, err } - return EnsureResult{Action: action, Ports: ports}, nil + return EnsureResult{Action: action, Ports: ports, Digest: want}, nil } // pidfilePresent is the "is there state on disk to clean up?" 
@@ -358,6 +373,24 @@ func runAsDaemon(parent context.Context, cfgs []*Config, paths StatePaths) (retE logger := newJSONLogger() defer func() { _ = logger.Sync() }() + digest := Digest(cfgs) + configDirs := make([]string, len(cfgs)) + for i, c := range cfgs { + configDirs[i] = c.Dir + } + // Log the binary version + config dirs + digest at startup + // so `y-cluster serve logs` answers "what version is running" + // and "which -c paths is it serving" without the operator + // having to grep state files. These are the questions we + // hit when an unexpected `noop` made us doubt the daemon + // was on the config we thought it was on. + logger.Info("y-cluster serve daemon start", + zap.String("version", DaemonVersion), + zap.Int("pid", os.Getpid()), + zap.Strings("configDirs", configDirs), + zap.String("digest", digest), + ) + if err := WritePidfile(paths.Pid, os.Getpid()); err != nil { logger.Error("write pidfile", zap.Error(err)) return err @@ -366,7 +399,7 @@ func runAsDaemon(parent context.Context, cfgs []*Config, paths StatePaths) (retE _ = os.Remove(paths.Pid) }() - snap := map[string]string{"digest": Digest(cfgs)} + snap := map[string]string{"digest": digest} data, _ := json.Marshal(snap) if err := os.WriteFile(paths.Config, data, 0o600); err != nil { logger.Error("write config snapshot", zap.Error(err)) diff --git a/pkg/serve/serve_test.go b/pkg/serve/serve_test.go index 2dd64b1..e4aac86 100644 --- a/pkg/serve/serve_test.go +++ b/pkg/serve/serve_test.go @@ -233,6 +233,13 @@ func TestEnsure_FirstStartAndNoop(t *testing.T) { if len(res.Ports) != 1 || res.Ports[0] != port { t.Fatalf("ports=%v want [%d]", res.Ports, port) } + // Digest is the sha256 hex of the normalized config -- 64 + // chars. The CLI truncates to 12 for display, but the + // EnsureResult carries the full string so callers that want + // to compare programmatically can. 
+ if len(res.Digest) != 64 { + t.Errorf("digest length: got %d, want 64 (full sha256 hex)", len(res.Digest)) + } res2, err := Ensure(context.Background(), Options{ ConfigDirs: []string{cfgDir}, @@ -245,6 +252,12 @@ func TestEnsure_FirstStartAndNoop(t *testing.T) { if res2.Action != EnsureNoop { t.Fatalf("second ensure: action=%s want noop", res2.Action) } + // Same config, same digest -- the noop branch must + // surface the same value the started branch did, so an + // operator who compares the two outputs sees a match. + if res2.Digest != res.Digest { + t.Errorf("digest noop=%q started=%q; should be identical for same config", res2.Digest, res.Digest) + } } // TestEnsure_RestartWhenStaleStatePresent covers the "daemon already