From 5c5fb7e9badb5310eb125661635e5d6bc2892cd8 Mon Sep 17 00:00:00 2001 From: Yolean k8s-qa Date: Thu, 7 May 2026 09:42:32 +0000 Subject: [PATCH 1/7] feat(provision/qemu): mount-required seed gate + APPLIANCE_MAINTENANCE doc Two squashed commits: - adds APPLIANCE_MAINTENANCE.md (the appliance maintenance design doc) - implements the mount-required seed gate so the data-seed unit fails fast when /data/yolean is not yet mounted, the cloud-init bypass on customer first boot, and aligned k3s.service drop-in. Fields: - pkg/provision/qemu/data_seed.{service,go} + data_seed_check.sh + k3s_data_seed.conf -- the seed unit + check script + k3s drop-in - pkg/provision/qemu/prepare_inguest.sh -- cloud-init bypass - pkg/provision/qemu/lifecycle.go + prepare_export_test.go + data_seed_test.go -- coverage and diagnostic-start path - cmd/y-cluster/manifests.go -- staged-manifest auto-apply at customer first boot - e2e/qemu_test.go -- e2e cover for the mount-required gate Co-Authored-By: Claude Opus 4.7 (1M context) --- APPLIANCE_MAINTENANCE.md | 476 ++++++++++++++++++++++ cmd/y-cluster/manifests.go | 4 +- e2e/qemu_test.go | 214 +++++++++- pkg/provision/qemu/data_seed.go | 2 +- pkg/provision/qemu/data_seed.service | 10 +- pkg/provision/qemu/data_seed_check.sh | 102 ++++- pkg/provision/qemu/data_seed_test.go | 292 ++++++++----- pkg/provision/qemu/k3s_data_seed.conf | 2 +- pkg/provision/qemu/lifecycle.go | 52 ++- pkg/provision/qemu/prepare_export_test.go | 24 ++ pkg/provision/qemu/prepare_inguest.sh | 36 +- 11 files changed, 1069 insertions(+), 145 deletions(-) create mode 100644 APPLIANCE_MAINTENANCE.md diff --git a/APPLIANCE_MAINTENANCE.md b/APPLIANCE_MAINTENANCE.md new file mode 100644 index 0000000..b8abf6b --- /dev/null +++ b/APPLIANCE_MAINTENANCE.md @@ -0,0 +1,476 @@ +# Appliance maintenance + +How y-cluster preserves customer data across appliance changes, and the +two mechanisms it provides for the appliance builder: **first-boot data +seeding** (handled at OS-level by a systemd unit) and **k3s manifests +staging** (build-time-staged manifests applied at customer-cluster +boot). + +The doc is organised around the three phases an appliance goes through: +the customer's initial import, the supplier-side build of a new +appliance version, and the customer's later upgrade onto that new +version. + +## Lifecycle overview + +``` + ┌────────────────────────────────────────────────────┐ + │ │ + build (run by us per customer) customer (boots the disk) │ + │ │ + ┌────────────▼──────┐ ┌──────────────▼────────────┐│ + │ y-cluster │ │ Customer attaches their ││ + │ provision │ │ persistent data drive at ││ + │ │ │ /data/yolean (per the ││ + │ install workloads │ │ bundle README), boots. ││ + │ (kubectl/yconverge) │ ││ + │ │ │ ┌─────────────────────┐ ││ + │ /data/yolean now │ │ │ y-cluster-data-seed │ ││ + │ holds DB schemas, │ │ │ detects empty │ ││ + │ kafka topics, │ │ │ external mount, │ ││ + │ init markers... 
│ │ │ extracts seed │ ││ + │ │ │ └─────────┬───────────┘ ││ + │ y-cluster │ │ ▼ ││ + │ manifests add │ │ ┌─────────────────────┐ ││ + │ │ │ │ k3s starts │ ││ + │ (stages a Job for │ │ │ auto-applies every │ ││ + │ the customer's │ │ │ manifest staged at │ ││ + │ FIRST boot) │ │ │ build time │ ││ + │ │ │ │ (Jobs are idempotent│ ││ + │ y-cluster stop │ │ │ -- a Job named for │ ││ + │ prepare-export │ │ │ v0.5.0 only runs │ ││ + │ export │ │ │ the v0.4.0->v0.5.0 │ ││ + │ │ │ │ migration once) │ ││ + │ -> tarball │ │ └─────────────────────┘ ││ + └───────────────────┘ └───────────────────────────┘│ + │ + Upgrade (new appliance disk, same customer): │ + │ + ┌──────────────────────────────────┐ │ + │ Customer boots v0.5.0 disk with │ │ + │ existing /data/yolean drive ───► │ y-cluster-data-seed sees │ + │ │ marker, NO-OP │ + │ │ │ + │ │ k3s starts, auto-applies │ + │ │ staged manifests; new │ + │ │ Job names trigger their │ + │ │ one-time migration logic. │ + │ │ Already-applied Job names │ + │ │ are no-ops. │ + └──────────────────────────────────┘ │ + │ + ▼ +``` + +The customer's lived experience: attach the appliance disk, attach the +data disk, boot. Subsequent upgrades = swap the appliance disk, keep +the data disk, boot. No commands. No state to migrate by hand. + +## Phase 1: First import + +The customer's first boot of a fresh appliance, ahead of the supplier +having shipped any subsequent upgrades. + +### Supplier side + +The supplier builds the v1 appliance disk: + +1. `y-cluster provision -c ` to stand up a build-side cluster. +2. Install workloads (kubectl / yconverge / helm). The build cluster + populates `/data/yolean/` with the build-time output of init Jobs: + database schemas, Kafka topic configs, file-backed PVs, etc. +3. (Optional, normally none on a v1 build) `y-cluster manifests add` + to stage any one-shot Jobs the customer's first boot should run. +4. `y-cluster stop` for a clean quiesce. +5. `y-cluster prepare-export` — virt-customize-driven identity reset + (machine-id, ssh host keys, cloud-init clean), netplan generic-NIC + match, systemd-timesyncd enable, **build the data seed** (see + Mechanism 1 below), **move staged manifests** into k3s's + auto-apply directory. +6. `y-cluster export --format=...` packs the result for + the target hypervisor (qcow2 / raw / vmdk / ova / gcp-tar). + +### Customer side + +1. Format an ext4 volume with the universal LABEL the appliance + expects: + ```sh + sudo mkfs.ext4 -L y-cluster-data /dev/ + ``` + Attach it to the imported VM. `prepare-export` pre-baked + `LABEL=y-cluster-data /data/yolean ext4 defaults,nofail 0 2` + into `/etc/fstab`, so first boot mounts the volume automatically; + the customer does not edit fstab themselves. Cross-hypervisor: + VMware / VirtualBox / Hetzner / GCP all expose ext4 labels the + same way. +2. Boot the appliance disk. +3. `y-cluster-data-seed.service` runs Before=k3s.service, + After=cloud-init.service, sees the external mount is empty, + **extracts the seed** into the customer's drive. Writes + `/data/yolean/.y-cluster-seeded` last (so a crashed extract is + detectable on next boot). +4. k3s starts, scans `/var/lib/rancher/k3s/server/manifests/`, applies + any staged manifests. (For a v1 build there typically aren't any.) +5. Workloads come up against the now-populated `/data/yolean`. + +### What if the customer forgets to attach the volume? + +`fstab` carries `nofail`, so boot continues. `data_seed_check.sh` +sees `/data/yolean` is not a mountpoint, fails the unit, and k3s +stays down. 
sshd is unaffected (no transitive dependency), so the +customer SSHes in and reads: + +```sh +sudo journalctl -u y-cluster-data-seed.service -b +``` + +The journal carries the actionable resolution recipe (attach + label +the volume, reboot, or mount + restart the unit). + +### Hosting-automation bypass (NOT for customers) + +Hosting automation can override the mount-required gate by writing +`/run/y-cluster-seed-bypass` (tmpfs) before the seed unit runs. The +canonical path is via the appliance's user_data, which cloud-init +delivers via the NoCloud datasource (Hetzner Cloud, multipass, our +qemu provisioner, etc): + +```yaml +#cloud-config +write_files: + - path: /run/y-cluster-seed-bypass + permissions: '0644' + content: "" +``` + +When the bypass flag is present, the seed extracts into whatever +`/data/yolean` is (typically the boot disk's directory if the fstab +mount soft-failed). A sibling `/data/yolean/.y-cluster-seeded-via-bypass` +sentinel records that the bypass path was taken — the in-memory flag +itself is gone after the next reboot, but the marker on disk still +controls the seed unit's no-op decision. + +The customer never sets this. `/run` is tmpfs (no on-disk persistence), +and the only entity with cloud-init injection access is the entity +that creates the VM. Bare-metal / VMware / VirtualBox imports have +no cloud-init datasource at all by default, so the branch is +unreachable. + +## Phase 2: Upgrade (supplier side) + +How the supplier builds v(N+1), assuming customers exist on v(N). + +The build flow is the SAME provision → install → manifests add → +stop → prepare-export → export sequence as Phase 1. What's +different: + +- The supplier runs the v(N) testdata against the v(N+1) workload set, + exercising whatever schema/topic/initContainer changes need to be + smooth-migrated. +- Migration Jobs go in via `y-cluster manifests add migrate-vN-vN1-... `. + These accumulate across versions: a v0.6.0 build can stage *both* + the v0.4→v0.5 and the v0.5→v0.6 migration Jobs by name; k3s on the + customer side applies whichever ones haven't already run. +- The data seed re-built on this build represents v(N+1)'s baseline. + Customers with existing data ignore it (marker present → no-op); + fresh customers (a NEW customer importing for the first time on + v(N+1)) get v(N+1)'s baseline directly. + +### Migration Job authoring contract + +A migration Job is the supplier's vehicle for changing customer data +from v(N) to v(N+1). Shape: + +- `metadata.name` is the source-target version pair, e.g. + `migrate-v0.5.0-userdb-add-tenants`. K3s's apply-on-restart + semantics give one-time execution per name (already-Completed Jobs + with that name are not recreated). +- Pre-gated by an InitContainer that waits for the workloads it depends + on (`kubectl wait deployment/mariadb --for=condition=Available`). +- Idempotent in its own logic: the migration script checks for a marker + (a ConfigMap, or a file under `/data/yolean/.migrations/`) before + doing anything destructive. +- Optional: the Job's pod mounts `/data/yolean` directly (via a PVC + bound to a host-path, OR via a `hostPath` volume) when the + migration needs raw filesystem access AND the workloads are still + down. 
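+
+The "idempotent in its own logic" bullet above is the one the skeleton
+below can't show, so here is a minimal sketch of what the `/migrate.sh`
+entrypoint it invokes might look like, assuming the Job mounts
+`/data/yolean` as described in the optional bullet. The marker path,
+the mariadb host and the `ALTER TABLE` statement are illustrative
+assumptions, not part of the contract:
+
+```sh
+#!/bin/sh
+# Hypothetical /migrate.sh for migrate-v0.5.0-userdb-add-tenants.
+set -eu
+
+# File marker under the customer's persistent data dir; present means
+# this migration already completed on this cluster (a ConfigMap works
+# equally well as the marker).
+MARKER=/data/yolean/.migrations/v0.5.0-userdb-add-tenants
+
+if [ -f "$MARKER" ]; then
+  echo "migration already applied ($MARKER present); nothing to do"
+  exit 0
+fi
+
+# The destructive work goes here -- a hypothetical example:
+#   mysql -h mariadb -u root -p"$MARIADB_ROOT_PASSWORD" userdb \
+#     -e 'ALTER TABLE users ADD COLUMN tenant_id VARCHAR(64)'
+echo "applying ${FROM_VERSION:-?} -> ${TO_VERSION:-?} userdb tenant migration"
+
+# Marker is written LAST so a crashed run is retried on the next
+# apply instead of being silently recorded as done.
+mkdir -p "$(dirname "$MARKER")"
+date -u +%Y-%m-%dT%H:%M:%SZ > "$MARKER"
+```
+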
+ +Skeleton: + +```yaml +apiVersion: batch/v1 +kind: Job +metadata: + name: migrate-v0.5.0-userdb-add-tenants + namespace: customer-app +spec: + ttlSecondsAfterFinished: 3600 + template: + spec: + restartPolicy: OnFailure + initContainers: + - name: wait-mariadb + image: bitnami/kubectl + command: ["kubectl","wait","deployment/mariadb", + "--for=condition=Available","--timeout=5m", + "--namespace=customer-app"] + containers: + - name: migrate + image: + env: + - name: FROM_VERSION + value: v0.4.0 + - name: TO_VERSION + value: v0.5.0 + command: ["/migrate.sh"] +``` + +## Phase 3: Upgrade + +The customer swaps the appliance disk while keeping the data drive. + +### Customer side + +1. Power the appliance off (`shutdown`, ideally graceful — see drain + note in "Open considerations" if Galera-class StatefulSets are in + the stack). +2. Detach the v(N) appliance disk; attach the v(N+1) appliance disk. + The data drive at `/data/yolean` stays put. +3. Boot. +4. `y-cluster-data-seed.service` sees the marker on `/data/yolean`, + no-ops (the existing data is what we want preserved). +5. k3s starts, reads `/var/lib/rancher/k3s/server/manifests/`. New + migration Job names trigger; existing names are no-ops. +6. Workloads come up against the customer's preserved `/data/yolean`, + migration Jobs apply their changes. + +The customer issues no commands. + +### Rollback + +If a v(N+1) migration fails, the customer reattaches the v(N) disk +with the same data drive. The seed mechanism's marker-respect logic +means the data drive is untouched on either appliance. Workloads +resume against whatever state the migration left behind — which means +a partial / broken migration is on the supplier to design defensively +(idempotent + marker-gated; see Migration Job contract above). + +A more explicit rollback marker pattern is on the open list. + +## Mechanism 1: data-dir seeding (`y-cluster-data-seed.service`) + +### Problem + +The build cluster populates `/data/yolean/` with the build-time output +of init Jobs (database schemas, Kafka topic configs, file-backed PVs +from echo / VersityGW / customer workloads). The appliance disk ships +with that data on its boot filesystem. + +When the customer boots and mounts THEIR persistent data drive at +`/data/yolean`, the mount obscures the boot-disk's `/data/yolean`. The +customer's drive starts empty. Workloads find nothing; init Jobs that +were "already done" on the build side haven't run on the customer +side. Setup is lost. + +### Solution + +`prepare-export` snapshots `/data/yolean/` into +`/var/lib/y-cluster/data-seed.tar.zst` (which lives on the appliance +disk's root, NOT under `/data/yolean`, so it's not obscured by the +customer's mount). It also pre-bakes +`LABEL=y-cluster-data /data/yolean ext4 defaults,nofail 0 2` into +`/etc/fstab` so the customer's only attach step is `mkfs.ext4 -L +y-cluster-data /dev/...` — no fstab edit, no hypervisor-specific +device path. + +At boot, a oneshot systemd unit runs Before=k3s.service, +After=cloud-init.service, with the following decision (in order): + +1. **`/run/y-cluster-seed-bypass` exists** → bypass branch: extract + regardless of mount state, drop a sibling `.y-cluster-seeded-via-bypass` + sentinel. Hosting-automation only; customers never get here. +2. **`/data/yolean` is NOT a mountpoint** → fail the unit. k3s stays + down. The customer is meant to attach a labeled volume; the + journal explains how. Eliminates the customer-mounts-after-k3s + race (the original GCP-appliance failure mode). +3. 
**Marker `/data/yolean/.y-cluster-seeded` present** → no-op + (respect existing state, upgrade fast path). +4. **Mountpoint empty (excluding `lost+found`)** → extract the seed, + then write the marker. +5. **Mountpoint non-empty, no marker** → REFUSE TO SEED. Fail the + unit loudly. k3s does not start. + +sshd has no dependency on this unit and starts normally regardless +of seed outcome — the customer can always SSH in to recover. + +### Empty defined + +A directory is "empty" iff it has no entries other than `lost+found` +(the kernel creates this on every freshly-formatted ext4). Anything +else is a conflict — we won't clobber data the customer didn't tell us +about. + +### Marker + +`/data/yolean/.y-cluster-seeded` is JSON: + +```json +{ + "schemaVersion": 1, + "seeded_at": "2026-05-04T12:30:00Z", + "seeded_by": "y-cluster v0.4.0 (abc1234)", + "appliance_name": "appliance-gcp-build", + "seed_sha256": "sha256:c7e3...8a2f" +} +``` + +`seed_sha256` is the digest of `/var/lib/y-cluster/data-seed.tar.zst` +that was the source of this seed. Future appliance versions can +compare the customer's marker against the new seed's sha to detect +whether the data they're upgrading has the same baseline as the new +appliance was built against (decision input for migration jobs; see +Mechanism 2). + +### Never overwrites actual data + +Four layers, in order: + +1. **Marker check first.** Marker present → unconditional no-op. The + upgrade fast path. +2. **Conflict detection.** No marker + non-empty dir → fail unit, log + conflict-resolution recipes (see Troubleshooting). +3. **k3s blocks on the unit.** A drop-in adds + `Requires=y-cluster-data-seed.service` to k3s.service. If seed + fails, k3s won't start — the customer SSHes in and fixes the + situation instead of getting silent partial state. +4. **Marker is written LAST.** A crashed extract leaves no marker; the + next boot detects "non-empty without marker" → conflict mode. The + customer sees something's wrong instead of getting silent + half-seeded state. + +### Troubleshooting + +The customer-side troubleshooting surface, from least to most +intrusive: + +```sh +# What did seed do on the most recent boot? +sudo journalctl -u y-cluster-data-seed.service -b + +# Has the volume been seeded? When? By what? +cat /data/yolean/.y-cluster-seeded + +# Standalone status helper -- prints marker + seed + k3s state. +sudo /usr/local/bin/y-cluster-seed-status + +# Conflict mode: the unit's stderr lists the conflicting entries +# and the two recovery recipes: +# - if data is correct: write the marker manually +# echo '{"schemaVersion":1,...}' | sudo tee /data/yolean/.y-cluster-seeded +# sudo systemctl restart y-cluster-data-seed.service +# - if data is junk: wipe and re-seed +# sudo rm -rf /data/yolean/* /data/yolean/.[!.]* +# sudo systemctl restart y-cluster-data-seed.service +``` + +### Trade-off + +The seed tar adds `du -sh /data/yolean` (compressed via zstd) to the +appliance disk shipped to the customer. For a heavy build (kafka + +mariadb + keycloak + customer workloads with init data) that's +typically 1-3 GB on top of the boot disk. Mitigation if it becomes +painful: selective seeding of ONLY init markers and small config files +(workload entrypoints' detect-and-init logic re-creates the bulk on +first boot). + +## Mechanism 2: manifests staging (`y-cluster manifests add`) + +### Problem + +The appliance builder needs to ship Kubernetes manifests (typically +migration `Job`s, but also `ConfigMap`s, `Secret`s, etc.) 
that should +apply to the customer's cluster on its first boot — NOT on the build +cluster. + +Naive: `kubectl apply` during build → applies to the build cluster +immediately. Init Jobs run, write to /data/yolean. Migration Jobs that +expect "v0.4.0 schema" fail because the build cluster is freshly +initialized at v0.5.0 schema. Wrong cluster, wrong state. + +### Solution + +`y-cluster manifests add ` writes the manifest into a +staging directory on the cluster node: + +``` +/var/lib/y-cluster/manifests-staging/.yaml +``` + +This directory is NOT auto-applied by k3s. The build cluster doesn't +react. `prepare_inguest.sh` (run during prepare-export) moves the +staged manifests into k3s's auto-apply directory: + +``` +/var/lib/rancher/k3s/server/manifests/.yaml +``` + +On the customer's first boot, k3s scans `manifests/` and applies +everything. Subsequent boots re-apply (server-side apply is idempotent +for non-Job resources, and for Jobs k3s observes the existing +Completed state and doesn't recreate the pod). + +### `` semantics + +The name is the file basename (without `.yaml`). It MUST: +- Match `[a-zA-Z0-9][a-zA-Z0-9._-]*` (no path separators, no `..`). +- Not already exist in the staging directory (the subcommand bails). + +The name is also the source-of-truth identifier for the migration. We +recommend a versioned shape like `migrate-v0.5.0-userdb-add-tenants`. +An identical name in a future appliance build → idempotent re-apply +(no-op). A different name → new migration runs once. + +### Trade-off + +The customer's first boot of a new appliance applies EVERY staged +manifest at once. If a migration Job has a long pre-gate +(`kubectl wait` for slow-starting workloads), that delays the cluster +becoming "ready" for traffic. Mitigation: scope migration Jobs to +non-blocking work where possible; keep the appliance's own startup +fast and leave heavy data migrations to a workload-side scheduled +process. + +## What y-cluster owns vs. what the appliance builder owns + +| | y-cluster | appliance builder | +|---|---|---| +| Build the data seed | ✓ | | +| Boot-time seed extraction | ✓ | | +| Marker semantics, conflict detection | ✓ | | +| Stage manifests on the cluster filesystem | ✓ | | +| Move staged manifests on prepare-export | ✓ | | +| Decide WHAT to migrate | | ✓ | +| Author the migration Job manifest | | ✓ | +| Ship the migration container image | | ✓ | +| Choose the migration name (= idempotency key) | | ✓ | + +y-cluster does not invent a new migration framework. The Kubernetes +Job resource + idempotent name + InitContainer wait pattern is the +contract; y-cluster's job is just to make it possible to ship those +Jobs in an appliance disk that's been built ON ONE CLUSTER but +TARGETED AT ANOTHER. + +## Open considerations (not blocking the first cut) + +- **Selective seeding** — a flag on `prepare-export` to seed only + specific subdirs of `/data/yolean` (e.g., init markers and + config-only files). Lets workloads re-do bulk init on first boot to + shrink the appliance. +- **Migration ordering across multiple Jobs** — if multiple staged + Jobs need to run in order, the appliance builder uses K8s-native + dependency primitives (a wait-for-completion Init container on the + second Job). y-cluster doesn't try to model an ordering DAG itself. +- **Customer-side rollback marker** — today rollback = swap back to + the prior appliance disk. 
A more explicit "rollback marker" pattern + would let the supplier signal "this migration is reversible by + Job-X" and the customer trigger that without disk swap. diff --git a/cmd/y-cluster/manifests.go b/cmd/y-cluster/manifests.go index e2c6caa..0d6fb8f 100644 --- a/cmd/y-cluster/manifests.go +++ b/cmd/y-cluster/manifests.go @@ -33,8 +33,8 @@ against the build cluster. Typical use: ship a migration Job that runs once on the customer's first boot of a new appliance version. See -specs/y-cluster/APPLIANCE_UPGRADES.md for the recommended Job -shape and idempotency conventions.`, +APPLIANCE_MAINTENANCE.md for the recommended Job shape and +idempotency conventions.`, } cmd.AddCommand(manifestsAddCmd()) return cmd diff --git a/e2e/qemu_test.go b/e2e/qemu_test.go index 8fbded5..59006a0 100644 --- a/e2e/qemu_test.go +++ b/e2e/qemu_test.go @@ -242,13 +242,21 @@ func TestQemu_ExportImport(t *testing.T) { t.Fatal(err) } - // Export to VMDK - vmdkPath := cfg.CacheDir + "/appliance.vmdk" - if err := qemu.ExportVMDK(cluster.DiskPath(), vmdkPath); err != nil { + // Export to VMDK via the bundle-shaped API. The bundle dir + // gets .vmdk, -ssh{,.pub}, and a README.md. + bundleDir := filepath.Join(cfg.CacheDir, "bundle") + if err := qemu.Export(ctx, qemu.ExportOptions{ + CacheDir: cfg.CacheDir, + Name: cfg.Name, + BundleDir: bundleDir, + Format: qemu.FormatVMDK, + Logger: logger, + }); err != nil { t.Fatal(err) } + vmdkPath := filepath.Join(bundleDir, cfg.Name+".vmdk") if _, err := os.Stat(vmdkPath); err != nil { - t.Fatal("VMDK should exist after export") + t.Fatalf("VMDK should exist after export: %v", err) } t.Logf("exported VMDK: %s", vmdkPath) @@ -361,6 +369,204 @@ func TestQemu_StopStart(t *testing.T) { } } +// TestQemu_Seed_GateAndBypass exercises the data-seed mount-required +// gate end-to-end against a real qemu boot: +// +// - Provision a build cluster, plant a sentinel file under +// /data/yolean so the seed has something verifiable. +// - Stop, prepare-export (bakes the LABEL=y-cluster-data fstab +// entry, generates the seed tarball, lays down the systemd unit). +// - Boot the prepared disk in diagnostic mode -- StartForDiagnostic +// gives us a *Cluster without waiting for k3s, which we expect +// not to come up because no labeled volume is attached. +// - Assert: sshd works, the seed unit is in `failed` state, the +// journal mentions "not a mountpoint", k3s.service is NOT active. +// This is the regression posture for the GCP-appliance failure +// where the customer's volume mounted after k3s. +// - Inject /run/y-cluster-seed-bypass (the cloud-init-style hosting +// override; in this test we touch it directly via SSH) and +// restart the seed unit. Assert the seed extract ran, the +// sentinel is back under /data/yolean, the bypass sibling +// sentinel is present, and k3s reaches Ready after a restart. +// +// Covers states 3 (no volume, no bypass -> fail), 4 (no volume + +// bypass -> extract), and 7 (sshd unaffected by seed failure) of +// the 7-state seed-check matrix; states 1, 2, 5, 6 are unit-tested +// via the embedded shell script under pkg/provision/qemu. 
+func TestQemu_Seed_GateAndBypass(t *testing.T) { + if _, err := os.Stat("/dev/kvm"); err != nil { + t.Skip("QEMU tests require /dev/kvm") + } + if err := qemu.CheckPrerequisites(); err != nil { + t.Skip(err) + } + if _, err := exec.LookPath("virt-customize"); err != nil { + t.Skip("virt-customize not on PATH; install libguestfs-tools") + } + + logger, _ := zap.NewDevelopment() + cfg := e2eQEMURuntime() + cfg.Name = "y-cluster-e2e-seed-gate" + cfg.Context = "y-cluster-e2e-seed-gate" + cfg.CacheDir = t.TempDir() + cfg.Memory = "4096" + cfg.CPUs = "2" + cfg.SSHPort = "2227" + cfg.PortForwards = e2eUniqueForwards("26447") + cfg.Kubeconfig = os.Getenv("KUBECONFIG") + if cfg.Kubeconfig == "" { + t.Skip("KUBECONFIG must be set") + } + t.Setenv("Y_CLUSTER_QEMU_CACHE_DIR", cfg.CacheDir) + + ctx := context.Background() + + // 1. Build the appliance. + cluster, err := qemu.Provision(ctx, cfg, logger) + if err != nil { + t.Fatal(err) + } + // Cleanup runs even on test failure; teardown removes the disk + + // pidfile + ssh key. Idempotent against the second VM (cluster2) + // because it shares CacheDir/Name. + t.Cleanup(func() { _ = cluster.Teardown(false) }) + + // 2. Plant a sentinel under /data/yolean. PrepareExport's + // BuildSeedAssets snapshots /data/yolean into the tarball; the + // sentinel proves end-to-end that the bypass branch's extract + // actually wrote the build-time data back onto the customer + // side. + if out, err := cluster.SSH(ctx, "sudo mkdir -p /data/yolean && echo seed-sentinel-v1 | sudo tee /data/yolean/sentinel.txt >/dev/null"); err != nil { + t.Fatalf("plant sentinel: %s: %v", out, err) + } + + // 3. Stop the build cluster cleanly. + if err := qemu.Stop(cfg.CacheDir, cfg.Name, logger); err != nil { + t.Fatalf("Stop: %v", err) + } + + // 4. prepare-export: bake fstab + seed tarball + systemd unit. + if err := qemu.PrepareExport(ctx, cfg.CacheDir, cfg.Name, logger); err != nil { + t.Fatalf("PrepareExport: %v", err) + } + + // 5. Boot in diagnostic mode. k3s won't come up because the seed + // unit will fail (no volume attached, no bypass). The Cluster + // handle still gives us SSH against the running VM. + cluster2, err := qemu.StartForDiagnostic(ctx, cfg.CacheDir, cfg.Name, logger) + if err != nil { + t.Fatalf("StartForDiagnostic: %v", err) + } + + // 6. SSH works -- sshd has no transitive dep on the seed unit. + if out, err := cluster2.SSH(ctx, "hostname"); err != nil { + t.Fatalf("SSH after diagnostic boot (sshd should be unaffected by seed failure): %s: %v", out, err) + } + + // 7. Wait for the seed unit to settle. It's oneshot Before=k3s.service, + // runs early; expect it to be `failed` once cloud-init.service has + // completed and the gate has fired. + if state := waitForSeedState(t, ctx, cluster2, "failed", 90*time.Second); state != "failed" { + out, _ := cluster2.SSH(ctx, "sudo journalctl -u y-cluster-data-seed.service -b --no-pager") + t.Fatalf("seed unit never reached failed; last state=%q\njournal:\n%s", state, out) + } + + // 8. Journal carries the actionable error. + journalOut, err := cluster2.SSH(ctx, "sudo journalctl -u y-cluster-data-seed.service -b --no-pager") + if err != nil { + t.Fatalf("journalctl: %v", err) + } + if !strings.Contains(string(journalOut), "not a mountpoint") { + t.Errorf("journal missing 'not a mountpoint':\n%s", journalOut) + } + if !strings.Contains(string(journalOut), "LABEL=y-cluster-data") { + t.Errorf("journal missing LABEL hint (resolution recipe):\n%s", journalOut) + } + + // 9. k3s must NOT be active. 
Per the drop-in `Requires=` on the + // failed seed unit, k3s.service stays in "inactive (deps + // failed)" or similar. + k3sOut, _ := cluster2.SSH(ctx, "systemctl is-active k3s.service") + if state := strings.TrimSpace(string(k3sOut)); state == "active" { + t.Fatalf("k3s.service should not be active when seed gate fires; got: %q", state) + } + + // === Bypass path === + + // 10. Inject the bypass flag the way Hetzner QA cloud-init would, + // except via SSH for test simplicity. /run is tmpfs. + if out, err := cluster2.SSH(ctx, "sudo touch /run/y-cluster-seed-bypass"); err != nil { + t.Fatalf("touch bypass flag: %s: %v", out, err) + } + + // 11. Reset the failed state and restart the seed unit. With the + // bypass file in place, the script extracts regardless of + // mount status and exits 0. + if out, err := cluster2.SSH(ctx, "sudo systemctl reset-failed y-cluster-data-seed.service && sudo systemctl restart y-cluster-data-seed.service"); err != nil { + t.Fatalf("restart seed unit after bypass: %s: %v", out, err) + } + if state := waitForSeedState(t, ctx, cluster2, "active", 60*time.Second); state != "active" { + out, _ := cluster2.SSH(ctx, "sudo journalctl -u y-cluster-data-seed.service -b --no-pager") + t.Fatalf("seed unit never reached active after bypass; last state=%q\njournal:\n%s", state, out) + } + + // 12. Sentinel from build time must be back under /data/yolean + // (extracted from the seed tarball into the boot-disk dir). + sentOut, err := cluster2.SSH(ctx, "cat /data/yolean/sentinel.txt") + if err != nil { + t.Fatalf("read sentinel after bypass extract: %v", err) + } + if !strings.Contains(string(sentOut), "seed-sentinel-v1") { + t.Errorf("seed extract didn't restore sentinel; got: %s", sentOut) + } + + // 13. Bypass sibling-sentinel marks "we went down the bypass path" + // for forensic visibility. + if out, err := cluster2.SSH(ctx, "test -f /data/yolean/.y-cluster-seeded-via-bypass && echo present"); err != nil { + t.Errorf("bypass sentinel missing: %s: %v", out, err) + } else if !strings.Contains(string(out), "present") { + t.Errorf("bypass sentinel not present: %s", out) + } + + // 14. k3s.service's Requires is now satisfied; restart should + // bring it up. + if out, err := cluster2.SSH(ctx, "sudo systemctl reset-failed k3s.service 2>/dev/null; sudo systemctl restart k3s.service"); err != nil { + t.Fatalf("restart k3s after bypass: %s: %v", out, err) + } + + // 15. Wait for k3s to be Ready via in-VM kubectl (we don't import + // the kubeconfig in diagnostic mode, so use guest-side kubectl). + deadline := time.Now().Add(3 * time.Minute) + for time.Now().Before(deadline) { + nodesOut, _ := cluster2.SSH(ctx, "sudo k3s kubectl get nodes --no-headers 2>/dev/null || true") + if strings.Contains(string(nodesOut), "Ready") { + return + } + time.Sleep(3 * time.Second) + } + out, _ := cluster2.SSH(ctx, "sudo journalctl -u k3s.service -b --no-pager | tail -50") + t.Fatalf("k3s never reached Ready after bypass+restart\nk3s journal tail:\n%s", out) +} + +// waitForSeedState polls `systemctl is-active y-cluster-data-seed.service` +// against the VM until it reports `want` or the timeout fires. Returns +// the last observed state so the caller can include it in the failure +// message. 
+func waitForSeedState(t *testing.T, ctx context.Context, cluster *qemu.Cluster, want string, timeout time.Duration) string { + t.Helper() + deadline := time.Now().Add(timeout) + last := "" + for time.Now().Before(deadline) { + out, _ := cluster.SSH(ctx, "systemctl is-active y-cluster-data-seed.service 2>/dev/null || true") + last = strings.TrimSpace(string(out)) + if last == want { + return last + } + time.Sleep(2 * time.Second) + } + return last +} + // assertNodeReady polls `kubectl get nodes` against ctx until at // least one Ready node is reported, up to 2 minutes. Shared by // the lifecycle e2e legs. diff --git a/pkg/provision/qemu/data_seed.go b/pkg/provision/qemu/data_seed.go index c666c6c..aa5befd 100644 --- a/pkg/provision/qemu/data_seed.go +++ b/pkg/provision/qemu/data_seed.go @@ -18,7 +18,7 @@ import ( // Embedded assets that travel with the appliance disk and run // on the customer's first boot. See -// specs/y-cluster/APPLIANCE_UPGRADES.md for the design. +// APPLIANCE_MAINTENANCE.md for the design. //go:embed data_seed.service var dataSeedUnit string diff --git a/pkg/provision/qemu/data_seed.service b/pkg/provision/qemu/data_seed.service index 7b16d95..e2cc27d 100644 --- a/pkg/provision/qemu/data_seed.service +++ b/pkg/provision/qemu/data_seed.service @@ -1,10 +1,16 @@ [Unit] Description=y-cluster appliance data-dir first-boot seed DefaultDependencies=no -RequiresMountsFor=/data/yolean -After=local-fs.target systemd-tmpfiles-setup.service +After=local-fs.target systemd-tmpfiles-setup.service cloud-init.service Before=k3s.service ConditionPathExists=/var/lib/y-cluster/data-seed.tar.zst +# No RequiresMountsFor= for /data/yolean: with the fstab nofail +# entry the mount may legitimately not be present (Hetzner QA +# bypass shape, or a customer who forgot to attach their volume), +# and the script handles the missing-mount cases itself. A hard +# Requires would prevent the script from running and reporting +# the actionable failure. After=local-fs.target still ensures +# the fstab mount has been attempted before we look. [Service] Type=oneshot diff --git a/pkg/provision/qemu/data_seed_check.sh b/pkg/provision/qemu/data_seed_check.sh index 03507c5..6fe4ec6 100644 --- a/pkg/provision/qemu/data_seed_check.sh +++ b/pkg/provision/qemu/data_seed_check.sh @@ -1,22 +1,34 @@ #!/bin/sh # y-cluster-data-seed-check: extract the build-time /data/yolean -# snapshot onto a freshly-attached customer disk, or no-op when -# the appliance disk's own /data/yolean is in use, or refuse to -# clobber unrecognised data. +# snapshot onto a freshly-attached customer disk, no-op when an +# already-seeded marker is present, refuse to clobber unrecognised +# data, or fail closed when the customer forgot to attach their +# data volume. +# +# Run by y-cluster-data-seed.service (oneshot) Before=k3s.service, +# After=cloud-init.service. The cloud-init ordering matters because +# hosting automation can write /run/y-cluster-seed-bypass via a +# user_data write_files entry; this unit must run AFTER cloud-init +# has had a chance to drop that file. # -# Run by y-cluster-data-seed.service (oneshot) Before=k3s.service. # k3s.service has a Requires= drop-in pointing here so a failure # blocks the cluster from coming up -- the customer SSHes in, # reads the journal, fixes the situation, and either restarts -# this unit or starts k3s manually. +# this unit or starts k3s manually. sshd is unaffected by this +# unit's failure (no transitive dependency). 
# -# Decision table: -# /data/yolean is NOT a separate mount -> no-op -# marker present -> no-op -# no marker, dir empty (excl. lost+found) -> extract seed -# no marker, dir non-empty -> fail (conflict) +# Decision table (in order): +# /run/y-cluster-seed-bypass exists -> bypass: extract +# regardless of mount +# /data/yolean is NOT a mountpoint -> fail (production: +# customer must attach +# the data volume) +# marker present -> no-op (upgrade fast +# path) +# mountpoint empty (excl. lost+found) -> extract, write marker +# mountpoint non-empty, no marker -> fail (conflict) # -# See specs/y-cluster/APPLIANCE_UPGRADES.md for the full design. +# See APPLIANCE_MAINTENANCE.md for the full lifecycle design. set -eu @@ -24,13 +36,52 @@ MOUNT=/data/yolean SEED=/var/lib/y-cluster/data-seed.tar.zst META=/var/lib/y-cluster/data-seed.meta.json MARKER="$MOUNT/.y-cluster-seeded" +BYPASS_FLAG=/run/y-cluster-seed-bypass -# 1. Not a separate mount -> data lives on the boot disk, already -# populated by the appliance build. No customer drive attached; -# nothing to seed. We let k3s come up against the boot-disk data. -if ! mountpoint -q "$MOUNT" 2>/dev/null; then - echo "y-cluster-data-seed: $MOUNT is not a separate mount; using appliance boot-disk data, no seed needed." - exit 0 +# 0. Bypass: a tmpfs flag set by hosting automation (cloud-init +# write_files via the server's user_data). Only present when an +# entity with hosting-API access deliberately set it. The customer +# importing onto VMware / VirtualBox / bare metal has no cloud-init +# datasource available, so this branch is unreachable for them. +# Used for the Hetzner QA path where attaching a labeled volume per +# server is awkward; we ship the same bundle and bypass at import. +if [ -e "$BYPASS_FLAG" ]; then + echo "y-cluster-data-seed: bypass flag $BYPASS_FLAG present; extracting regardless of mount state." + BYPASSED=1 +else + BYPASSED=0 +fi + +# 1. Mount required (unless bypassed). prepare-export pre-bakes +# `LABEL=y-cluster-data /data/yolean ext4 defaults,nofail 0 2` in +# /etc/fstab, so any production appliance expects a customer-supplied +# volume. Failing closed here eliminates the customer-mounts-after-k3s +# race we hit on the GCP appliance (redpanda PVC permission-denied, +# mariadb missing grastate.dat). +if [ "$BYPASSED" = "0" ]; then + if ! mountpoint -q "$MOUNT" 2>/dev/null; then + cat >&2 < + Then reboot, OR mount manually and restart this unit: + sudo mount /data/yolean + sudo systemctl restart y-cluster-data-seed.service + - If you intentionally want this appliance to use the boot disk's + /data/yolean (no separate volume), this is a hosting-automation + concern, not a customer one -- inject $BYPASS_FLAG via cloud-init + user_data write_files at provision time. +EOF + exit 1 + fi fi # 2. Marker present -> the customer's drive has been seeded before @@ -50,7 +101,7 @@ if [ -n "$ENTRIES" ]; then cat >&2 <&2 echo "y-cluster-data-seed: cannot proceed -- mark $MARKER manually if the empty mount is intentional." >&2 @@ -86,6 +137,10 @@ if [ ! -f "$META" ]; then exit 1 fi +# In bypass mode the path may not exist yet on disk because the +# fstab mount soft-failed. mkdir -p is a no-op if it already exists. +mkdir -p "$MOUNT" + echo "y-cluster-data-seed: extracting $SEED to $MOUNT" zstdcat "$SEED" | tar -C "$MOUNT" -xpf - echo "y-cluster-data-seed: extracted." @@ -93,10 +148,15 @@ echo "y-cluster-data-seed: extracted." # 5. Write the marker LAST. 
A crashed extract leaves no marker, so # the next boot detects "non-empty without marker" -> conflict mode # (case 3) and surfaces the problem instead of silently retrying. -# We copy the build-time meta verbatim; it carries the seed sha, -# build version, and timestamp. +# We copy the build-time meta verbatim; in bypass mode we drop a +# sibling sentinel so a future operator can tell at a glance the +# seed went down the bypass path (the BYPASS_FLAG itself is tmpfs +# and gone after the next reboot). cp "$META" "$MARKER" chmod 0644 "$MARKER" +if [ "$BYPASSED" = "1" ]; then + touch "$MOUNT/.y-cluster-seeded-via-bypass" +fi echo "y-cluster-data-seed: seeded $MOUNT successfully." cat "$MARKER" diff --git a/pkg/provision/qemu/data_seed_test.go b/pkg/provision/qemu/data_seed_test.go index 9e24bec..ccc146d 100644 --- a/pkg/provision/qemu/data_seed_test.go +++ b/pkg/provision/qemu/data_seed_test.go @@ -9,17 +9,20 @@ import ( "testing" ) -// runSeedCheck executes the embedded data_seed_check.sh against a -// fake $MOUNT and $SEED setup the caller wires up under tmp. -// -// The script is hardcoded to /data/yolean / /var/lib/y-cluster paths. -// We override them by writing a tiny wrapper that exports the same -// names as shell environment variables and then sources the original -// script with sed-substituted constants. Cheaper than refactoring the -// boot-time script to take env-var paths -- the boot-time script is -// what runs on a real customer machine and shouldn't grow knobs we -// don't need there. -func runSeedCheck(t *testing.T, mount, seed, meta string) (stdout, stderr string, exit int) { +type seedCheckOpts struct { + mount string // -> $MOUNT + seed string // -> $SEED + meta string // -> $META + bypass string // -> $BYPASS_FLAG (defaults to a tmpdir non-existent path; create the file before running to exercise the bypass branch) + forceMount bool // when true, override mountpoint -q to always succeed (simulate "/data/yolean is a mountpoint") +} + +// runSeedCheck executes the embedded data_seed_check.sh against +// caller-supplied paths. The boot-time script hardcodes +// /data/yolean / /var/lib/y-cluster / /run for production; tests +// override each path via sed substitution so we can exercise the +// real branches without root or a real mount. +func runSeedCheck(t *testing.T, opts seedCheckOpts) (stdout, stderr string, exit int) { t.Helper() if runtime.GOOS == "windows" { t.Skip("seed-check is /bin/sh-only") @@ -28,27 +31,42 @@ func runSeedCheck(t *testing.T, mount, seed, meta string) (stdout, stderr string t.Skip("zstd not on PATH") } - // Materialise the script with path substitutions. + // Default the bypass path to something that won't exist unless + // the test explicitly creates it. Tests that want to exercise + // the bypass branch set opts.bypass to a path AND `touch` it. + bypass := opts.bypass + if bypass == "" { + bypass = filepath.Join(t.TempDir(), "no-bypass-flag") + } + + // Path substitutions on the production script. Each replacement + // is anchored to the constant assignment line so a future + // renaming of the literal doesn't silently break the test. src := dataSeedCheckScript - src = strings.Replace(src, "MOUNT=/data/yolean", "MOUNT="+mount, 1) - src = strings.Replace(src, "SEED=/var/lib/y-cluster/data-seed.tar.zst", "SEED="+seed, 1) - src = strings.Replace(src, "META=/var/lib/y-cluster/data-seed.meta.json", "META="+meta, 1) - // The script's mountpoint check uses `mountpoint -q` against - // MOUNT. 
tmp dirs are not mountpoints, so the test would - // always exit early at "not a separate mount". Replace the - // guard with a TEST_FORCE_MOUNT env-var override so we can - // exercise the real branches. - src = strings.Replace(src, - `if ! mountpoint -q "$MOUNT" 2>/dev/null; then`, - `if [ -z "${TEST_FORCE_MOUNT:-}" ] && ! mountpoint -q "$MOUNT" 2>/dev/null; then`, - 1) + src = strings.Replace(src, "MOUNT=/data/yolean", "MOUNT="+opts.mount, 1) + src = strings.Replace(src, "SEED=/var/lib/y-cluster/data-seed.tar.zst", "SEED="+opts.seed, 1) + src = strings.Replace(src, "META=/var/lib/y-cluster/data-seed.meta.json", "META="+opts.meta, 1) + src = strings.Replace(src, "BYPASS_FLAG=/run/y-cluster-seed-bypass", "BYPASS_FLAG="+bypass, 1) + + // The mountpoint check uses `mountpoint -q` against MOUNT. tmp + // dirs aren't mountpoints, so any test exercising "the mount IS + // present" (states 1, 2, 5) needs to short-circuit the check. + // We slip an env-var override in front of the original guard. + if opts.forceMount { + src = strings.Replace(src, + `if ! mountpoint -q "$MOUNT" 2>/dev/null; then`, + `if [ -z "${TEST_FORCE_MOUNT:-}" ] && ! mountpoint -q "$MOUNT" 2>/dev/null; then`, + 1) + } scriptPath := filepath.Join(t.TempDir(), "seed-check.sh") if err := os.WriteFile(scriptPath, []byte(src), 0o755); err != nil { t.Fatal(err) } cmd := exec.Command("/bin/sh", scriptPath) - cmd.Env = append(os.Environ(), "TEST_FORCE_MOUNT=1") + if opts.forceMount { + cmd.Env = append(os.Environ(), "TEST_FORCE_MOUNT=1") + } var sob, seb strings.Builder cmd.Stdout = &sob cmd.Stderr = &seb @@ -62,8 +80,7 @@ func runSeedCheck(t *testing.T, mount, seed, meta string) (stdout, stderr string } // makeSeedTar writes a small tar.zst at seedPath whose contents are -// the entries (path -> body) given. The sha256 of the resulting -// tarball is returned for verification. +// the entries (path -> body) given. func makeSeedTar(t *testing.T, seedPath string, entries map[string]string) { t.Helper() if _, err := exec.LookPath("zstd"); err != nil { @@ -82,7 +99,6 @@ func makeSeedTar(t *testing.T, seedPath string, entries map[string]string) { t.Fatal(err) } } - // tar -C contentDir -cf - . | zstd > seedPath tarCmd := exec.Command("tar", "-C", contentDir, "-cf", "-", ".") tarOut, err := tarCmd.StdoutPipe() if err != nil { @@ -107,102 +123,86 @@ func makeSeedTar(t *testing.T, seedPath string, entries map[string]string) { } } -// TestSeedCheck_MarkerPresent_NoOp pins the upgrade fast path: the -// customer's drive already has a marker, we respect it and exit 0. -func TestSeedCheck_MarkerPresent_NoOp(t *testing.T) { +// fixture sets up a (mount, seed, meta) triple under t.TempDir() +// so individual tests stay focused on the assertion shape, not the +// boilerplate. 
+type fixture struct { + dir string + mount string + seed string + meta string +} + +func newFixture(t *testing.T) fixture { + t.Helper() dir := t.TempDir() - mount := filepath.Join(dir, "mount") - if err := os.MkdirAll(mount, 0o755); err != nil { - t.Fatal(err) - } - if err := os.WriteFile(filepath.Join(mount, ".y-cluster-seeded"), - []byte(`{"schemaVersion":1,"existing":"marker"}`), 0o644); err != nil { - t.Fatal(err) + f := fixture{ + dir: dir, + mount: filepath.Join(dir, "mount"), + seed: filepath.Join(dir, "seed.tar.zst"), + meta: filepath.Join(dir, "seed.meta.json"), } - if err := os.WriteFile(filepath.Join(mount, "existing-data.txt"), - []byte("customer's data"), 0o644); err != nil { + if err := os.MkdirAll(f.mount, 0o755); err != nil { t.Fatal(err) } - - stdout, _, exit := runSeedCheck(t, mount, "/nonexistent/seed", "/nonexistent/meta") - if exit != 0 { - t.Errorf("exit: got %d, want 0; stdout=%q", exit, stdout) - } - if !strings.Contains(stdout, "marker present") { - t.Errorf("expected 'marker present' in stdout, got: %s", stdout) - } - // Existing data must be untouched. - body, _ := os.ReadFile(filepath.Join(mount, "existing-data.txt")) - if string(body) != "customer's data" { - t.Errorf("existing data mutated: %q", body) - } + return f } -// TestSeedCheck_EmptyMount_Seeds covers the green path: customer -// attached an empty drive, we extract the seed and write the marker. +// State 1: volume attached, empty mount -> seed extracts, marker written. func TestSeedCheck_EmptyMount_Seeds(t *testing.T) { - dir := t.TempDir() - mount := filepath.Join(dir, "mount") - if err := os.MkdirAll(mount, 0o755); err != nil { + f := newFixture(t) + if err := os.MkdirAll(filepath.Join(f.mount, "lost+found"), 0o755); err != nil { t.Fatal(err) } - // lost+found is allowed and must be ignored. - if err := os.MkdirAll(filepath.Join(mount, "lost+found"), 0o755); err != nil { - t.Fatal(err) - } - - seed := filepath.Join(dir, "seed.tar.zst") - meta := filepath.Join(dir, "seed.meta.json") - makeSeedTar(t, seed, map[string]string{ + makeSeedTar(t, f.seed, map[string]string{ "workload-data/db.txt": "schema=v0.4.0", }) - if err := os.WriteFile(meta, + if err := os.WriteFile(f.meta, []byte(`{"schemaVersion":1,"seed_sha256":"sha256:fake"}`), 0o644); err != nil { t.Fatal(err) } - stdout, stderr, exit := runSeedCheck(t, mount, seed, meta) + stdout, stderr, exit := runSeedCheck(t, seedCheckOpts{ + mount: f.mount, seed: f.seed, meta: f.meta, forceMount: true, + }) if exit != 0 { t.Fatalf("exit: got %d, want 0; stdout=%q stderr=%q", exit, stdout, stderr) } - body, err := os.ReadFile(filepath.Join(mount, "workload-data/db.txt")) + body, err := os.ReadFile(filepath.Join(f.mount, "workload-data/db.txt")) if err != nil { t.Fatalf("seed file should be extracted: %v", err) } if string(body) != "schema=v0.4.0" { t.Errorf("extracted body: got %q, want schema=v0.4.0", body) } - markerBody, err := os.ReadFile(filepath.Join(mount, ".y-cluster-seeded")) + markerBody, err := os.ReadFile(filepath.Join(f.mount, ".y-cluster-seeded")) if err != nil { t.Fatalf("marker should be written: %v", err) } if !strings.Contains(string(markerBody), "seed_sha256") { t.Errorf("marker should contain seed metadata: %s", markerBody) } + // Bypass-sentinel must NOT exist in the production-mount path. 
+ if _, err := os.Stat(filepath.Join(f.mount, ".y-cluster-seeded-via-bypass")); err == nil { + t.Errorf("bypass sentinel should not exist on a mounted-volume seed") + } } -// TestSeedCheck_NonEmptyNoMarker_Conflict pins the safety belt: -// customer drive has unrelated data, no marker -> we refuse and -// exit non-zero so the k3s drop-in blocks startup. +// State 2: volume attached, has unmarked data -> conflict, no seed. func TestSeedCheck_NonEmptyNoMarker_Conflict(t *testing.T) { - dir := t.TempDir() - mount := filepath.Join(dir, "mount") - if err := os.MkdirAll(mount, 0o755); err != nil { - t.Fatal(err) - } - if err := os.WriteFile(filepath.Join(mount, "customer-stuff.txt"), + f := newFixture(t) + if err := os.WriteFile(filepath.Join(f.mount, "customer-stuff.txt"), []byte("not ours"), 0o644); err != nil { t.Fatal(err) } - - seed := filepath.Join(dir, "seed.tar.zst") - meta := filepath.Join(dir, "seed.meta.json") - makeSeedTar(t, seed, map[string]string{"x": "y"}) - if err := os.WriteFile(meta, []byte(`{"schemaVersion":1}`), 0o644); err != nil { + makeSeedTar(t, f.seed, map[string]string{"x": "y"}) + if err := os.WriteFile(f.meta, []byte(`{"schemaVersion":1}`), 0o644); err != nil { t.Fatal(err) } - stdout, stderr, exit := runSeedCheck(t, mount, seed, meta) + stdout, stderr, exit := runSeedCheck(t, seedCheckOpts{ + mount: f.mount, seed: f.seed, meta: f.meta, forceMount: true, + }) if exit == 0 { t.Errorf("exit: got 0, want non-zero (conflict); stdout=%q", stdout) } @@ -212,33 +212,127 @@ func TestSeedCheck_NonEmptyNoMarker_Conflict(t *testing.T) { if !strings.Contains(stderr, "Resolution") { t.Errorf("stderr should include recovery recipes: %s", stderr) } - // Customer file must be untouched. - body, _ := os.ReadFile(filepath.Join(mount, "customer-stuff.txt")) + body, _ := os.ReadFile(filepath.Join(f.mount, "customer-stuff.txt")) if string(body) != "not ours" { t.Errorf("customer file mutated: %q", body) } - // Marker must NOT have been written. - if _, err := os.Stat(filepath.Join(mount, ".y-cluster-seeded")); err == nil { + if _, err := os.Stat(filepath.Join(f.mount, ".y-cluster-seeded")); err == nil { t.Errorf("marker should not exist after conflict") } } -// TestSeedCheck_LostFoundIgnored covers the freshly-formatted ext4 -// case: lost+found exists but isn't customer data. +// State 3: no volume, no bypass -> production gate fails closed. +// This is the regression posture for the customer-mounts-after-k3s +// race we hit on the GCP appliance. +func TestSeedCheck_NotMounted_NoBypass_Fails(t *testing.T) { + f := newFixture(t) + makeSeedTar(t, f.seed, map[string]string{"x": "y"}) + if err := os.WriteFile(f.meta, []byte(`{"schemaVersion":1}`), 0o644); err != nil { + t.Fatal(err) + } + + stdout, stderr, exit := runSeedCheck(t, seedCheckOpts{ + mount: f.mount, seed: f.seed, meta: f.meta, + // forceMount: false -- the tmp dir really isn't a mountpoint. 
+ }) + if exit == 0 { + t.Fatalf("exit: got 0, want non-zero (mount required); stdout=%q stderr=%q", stdout, stderr) + } + if !strings.Contains(stderr, "not a mountpoint") { + t.Errorf("stderr should mention missing mountpoint: %s", stderr) + } + if !strings.Contains(stderr, "LABEL=y-cluster-data") { + t.Errorf("stderr should reference the LABEL fstab convention: %s", stderr) + } + if _, err := os.Stat(filepath.Join(f.mount, ".y-cluster-seeded")); err == nil { + t.Errorf("marker should not exist when mount-required gate fires") + } +} + +// State 4: no volume + bypass flag -> extract regardless of mount, +// drop sibling sentinel marking the bypass. +func TestSeedCheck_BypassFlag_Extracts(t *testing.T) { + f := newFixture(t) + bypass := filepath.Join(f.dir, "bypass-flag") + if err := os.WriteFile(bypass, nil, 0o644); err != nil { + t.Fatal(err) + } + makeSeedTar(t, f.seed, map[string]string{"hello.txt": "world"}) + if err := os.WriteFile(f.meta, []byte(`{"schemaVersion":1}`), 0o644); err != nil { + t.Fatal(err) + } + + stdout, stderr, exit := runSeedCheck(t, seedCheckOpts{ + mount: f.mount, seed: f.seed, meta: f.meta, bypass: bypass, + // forceMount: false on purpose -- the bypass branch must + // short-circuit the mount-required gate. + }) + if exit != 0 { + t.Fatalf("exit: got %d, want 0; stdout=%q stderr=%q", exit, stdout, stderr) + } + if !strings.Contains(stdout, "bypass flag") { + t.Errorf("stdout should announce bypass: %s", stdout) + } + body, err := os.ReadFile(filepath.Join(f.mount, "hello.txt")) + if err != nil { + t.Fatalf("seed should have been extracted in bypass mode: %v", err) + } + if string(body) != "world" { + t.Errorf("extracted body: got %q, want world", body) + } + if _, err := os.Stat(filepath.Join(f.mount, ".y-cluster-seeded")); err != nil { + t.Errorf("marker should be written even in bypass mode: %v", err) + } + if _, err := os.Stat(filepath.Join(f.mount, ".y-cluster-seeded-via-bypass")); err != nil { + t.Errorf("bypass sentinel should be present: %v", err) + } +} + +// State 5: marker present -> upgrade fast path, no-op. +func TestSeedCheck_MarkerPresent_NoOp(t *testing.T) { + f := newFixture(t) + if err := os.WriteFile(filepath.Join(f.mount, ".y-cluster-seeded"), + []byte(`{"schemaVersion":1,"existing":"marker"}`), 0o644); err != nil { + t.Fatal(err) + } + if err := os.WriteFile(filepath.Join(f.mount, "existing-data.txt"), + []byte("customer's data"), 0o644); err != nil { + t.Fatal(err) + } + + stdout, _, exit := runSeedCheck(t, seedCheckOpts{ + mount: f.mount, seed: "/nonexistent/seed", meta: "/nonexistent/meta", + forceMount: true, + }) + if exit != 0 { + t.Errorf("exit: got %d, want 0; stdout=%q", exit, stdout) + } + if !strings.Contains(stdout, "marker present") { + t.Errorf("expected 'marker present' in stdout, got: %s", stdout) + } + body, _ := os.ReadFile(filepath.Join(f.mount, "existing-data.txt")) + if string(body) != "customer's data" { + t.Errorf("existing data mutated: %q", body) + } +} + +// State 6: lost+found ignored on freshly-formatted ext4. The kernel +// creates lost+found on every mkfs.ext4, so a "fresh empty" volume +// is actually never empty; the script must treat lost+found as +// non-content. 
func TestSeedCheck_LostFoundIgnored(t *testing.T) { - dir := t.TempDir() - mount := filepath.Join(dir, "mount") - if err := os.MkdirAll(filepath.Join(mount, "lost+found"), 0o755); err != nil { + f := newFixture(t) + if err := os.MkdirAll(filepath.Join(f.mount, "lost+found"), 0o755); err != nil { t.Fatal(err) } - seed := filepath.Join(dir, "seed.tar.zst") - meta := filepath.Join(dir, "seed.meta.json") - makeSeedTar(t, seed, map[string]string{"hello.txt": "world"}) - if err := os.WriteFile(meta, []byte(`{"schemaVersion":1}`), 0o644); err != nil { + makeSeedTar(t, f.seed, map[string]string{"hello.txt": "world"}) + if err := os.WriteFile(f.meta, []byte(`{"schemaVersion":1}`), 0o644); err != nil { t.Fatal(err) } - _, _, exit := runSeedCheck(t, mount, seed, meta) + _, _, exit := runSeedCheck(t, seedCheckOpts{ + mount: f.mount, seed: f.seed, meta: f.meta, forceMount: true, + }) if exit != 0 { t.Errorf("lost+found should be ignored; exit=%d", exit) } diff --git a/pkg/provision/qemu/k3s_data_seed.conf b/pkg/provision/qemu/k3s_data_seed.conf index 2f6efa6..528592d 100644 --- a/pkg/provision/qemu/k3s_data_seed.conf +++ b/pkg/provision/qemu/k3s_data_seed.conf @@ -3,6 +3,6 @@ # mode -- /data/yolean has unmarked customer data) k3s does NOT # start; the customer SSHes in, runs y-cluster-seed-status, applies # the recommended recovery, then `systemctl start k3s.service`. -# See specs/y-cluster/APPLIANCE_UPGRADES.md. +# See APPLIANCE_MAINTENANCE.md. Requires=y-cluster-data-seed.service After=y-cluster-data-seed.service diff --git a/pkg/provision/qemu/lifecycle.go b/pkg/provision/qemu/lifecycle.go index 0557eb8..01f4498 100644 --- a/pkg/provision/qemu/lifecycle.go +++ b/pkg/provision/qemu/lifecycle.go @@ -154,6 +154,45 @@ func guestPoweroff(cacheDir, name string, pid int, logger *zap.Logger) error { // then re-imports the kubeconfig so the host-side context is // fresh even if it was cleaned while the cluster was down. func Start(ctx context.Context, cacheDir, name string, logger *zap.Logger) (*Cluster, error) { + c, err := startVMReady(ctx, cacheDir, name, logger) + if err != nil { + return nil, err + } + if logger == nil { + logger = zap.NewNop() + } + logger.Info("VM up; waiting for k3s") + if err := c.waitForK3sReady(ctx); err != nil { + return nil, fmt.Errorf("wait for k3s: %w", err) + } + + rawKubeconfig, err := c.extractKubeconfig(ctx) + if err != nil { + return nil, fmt.Errorf("extract kubeconfig: %w", err) + } + if err := c.Kubeconfig.Import(rawKubeconfig); err != nil { + return nil, fmt.Errorf("merge kubeconfig: %w", err) + } + logger.Info("k3s ready", zap.String("context", c.cfg.Context)) + return c, nil +} + +// StartForDiagnostic boots the VM from saved state and waits for SSH +// but does NOT wait for k3s readiness. Used when the appliance is in +// a state that intentionally blocks k3s (e.g., the data-seed unit +// failed because the customer hasn't attached their /data/yolean +// volume) and the operator -- or a test -- needs to SSH in to +// inspect the journal and recover. The returned *Cluster has SSH / +// SCP wired up; Kubeconfig is initialised but no kubeconfig has +// been imported. +func StartForDiagnostic(ctx context.Context, cacheDir, name string, logger *zap.Logger) (*Cluster, error) { + return startVMReady(ctx, cacheDir, name, logger) +} + +// startVMReady is the prefix shared by Start and StartForDiagnostic: +// load state, boot the VM, wait for SSH. Anything that requires k3s +// to be up belongs in Start, not here. 
+func startVMReady(ctx context.Context, cacheDir, name string, logger *zap.Logger) (*Cluster, error) { if logger == nil { logger = zap.NewNop() } @@ -193,19 +232,6 @@ func Start(ctx context.Context, cacheDir, name string, logger *zap.Logger) (*Clu if err := c.waitForSSH(ctx); err != nil { return nil, fmt.Errorf("wait for SSH: %w", err) } - logger.Info("VM up; waiting for k3s") - if err := c.waitForK3sReady(ctx); err != nil { - return nil, fmt.Errorf("wait for k3s: %w", err) - } - - rawKubeconfig, err := c.extractKubeconfig(ctx) - if err != nil { - return nil, fmt.Errorf("extract kubeconfig: %w", err) - } - if err := kubecfg.Import(rawKubeconfig); err != nil { - return nil, fmt.Errorf("merge kubeconfig: %w", err) - } - logger.Info("k3s ready", zap.String("context", cfg.Context)) return c, nil } diff --git a/pkg/provision/qemu/prepare_export_test.go b/pkg/provision/qemu/prepare_export_test.go index 23bbde7..8b987e4 100644 --- a/pkg/provision/qemu/prepare_export_test.go +++ b/pkg/provision/qemu/prepare_export_test.go @@ -90,6 +90,30 @@ func TestPrepareInguestScript_CloudInitClean(t *testing.T) { } } +// TestPrepareInguestScript_FstabLabel pins the fstab pre-bake. +// LABEL-based mounting is the cross-hypervisor universal so the +// customer doesn't have to edit fstab on VMware / VirtualBox / +// Hetzner / GCP -- they just attach an ext4 volume labeled +// y-cluster-data. nofail keeps boot moving when the volume isn't +// attached; data_seed_check.sh's mount-required gate then surfaces +// the actionable failure. The seed-check resolution text refers +// to this same LABEL, so the two must stay in sync. +func TestPrepareInguestScript_FstabLabel(t *testing.T) { + body := PrepareInguestScript() + for _, want := range []string{ + `LABEL=y-cluster-data`, + `/data/yolean ext4`, + `nofail`, + } { + if !strings.Contains(body, want) { + t.Errorf("prepare-inguest script missing %q:\n%s", want, body) + } + } + if !strings.Contains(body, "grep -q 'LABEL=y-cluster-data' /etc/fstab") { + t.Errorf("fstab pre-bake must be idempotency-guarded by a grep:\n%s", body) + } +} + // TestPrepareInguestScript_DisablesCloudInitNetworkConfig pins // the cfg drop that prevents cloud-init from regenerating // /etc/netplan/50-cloud-init.yaml on the imported VM's first diff --git a/pkg/provision/qemu/prepare_inguest.sh b/pkg/provision/qemu/prepare_inguest.sh index 77ab1dd..c7be6a7 100644 --- a/pkg/provision/qemu/prepare_inguest.sh +++ b/pkg/provision/qemu/prepare_inguest.sh @@ -88,7 +88,39 @@ rm -f /var/lib/systemd/random-seed rm -f /var/lib/dhcp/dhclient.leases apt-get clean -# 5. Enable wall-clock sync at first boot. Without this, an +# 5a. Clear /data/yolean on the boot disk now that BuildSeedAssets +# (host-side, BEFORE virt-customize started) has snapshotted its +# content into /var/lib/y-cluster/data-seed.tar.zst. Two reasons: +# +# - Production: the customer's persistent volume mounts at +# /data/yolean on first boot and shadows the boot-disk dir, so +# the bytes here are dead weight. fstrim later reclaims the +# freed blocks; the appliance image ships smaller. +# - Bypass (Hetzner QA): no labeled volume attached, so the boot +# disk's /data/yolean IS the seed target. data_seed_check.sh's +# conflict-detection branch refuses to overwrite unmarked files; +# clearing here means the bypass extract goes into an empty dir +# and the marker writes cleanly. +# +# The dir itself is preserved (recreated) so the fstab mount has a +# mountpoint to attach to and seed-check has somewhere to extract +# into. 
+mkdir -p /data/yolean +rm -rf /data/yolean/* /data/yolean/.[!.]* 2>/dev/null || true + +# 5c. Pre-bake the customer's persistent /data/yolean fstab entry. +# The customer attaches an ext4 volume labeled "y-cluster-data" to +# the imported VM; cloud-agnostic LABEL= mounting means VMware / +# VirtualBox / Hetzner / GCP all recognise the same volume without +# the customer editing fstab themselves. nofail keeps boot moving +# even if the volume isn't attached -- y-cluster-data-seed.service +# fails closed in that case and surfaces the actionable error. +# Idempotent: re-running prepare-export doesn't dupe the entry. +if ! grep -q 'LABEL=y-cluster-data' /etc/fstab; then + echo 'LABEL=y-cluster-data /data/yolean ext4 defaults,nofail 0 2' >> /etc/fstab +fi + +# 5d. Enable wall-clock sync at first boot. Without this, an # imported VM whose RTC was set by the host clock can boot # minutes-to-hours away from real UTC, and k3s's TLS certs # (NotBefore = build time) read as "not yet valid", which @@ -112,7 +144,7 @@ systemctl enable systemd-timesyncd.service # appliance therefore runs every staged manifest against THEIR # cluster (e.g., migration Jobs). # -# See specs/y-cluster/APPLIANCE_UPGRADES.md. +# See APPLIANCE_MAINTENANCE.md. if [ -d /var/lib/y-cluster/manifests-staging ] \ && [ "$(ls -A /var/lib/y-cluster/manifests-staging 2>/dev/null)" ]; then mkdir -p /var/lib/rancher/k3s/server/manifests From cba461e4ed42d0b835412a85327932016bf20708 Mon Sep 17 00:00:00 2001 From: Yolean k8s-qa Date: Thu, 7 May 2026 09:42:57 +0000 Subject: [PATCH 2/7] test(e2e/qemu): align local gateway access on port 80 --- e2e/qemu_test.go | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/e2e/qemu_test.go b/e2e/qemu_test.go index 59006a0..4323b2e 100644 --- a/e2e/qemu_test.go +++ b/e2e/qemu_test.go @@ -65,11 +65,19 @@ func e2eQEMURuntime() qemu.Config { } // e2eUniqueForwards builds a port-forward list that won't collide -// with another e2e test running on the same machine. Required since -// Provision now installs k3s and needs a forward to guest 6443 to -// extract a working kubeconfig. -func e2eUniqueForwards(apiPort string) []qemu.PortForward { - return []qemu.PortForward{{Host: apiPort, Guest: "6443"}} +// with another e2e test running on the same machine. Two forwards: +// +// - apiPort -> guest 6443: required for Provision to extract a +// working kubeconfig from the booted VM's k3s API. +// - httpPort -> guest 80: required so any setup script that pokes +// the gateway's HTTP listener (e.g. `curl 127.0.0.1:/...` +// against an HTTPRoute / GRPCRoute the test installs) reaches +// the VM. Several Yolean dev scripts assume this forward exists. 
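+//
+// For example, a test calling e2eUniqueForwards("26443", "28443")
+// can reach the guest's k3s API at 127.0.0.1:26443 and the
+// gateway's HTTP listener at 127.0.0.1:28443 from the host.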
+func e2eUniqueForwards(apiPort, httpPort string) []qemu.PortForward { + return []qemu.PortForward{ + {Host: apiPort, Guest: "6443"}, + {Host: httpPort, Guest: "80"}, + } } func TestQemu_ProvisionTeardown(t *testing.T) { @@ -88,7 +96,7 @@ func TestQemu_ProvisionTeardown(t *testing.T) { cfg.Memory = "4096" cfg.CPUs = "2" cfg.SSHPort = "2223" // avoid conflict with real cluster on 2222 - cfg.PortForwards = e2eUniqueForwards("26443") + cfg.PortForwards = e2eUniqueForwards("26443", "28443") cfg.Kubeconfig = os.Getenv("KUBECONFIG") if cfg.Kubeconfig == "" { t.Skip("KUBECONFIG must be set") @@ -186,7 +194,7 @@ func TestQemu_TeardownKeepDisk(t *testing.T) { cfg.Memory = "4096" cfg.CPUs = "2" cfg.SSHPort = "2225" - cfg.PortForwards = e2eUniqueForwards("26444") + cfg.PortForwards = e2eUniqueForwards("26444", "28444") cfg.Kubeconfig = os.Getenv("KUBECONFIG") if cfg.Kubeconfig == "" { t.Skip("KUBECONFIG must be set") @@ -223,7 +231,7 @@ func TestQemu_ExportImport(t *testing.T) { cfg.Memory = "4096" cfg.CPUs = "2" cfg.SSHPort = "2224" - cfg.PortForwards = e2eUniqueForwards("26445") + cfg.PortForwards = e2eUniqueForwards("26445", "28445") cfg.Kubeconfig = os.Getenv("KUBECONFIG") if cfg.Kubeconfig == "" { t.Skip("KUBECONFIG must be set") @@ -313,7 +321,7 @@ func TestQemu_StopStart(t *testing.T) { cfg.Memory = "4096" cfg.CPUs = "2" cfg.SSHPort = "2226" - cfg.PortForwards = e2eUniqueForwards("26446") + cfg.PortForwards = e2eUniqueForwards("26446", "28446") cfg.Kubeconfig = os.Getenv("KUBECONFIG") if cfg.Kubeconfig == "" { t.Skip("KUBECONFIG must be set") @@ -412,7 +420,7 @@ func TestQemu_Seed_GateAndBypass(t *testing.T) { cfg.Memory = "4096" cfg.CPUs = "2" cfg.SSHPort = "2227" - cfg.PortForwards = e2eUniqueForwards("26447") + cfg.PortForwards = e2eUniqueForwards("26447", "28447") cfg.Kubeconfig = os.Getenv("KUBECONFIG") if cfg.Kubeconfig == "" { t.Skip("KUBECONFIG must be set") From 8a878bf35fe10b8d98ed48c22aabf820414b318e Mon Sep 17 00:00:00 2001 From: Yolean k8s-qa Date: Thu, 7 May 2026 09:43:03 +0000 Subject: [PATCH 3/7] test(e2e/qemu): bump appliance VM/disk size to 40G (avoid disk-pressure flakes) --- e2e/qemu_test.go | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/e2e/qemu_test.go b/e2e/qemu_test.go index 4323b2e..5c04473 100644 --- a/e2e/qemu_test.go +++ b/e2e/qemu_test.go @@ -58,9 +58,18 @@ func assertPidGone(t *testing.T, pid int) { // defaulted config.QEMUConfig. Tests then override individual fields // to keep ports / cache dirs / contexts isolated from a developer's // real cluster on the same host. +// +// DiskSize is bumped from the 20G default to 40G: appliance e2e +// flows install workloads, build a seed tarball, prepare-export, +// and re-boot from the prepared disk -- the cumulative footprint +// pushes the 20G disk into pressure on the kubelet's image-gc +// thresholds, which surfaces as flaky pod evictions mid-test. +// 40G is well clear of that ceiling and the qcow2 sparse layout +// means the host-disk footprint only grows with actual usage. func e2eQEMURuntime() qemu.Config { c := &config.QEMUConfig{CommonConfig: config.CommonConfig{Provider: config.ProviderQEMU}} c.ApplyDefaults() + c.DiskSize = "40G" return qemu.FromConfig(c) } From a81c141c0f4630f35a0d744127b363eb0884001a Mon Sep 17 00:00:00 2001 From: Yolean k8s-qa Date: Tue, 5 May 2026 03:53:48 +0000 Subject: [PATCH 4/7] feat(yconverge): kind: "gateway" first-class HTTP check Adds a typed gateway check alongside wait / rollout / exec. 
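For orientation, the canonical authored check (the cue_test fixture
later in this patch) round-trips into the following Go value -- a
sketch using only the Check fields this patch adds, not additional
API:

    check := yconverge.Check{
        Kind:           "gateway",
        URL:            "http://dev.yolean.net/",
        ExpectCode:     []int{302},
        ExpectLocation: `^http://dev\.yolean\.net/auth/realms/[^/]+/.*`,
        Timeout:        "120s",
        Description:    "dev.yolean.net unauth -> oauth2 redirect",
    }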
Solves the false-positive class that motivated
specs/y-cluster/FEATURE_REQUEST_HTTP_CHECK.md: an
`exec curl ... | grep 302` silently passes against the wrong
cluster when /etc/hosts / upstream DNS resolves the configured
hostname to a remote IP that also happens to redirect.

Why "gateway" and not "http"?

The check's value comes from auto-discovering the local Gateway's
programmed address (Gateway.status.addresses) and pinning curl's
dial target via --resolve, so the request actually traverses
Gateway -> HTTPRoute -> backend instead of whatever the host's
resolver thinks the hostname means. Calling that "http" would
invite authors to expect a generic curl wrapper. Naming it
"gateway" makes the contract explicit and lets the runtime stay
opinionated.

How it runs

- discoverGatewayAddress shells out to
  `kubectl get gateway -A -o json`, filters by gatewayClassName
  (or accepts any if empty), picks the first programmed
  Status.Addresses[].Value.
- runGatewayProbe launches `kubectl run --rm --restart=Never
  --image=curlimages/curl:8.10.1` with curl args:

      --resolve <host>:<port>:<addr>
      -H "Host: <host>"
      -w 'HTTP_CODE:%{http_code}\nLOCATION:%{redirect_url}\n'

  The Pod runs in the cluster's network so cluster DNS / Service
  routing reflect the real consumer-side path. --resolve forces
  the dial target so the probe can't accidentally hit prod even
  if upstream DNS resolves the hostname to a public IP.
- parseGatewayProbeOutput + validateGatewayProbeResult are pure
  functions on the curl -w output; unit tests cover them without
  spinning up kubectl.

Cue surface

#Gateway lands as the fourth variant of #Check:

    kind: "gateway"
    url: string
    expectCode: *[200] | [...int]
    expectLocation?: string
    resolve?: string
    gatewayClassName?: string
    timeout: *"60s" | string
    description: *"" | string

expectCode is always a list -- single-status callers write
expectCode: [302]. expectLocation is a Go regexp matched against
the response Location header (the canonical use case: pin the
oauth2 redirect realm).

Coverage

- parseGatewayProbeOutput: happy path, missing code, malformed,
  no-Location.
- validateGatewayProbeResult: default code, list match, regex
  match, regex compile failure.
- splitURLHostPort: http/https default ports, explicit port,
  IPv6, missing host, unsupported scheme.
- pickGatewayAddress: class-narrowed, any-class, no-match,
  none-programmed.
- ParseChecks_GatewayCheck: cue -> Go round-trip with the
  canonical 302 + Location-regex shape.
- ParseChecks_GatewayCheck_DefaultCode: omitted expectCode
  yields [200].

Deferred to follow-ups

- method / headers / expectBody / tlsInsecure (canonical
  redirect reproducer doesn't need them).
- e2e against a real cluster (would need an envoy-gateway lane
  in the e2e suite).
- Migrating existing exec-curl-grep checks in checkit /
  cluster-local to kind: "gateway".

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 pkg/yconverge/checks.go                        |  63 ++++
 pkg/yconverge/cue_test.go                      |  88 +++++-
 pkg/yconverge/gateway.go                       | 275 ++++++++++++++++++
 pkg/yconverge/gateway_test.go                  | 245 ++++++++++++++++
 .../ystack/yconverge/verify/schema.cue         |  37 ++-
 5 files changed, 705 insertions(+), 3 deletions(-)
 create mode 100644 pkg/yconverge/gateway.go
 create mode 100644 pkg/yconverge/gateway_test.go

diff --git a/pkg/yconverge/checks.go b/pkg/yconverge/checks.go
index 2b82eab..187c34f 100644
--- a/pkg/yconverge/checks.go
+++ b/pkg/yconverge/checks.go
@@ -14,6 +14,11 @@ import (
 )
 
 // Check represents a single post-apply verification step.
+//
+// Fields are a flat union: each kind reads only the subset it
+// understands.
The CUE schema (cue.mod/.../verify/schema.cue) +// enforces which fields belong to which kind via #Wait / +// #Rollout / #Exec / #Gateway. type Check struct { Kind string `json:"kind"` Resource string `json:"resource,omitempty"` @@ -22,6 +27,15 @@ type Check struct { Timeout string `json:"timeout,omitempty"` Command string `json:"command,omitempty"` Description string `json:"description,omitempty"` + + // Gateway-only fields. URL is required; everything else has a + // documented default. See pkg/yconverge/gateway.go for the + // dial / discovery / validation semantics. + URL string `json:"url,omitempty"` + ExpectCode []int `json:"expectCode,omitempty"` + ExpectLocation string `json:"expectLocation,omitempty"` + Resolve string `json:"resolve,omitempty"` + GatewayClassName string `json:"gatewayClassName,omitempty"` } // DefaultTimeout is used when a check does not specify a timeout. @@ -92,11 +106,60 @@ func (r *CheckRunner) runOne(ctx context.Context, check Check) error { return r.runRollout(ctx, check, ns, timeout) case "exec": return r.runExec(ctx, check, timeout) + case "gateway": + return r.runGateway(ctx, check, timeout) default: return fmt.Errorf("unknown check kind: %q", check.Kind) } } +// runGateway executes a `kind: "gateway"` check: discover the +// Gateway address, launch an in-cluster curl Pod with --resolve +// pinned to that address, validate the response code and (when +// configured) Location header. Retries on failure until timeout +// using the same 2s interval as runExec, since the common +// transient failure modes (Gateway not yet programmed, HTTPRoute +// not yet reconciled, backend not yet Ready) all resolve in +// seconds. +func (r *CheckRunner) runGateway(ctx context.Context, check Check, timeout time.Duration) error { + if check.URL == "" { + return fmt.Errorf("gateway check: url is required") + } + desc := check.Description + if desc == "" { + desc = fmt.Sprintf("gateway %s", check.URL) + } + r.Logger.Debug("check", + zap.String("kind", "gateway"), + zap.String("url", check.URL), + zap.String("description", desc), + ) + opts := gatewayProbeOpts{ + URL: check.URL, + ExpectCodes: check.ExpectCode, + ExpectLocation: check.ExpectLocation, + Resolve: check.Resolve, + GatewayClassName: check.GatewayClassName, + } + deadline := time.Now().Add(timeout) + var lastErr error + for { + if err := runGatewayProbe(ctx, r.Context, opts); err == nil { + return nil + } else { + lastErr = err + } + if time.Now().After(deadline) { + return fmt.Errorf("gateway check timed out after %s: %w", timeout, lastErr) + } + select { + case <-ctx.Done(): + return ctx.Err() + case <-time.After(2 * time.Second): + } + } +} + func (r *CheckRunner) runWait(ctx context.Context, check Check, ns string, timeout time.Duration) error { desc := check.Description if desc == "" { diff --git a/pkg/yconverge/cue_test.go b/pkg/yconverge/cue_test.go index 42e1959..e08b0c1 100644 --- a/pkg/yconverge/cue_test.go +++ b/pkg/yconverge/cue_test.go @@ -123,7 +123,7 @@ language: version: "v0.16.0" checks: [...#Check] } -#Check: #Wait | #Rollout | #Exec +#Check: #Wait | #Rollout | #Exec | #Gateway #Wait: { kind: "wait" @@ -148,6 +148,17 @@ language: version: "v0.16.0" timeout: *"60s" | string description: string } + +#Gateway: { + kind: "gateway" + url: string + expectCode: *[200] | [...int] + expectLocation?: string + resolve?: string + gatewayClassName?: string + timeout: *"60s" | string + description: *"" | string +} `) return root } @@ -248,6 +259,81 @@ step: verify.#Step & { } } +// TestParseChecks_GatewayCheck 
pins the cue->Go round-trip for the +// canonical gateway-check shape: 302 + Location regex pinning the +// oauth redirect target. Verifies the new fields (URL, ExpectCode, +// ExpectLocation) survive the JSON marshal in ParseChecks. +func TestParseChecks_GatewayCheck(t *testing.T) { + root := setupCueModule(t) + writeFile(t, filepath.Join(root, "base/yconverge.cue"), `package base + +import "yolean.se/ystack/yconverge/verify" + +step: verify.#Step & { + checks: [{ + kind: "gateway" + url: "http://dev.yolean.net/" + expectCode: [302] + expectLocation: "^http://dev\\.yolean\\.net/auth/realms/[^/]+/.*" + timeout: "120s" + description: "dev.yolean.net unauth -> oauth2 redirect" + }] +} +`) + checks, err := ParseChecks(filepath.Join(root, "base")) + if err != nil { + t.Fatal(err) + } + if len(checks) != 1 { + t.Fatalf("expected 1 check, got %d", len(checks)) + } + c := checks[0] + if c.Kind != "gateway" { + t.Errorf("Kind: %q, want gateway", c.Kind) + } + if c.URL != "http://dev.yolean.net/" { + t.Errorf("URL: %q", c.URL) + } + if len(c.ExpectCode) != 1 || c.ExpectCode[0] != 302 { + t.Errorf("ExpectCode: %v, want [302]", c.ExpectCode) + } + if c.ExpectLocation != `^http://dev\.yolean\.net/auth/realms/[^/]+/.*` { + t.Errorf("ExpectLocation: %q", c.ExpectLocation) + } + if c.Timeout != "120s" { + t.Errorf("Timeout: %q", c.Timeout) + } +} + +// TestParseChecks_GatewayCheck_DefaultCode pins the schema's +// expectCode default: omitting the field yields [200] in Go. +// Authors writing a 200-only probe don't need to specify the code +// explicitly. +func TestParseChecks_GatewayCheck_DefaultCode(t *testing.T) { + root := setupCueModule(t) + writeFile(t, filepath.Join(root, "base/yconverge.cue"), `package base + +import "yolean.se/ystack/yconverge/verify" + +step: verify.#Step & { + checks: [{ + kind: "gateway" + url: "http://echo.example/health" + }] +} +`) + checks, err := ParseChecks(filepath.Join(root, "base")) + if err != nil { + t.Fatal(err) + } + if len(checks) != 1 { + t.Fatalf("expected 1 check, got %d", len(checks)) + } + if len(checks[0].ExpectCode) != 1 || checks[0].ExpectCode[0] != 200 { + t.Errorf("ExpectCode default: %v, want [200]", checks[0].ExpectCode) + } +} + func TestParseChecks_MultipleChecks(t *testing.T) { root := setupCueModule(t) writeFile(t, filepath.Join(root, "base/yconverge.cue"), `package base diff --git a/pkg/yconverge/gateway.go b/pkg/yconverge/gateway.go new file mode 100644 index 0000000..1f09528 --- /dev/null +++ b/pkg/yconverge/gateway.go @@ -0,0 +1,275 @@ +package yconverge + +import ( + "context" + "encoding/json" + "fmt" + "net/url" + "os/exec" + "regexp" + "strconv" + "strings" + "time" +) + +// curlImage is the ephemeral probe Pod's image. Pinned by tag for +// reproducibility; can be overridden via opts.Image (used by tests +// to point at a local image and skip the pull). +const curlImage = "curlimages/curl:8.10.1" + +// gatewayProbeOpts captures the runtime shape of a `kind: "gateway"` +// check. Pulled out as a value type so the validation half can be +// unit-tested without shelling out to kubectl. +type gatewayProbeOpts struct { + // URL is the request target including scheme, host, optional + // port, and path. Host header for the request is derived from + // URL.Host so an HTTPRoute matching by Host actually fires. + URL string + // ExpectCodes, if non-empty, lists the response status codes + // that pass the check. Empty defaults to {200}. 
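+	// For example, ExpectCodes: []int{302} paired with an
+	// ExpectLocation regexp is the canonical oauth2-redirect probe.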
+ ExpectCodes []int + // ExpectLocation, if non-empty, is a Go regexp that must match + // the Location response header. Pairs with 3xx ExpectCodes; + // silently passes against responses with no Location. + ExpectLocation string + // Resolve, if non-empty, is the dial target IP for the URL's + // host:port. Bypasses Gateway address discovery. + Resolve string + // GatewayClassName narrows discovery to Gateways of this class. + // Empty means: pick from the only Gateway present (errors if + // multiple distinct class names exist). + GatewayClassName string + // Image overrides curlImage; for tests. + Image string +} + +// gatewayProbeResult is the parsed curl response: just the bits we +// validate against. Body capture is deferred (cap + regex match +// would land here as a follow-up). +type gatewayProbeResult struct { + HTTPCode int + Location string +} + +// parseGatewayProbeOutput parses the `-w` template our curl +// invocation emits: +// +// HTTP_CODE: +// LOCATION: +// +// Returns an error if HTTP_CODE is missing -- a probe that didn't +// produce one is a probe that didn't reach the server, and the +// caller surfaces that as a probe failure. +func parseGatewayProbeOutput(s string) (*gatewayProbeResult, error) { + r := &gatewayProbeResult{} + seenCode := false + for _, line := range strings.Split(strings.TrimSpace(s), "\n") { + switch { + case strings.HasPrefix(line, "HTTP_CODE:"): + v := strings.TrimPrefix(line, "HTTP_CODE:") + n, err := strconv.Atoi(strings.TrimSpace(v)) + if err != nil { + return nil, fmt.Errorf("parse HTTP_CODE %q: %w", v, err) + } + r.HTTPCode = n + seenCode = true + case strings.HasPrefix(line, "LOCATION:"): + r.Location = strings.TrimPrefix(line, "LOCATION:") + } + } + if !seenCode { + return nil, fmt.Errorf("no HTTP_CODE in probe output:\n%s", s) + } + return r, nil +} + +// validateGatewayProbeResult applies opts' expectations to r. +// Returns nil on a fully-passing probe. +func validateGatewayProbeResult(opts gatewayProbeOpts, r *gatewayProbeResult) error { + codes := opts.ExpectCodes + if len(codes) == 0 { + codes = []int{200} + } + matched := false + for _, c := range codes { + if r.HTTPCode == c { + matched = true + break + } + } + if !matched { + return fmt.Errorf("expected status %v, got %d", codes, r.HTTPCode) + } + if opts.ExpectLocation != "" { + re, err := regexp.Compile(opts.ExpectLocation) + if err != nil { + return fmt.Errorf("invalid expectLocation regex %q: %w", opts.ExpectLocation, err) + } + if !re.MatchString(r.Location) { + return fmt.Errorf("expected Location to match %q, got %q", opts.ExpectLocation, r.Location) + } + } + return nil +} + +// splitURLHostPort extracts host + dial port from a URL. http +// defaults to 80, https to 443. Other schemes are an error since +// curl --resolve needs an explicit port. +func splitURLHostPort(rawURL string) (host, port string, err error) { + u, err := url.Parse(rawURL) + if err != nil { + return "", "", fmt.Errorf("parse url %q: %w", rawURL, err) + } + host = u.Hostname() + if host == "" { + return "", "", fmt.Errorf("url %q has no host", rawURL) + } + port = u.Port() + if port == "" { + switch u.Scheme { + case "http": + port = "80" + case "https": + port = "443" + default: + return "", "", fmt.Errorf("url %q has no port and unsupported scheme %q", rawURL, u.Scheme) + } + } + return host, port, nil +} + +// gatewayInfo is the slice of `kubectl get gateway -A -o json` +// output we care about: namespace + name (for diagnostics), the +// configured class, and the controller-reported addresses. 
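+//
+// One item, trimmed to the fields below (values illustrative, not
+// taken from a real cluster):
+//
+//	{"metadata": {"name": "default", "namespace": "gateway-system"},
+//	 "spec": {"gatewayClassName": "y-cluster"},
+//	 "status": {"addresses": [{"type": "IPAddress", "value": "10.0.2.15"}]}}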
+type gatewayInfo struct { + Metadata struct { + Name string `json:"name"` + Namespace string `json:"namespace"` + } `json:"metadata"` + Spec struct { + GatewayClassName string `json:"gatewayClassName"` + } `json:"spec"` + Status struct { + Addresses []struct { + Type string `json:"type"` + Value string `json:"value"` + } `json:"addresses"` + } `json:"status"` +} + +// pickGatewayAddress applies the className filter to a list of +// Gateway objects and returns the first programmed address. Pulled +// out as a pure function so the discovery wrapper stays a thin +// kubectl-out shellout that can be mocked in tests. +// +// Behaviour: +// +// className != "" -> only Gateways with that class match +// className == "" -> all Gateways are candidates +// no candidate has an +// address yet -> ("", nil) caller retries +// one or more candidates +// with addresses -> first non-empty Status.Addresses[i].Value +func pickGatewayAddress(items []gatewayInfo, className string) string { + for _, g := range items { + if className != "" && g.Spec.GatewayClassName != className { + continue + } + for _, a := range g.Status.Addresses { + if a.Value != "" { + return a.Value + } + } + } + return "" +} + +// discoverGatewayAddress walks the cluster's Gateways and returns +// the first programmed address matching opts.GatewayClassName (or +// the first programmed Gateway in any class when empty). Returns +// "" + nil error when no programmed Gateway exists yet -- the +// caller's retry-until-timeout loop catches that as transient. +func discoverGatewayAddress(ctx context.Context, contextName, className string) (string, error) { + cmd := exec.CommandContext(ctx, "kubectl", + "--context="+contextName, + "get", "gateway", "-A", + "-o", "json", + ) + out, err := cmd.Output() + if err != nil { + return "", fmt.Errorf("kubectl get gateway: %w", err) + } + var list struct { + Items []gatewayInfo `json:"items"` + } + if err := json.Unmarshal(out, &list); err != nil { + return "", fmt.Errorf("parse gateway list: %w", err) + } + return pickGatewayAddress(list.Items, className), nil +} + +// runGatewayProbe is a single probe attempt: discover + dial + +// parse + validate. The retry-until-timeout shape lives in the +// caller (CheckRunner.runGateway) so the unit-testable surface +// here stays one round-trip. +func runGatewayProbe(ctx context.Context, contextName string, opts gatewayProbeOpts) error { + addr := opts.Resolve + if addr == "" { + var err error + addr, err = discoverGatewayAddress(ctx, contextName, opts.GatewayClassName) + if err != nil { + return fmt.Errorf("discover Gateway address: %w", err) + } + if addr == "" { + cn := opts.GatewayClassName + if cn == "" { + cn = "(any)" + } + return fmt.Errorf("no Gateway in class %s has a programmed address yet", cn) + } + } + host, port, err := splitURLHostPort(opts.URL) + if err != nil { + return err + } + + image := opts.Image + if image == "" { + image = curlImage + } + // Pod name uses ns-suffix random for collision-resistance under + // a fast-retry loop; --restart=Never + --rm tears the Pod down + // at exit. -- separates kubectl-run flags from the curl argv. 
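+	// For illustration (addresses hypothetical): with url
+	// http://dev.yolean.net/ and a discovered Gateway address of
+	// 10.43.0.10, the assembled command is roughly
+	//
+	//	kubectl --context=<ctx> run yconverge-probe-<nanos> \
+	//	  --restart=Never --rm -i --image=curlimages/curl:8.10.1 --quiet \
+	//	  --command -- curl -sS --max-time 10 -o /dev/null \
+	//	  -w 'HTTP_CODE:%{http_code}\nLOCATION:%{redirect_url}\n' \
+	//	  --resolve dev.yolean.net:80:10.43.0.10 \
+	//	  -H 'Host: dev.yolean.net' http://dev.yolean.net/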
+ podName := fmt.Sprintf("yconverge-probe-%d", time.Now().UnixNano()) + curlArgs := []string{ + "-sS", "--max-time", "10", + "-o", "/dev/null", + "-w", "HTTP_CODE:%{http_code}\nLOCATION:%{redirect_url}\n", + "--resolve", host + ":" + port + ":" + addr, + "-H", "Host: " + host, + opts.URL, + } + args := append([]string{ + "--context=" + contextName, + "run", podName, + "--restart=Never", + "--rm", "-i", + "--image=" + image, + "--quiet", + "--command", "--", + "curl", + }, curlArgs...) + cmd := exec.CommandContext(ctx, "kubectl", args...) + var stdout, stderr strings.Builder + cmd.Stdout = &stdout + cmd.Stderr = &stderr + if err := cmd.Run(); err != nil { + return fmt.Errorf("probe pod %s: %w (stdout: %q stderr: %q)", + podName, err, stdout.String(), stderr.String()) + } + result, err := parseGatewayProbeOutput(stdout.String()) + if err != nil { + return err + } + return validateGatewayProbeResult(opts, result) +} diff --git a/pkg/yconverge/gateway_test.go b/pkg/yconverge/gateway_test.go new file mode 100644 index 0000000..0c9437e --- /dev/null +++ b/pkg/yconverge/gateway_test.go @@ -0,0 +1,245 @@ +package yconverge + +import ( + "strings" + "testing" +) + +// TestParseGatewayProbeOutput_HappyPath pins the curl -w shape we +// emit. The probe code reads HTTP_CODE and LOCATION lines; any +// surrounding noise (kubectl run banners, etc.) is ignored. +func TestParseGatewayProbeOutput_HappyPath(t *testing.T) { + in := "HTTP_CODE:302\nLOCATION:http://dev.yolean.net/auth/realms/dev/openid?x=1\n" + got, err := parseGatewayProbeOutput(in) + if err != nil { + t.Fatalf("parse: %v", err) + } + if got.HTTPCode != 302 { + t.Errorf("HTTPCode: %d, want 302", got.HTTPCode) + } + if got.Location != "http://dev.yolean.net/auth/realms/dev/openid?x=1" { + t.Errorf("Location: %q", got.Location) + } +} + +// TestParseGatewayProbeOutput_NoLocation: a 200 has no Location. +// The probe must accept that without erroring. +func TestParseGatewayProbeOutput_NoLocation(t *testing.T) { + got, err := parseGatewayProbeOutput("HTTP_CODE:200\nLOCATION:\n") + if err != nil { + t.Fatalf("parse: %v", err) + } + if got.HTTPCode != 200 { + t.Errorf("HTTPCode: %d", got.HTTPCode) + } + if got.Location != "" { + t.Errorf("Location: %q (want empty)", got.Location) + } +} + +// TestParseGatewayProbeOutput_MissingCode: a probe that printed +// nothing (or no HTTP_CODE) fails the parse so the caller surfaces +// the failure as "probe didn't reach the server" rather than +// silently passing. +func TestParseGatewayProbeOutput_MissingCode(t *testing.T) { + if _, err := parseGatewayProbeOutput("LOCATION:somewhere\n"); err == nil { + t.Fatal("expected error for output without HTTP_CODE") + } + if _, err := parseGatewayProbeOutput(""); err == nil { + t.Fatal("expected error for empty output") + } +} + +// TestParseGatewayProbeOutput_MalformedCode catches the case where +// curl printed something non-numeric where the http_code goes. +func TestParseGatewayProbeOutput_MalformedCode(t *testing.T) { + _, err := parseGatewayProbeOutput("HTTP_CODE:not-a-number\nLOCATION:\n") + if err == nil { + t.Fatal("expected parse error") + } + if !strings.Contains(err.Error(), "HTTP_CODE") { + t.Errorf("error should mention HTTP_CODE: %v", err) + } +} + +// TestValidateGatewayProbeResult_DefaultCode: empty ExpectCodes +// defaults to {200}. 
+func TestValidateGatewayProbeResult_DefaultCode(t *testing.T) { + if err := validateGatewayProbeResult(gatewayProbeOpts{}, + &gatewayProbeResult{HTTPCode: 200}); err != nil { + t.Errorf("default 200 should pass: %v", err) + } + if err := validateGatewayProbeResult(gatewayProbeOpts{}, + &gatewayProbeResult{HTTPCode: 500}); err == nil { + t.Errorf("500 should fail default-200 validation") + } +} + +// TestValidateGatewayProbeResult_CodeList: any of the listed +// codes passes; outside the list fails. +func TestValidateGatewayProbeResult_CodeList(t *testing.T) { + opts := gatewayProbeOpts{ExpectCodes: []int{200, 204, 302}} + for _, code := range []int{200, 204, 302} { + if err := validateGatewayProbeResult(opts, + &gatewayProbeResult{HTTPCode: code}); err != nil { + t.Errorf("code %d should pass: %v", code, err) + } + } + if err := validateGatewayProbeResult(opts, + &gatewayProbeResult{HTTPCode: 301}); err == nil { + t.Error("301 should fail [200,204,302] validation") + } +} + +// TestValidateGatewayProbeResult_LocationRegex covers the canonical +// reproducer: 302 status + Location regex pinning the redirect +// target. This is the false-positive class kind: "exec" with +// `curl | grep 302` could not catch. +func TestValidateGatewayProbeResult_LocationRegex(t *testing.T) { + opts := gatewayProbeOpts{ + ExpectCodes: []int{302}, + ExpectLocation: `^http://dev\.yolean\.net/auth/realms/[^/]+/protocol/openid-connect/auth\?.*`, + } + good := &gatewayProbeResult{ + HTTPCode: 302, + Location: "http://dev.yolean.net/auth/realms/dev/protocol/openid-connect/auth?response_type=code", + } + if err := validateGatewayProbeResult(opts, good); err != nil { + t.Errorf("expected pass: %v", err) + } + wrongRealm := &gatewayProbeResult{ + HTTPCode: 302, + Location: "https://login.example.com/oauth/authorize?...", + } + if err := validateGatewayProbeResult(opts, wrongRealm); err == nil { + t.Error("expected Location regex failure for wrong-realm redirect") + } +} + +// TestValidateGatewayProbeResult_InvalidLocationRegex pins the +// "regex compile fails" failure mode -- author error in the cue +// file should surface a clear message. +func TestValidateGatewayProbeResult_InvalidLocationRegex(t *testing.T) { + opts := gatewayProbeOpts{ + ExpectCodes: []int{302}, + ExpectLocation: `[unclosed`, + } + err := validateGatewayProbeResult(opts, &gatewayProbeResult{HTTPCode: 302, Location: "x"}) + if err == nil { + t.Fatal("expected error for malformed regex") + } + if !strings.Contains(err.Error(), "expectLocation") { + t.Errorf("error should mention expectLocation: %v", err) + } +} + +// TestSplitURLHostPort covers the four cases the dial-target needs: +// scheme-defaulted ports, explicit ports, missing host, unsupported +// scheme. 
+func TestSplitURLHostPort(t *testing.T) { + cases := []struct { + in string + wantHost string + wantPort string + wantErr bool + }{ + {"http://dev.yolean.net/", "dev.yolean.net", "80", false}, + {"https://keycloak-admin/auth/", "keycloak-admin", "443", false}, + {"http://blobs:9000/", "blobs", "9000", false}, + {"http://[::1]:8080/x", "::1", "8080", false}, + // Errors: + {"://no-scheme", "", "", true}, + {"ftp://example/", "", "", true}, // unsupported scheme without port + } + for _, c := range cases { + host, port, err := splitURLHostPort(c.in) + if (err != nil) != c.wantErr { + t.Errorf("%q: err=%v wantErr=%v", c.in, err, c.wantErr) + continue + } + if c.wantErr { + continue + } + if host != c.wantHost || port != c.wantPort { + t.Errorf("%q: got %s:%s, want %s:%s", c.in, host, port, c.wantHost, c.wantPort) + } + } +} + +// TestPickGatewayAddress_ClassMatch: with className narrowed, +// only Gateways of that class contribute, and we pick the first +// programmed address. +func TestPickGatewayAddress_ClassMatch(t *testing.T) { + items := []gatewayInfo{ + {}, // no spec class, no addresses -- skipped + } + items[0].Spec.GatewayClassName = "other-class" + items[0].Status.Addresses = []struct { + Type string `json:"type"` + Value string `json:"value"` + }{{Type: "IPAddress", Value: "10.0.0.1"}} + + wanted := gatewayInfo{} + wanted.Spec.GatewayClassName = "y-cluster" + wanted.Status.Addresses = []struct { + Type string `json:"type"` + Value string `json:"value"` + }{{Type: "IPAddress", Value: "10.0.2.15"}} + items = append(items, wanted) + + got := pickGatewayAddress(items, "y-cluster") + if got != "10.0.2.15" { + t.Errorf("got %q, want 10.0.2.15 (matched-class)", got) + } +} + +// TestPickGatewayAddress_AnyClass: empty className -> first +// programmed address wins regardless of class. +func TestPickGatewayAddress_AnyClass(t *testing.T) { + items := []gatewayInfo{{}, {}} + items[0].Spec.GatewayClassName = "first-class" + items[0].Status.Addresses = []struct { + Type string `json:"type"` + Value string `json:"value"` + }{{Type: "IPAddress", Value: "1.1.1.1"}} + items[1].Spec.GatewayClassName = "second-class" + items[1].Status.Addresses = []struct { + Type string `json:"type"` + Value string `json:"value"` + }{{Type: "IPAddress", Value: "2.2.2.2"}} + + got := pickGatewayAddress(items, "") + if got != "1.1.1.1" { + t.Errorf("got %q, want 1.1.1.1 (first programmed)", got) + } +} + +// TestPickGatewayAddress_NoneProgrammed: no Gateway has a +// non-empty status.addresses[].value -> "" so the caller's retry +// loop knows to wait. +func TestPickGatewayAddress_NoneProgrammed(t *testing.T) { + items := []gatewayInfo{{}} + items[0].Spec.GatewayClassName = "y-cluster" + if got := pickGatewayAddress(items, "y-cluster"); got != "" { + t.Errorf("got %q, want empty (none programmed)", got) + } + if got := pickGatewayAddress(items, ""); got != "" { + t.Errorf("got %q, want empty (none programmed, any class)", got) + } +} + +// TestPickGatewayAddress_ClassNoMatch: className narrows past +// every Gateway -> "" (no match). Distinguishable from +// "none programmed" only via the kubectl get's exit code, which +// the discoverGatewayAddress wrapper handles. 
+func TestPickGatewayAddress_ClassNoMatch(t *testing.T) { + items := []gatewayInfo{{}} + items[0].Spec.GatewayClassName = "actual-class" + items[0].Status.Addresses = []struct { + Type string `json:"type"` + Value string `json:"value"` + }{{Type: "IPAddress", Value: "10.0.0.1"}} + if got := pickGatewayAddress(items, "wrong-class"); got != "" { + t.Errorf("got %q, want empty (no class match)", got) + } +} diff --git a/testdata/cue.mod/pkg/yolean.se/ystack/yconverge/verify/schema.cue b/testdata/cue.mod/pkg/yolean.se/ystack/yconverge/verify/schema.cue index 2005544..ccb03a6 100644 --- a/testdata/cue.mod/pkg/yolean.se/ystack/yconverge/verify/schema.cue +++ b/testdata/cue.mod/pkg/yolean.se/ystack/yconverge/verify/schema.cue @@ -10,8 +10,10 @@ package verify } // Check is a discriminated union. Each variant maps to a kubectl -// subcommand that manages its own timeout and output. -#Check: #Wait | #Rollout | #Exec +// subcommand that manages its own timeout and output, or (for +// kind: "gateway") to an in-cluster ephemeral curl probe with +// auto-discovered Gateway address pinning. +#Check: #Wait | #Rollout | #Exec | #Gateway // Thin wrapper around kubectl wait. // Timeout and output are managed by kubectl. @@ -42,3 +44,34 @@ package verify timeout: *"60s" | string description: string } + +// HTTP probe through the cluster's Gateway. The runtime discovers +// the Gateway's programmed address (Gateway.status.addresses) for +// the configured class, launches an ephemeral in-cluster curl Pod +// with `--resolve ::` so the request +// actually traverses Gateway -> HTTPRoute -> backend (no DNS or +// /etc/hosts dependency on the host running yconverge). The +// engine retries until timeout. +// +// expectCode is always a list -- single-status callers write +// `expectCode: [302]`. Empty defaults to `[200]` at runtime. +// +// expectLocation is a Go regexp matched against the response +// Location header; useful for asserting "redirected to oauth, on +// the right realm" without the curl-grep false-positives that +// kind: exec is prone to. +#Gateway: { + kind: "gateway" + url: string + expectCode: *[200] | [...int] + expectLocation?: string + // Optional explicit override of the Gateway-address discovery + // (curl --resolve target IP). Empty -> auto-discover from + // Gateway.status.addresses. + resolve?: string + // Optional GatewayClass narrowing. Empty -> first programmed + // Gateway across all classes. + gatewayClassName?: string + timeout: *"60s" | string + description: *"" | string +} From 8721ba1ee8c6cb149e50109943e8cc8dd4801f3b Mon Sep 17 00:00:00 2001 From: Yolean k8s-qa Date: Tue, 5 May 2026 06:15:25 +0000 Subject: [PATCH 5/7] test(e2e/qemu): cover seed states 1 + 5 with a labeled data volume The original TestQemu_Seed_GateAndBypass exercised states 3 (gate fires when no volume is attached), 4 (bypass extracts), and 7 (sshd unaffected) -- the regression-posture half of the seed contract. The production happy path -- a labeled `y-cluster-data` volume attached at boot, the pre-baked LABEL fstab entry mounting it, and the upgrade fast path on the next boot -- was only covered manually via the GCP appliance run. The GCP test caught three real bugs (label mismatch, e2label idempotency, boot-disk-size constraint) that no other test caught, so the gap was real -- but GCP costs money + minutes per iteration. 
TestQemu_Seed_VolumeAttached closes that gap deterministically and locally: * Provision + plant sentinel + stop + prepare-export (~2 min, same shape as the existing TestQemu_Seed_GateAndBypass setup). * Build a labeled ext4 qcow2 via `qemu-img create` + `virt-format --filesystem=ext4 --label=y-cluster-data`. * Boot 1 (state 1): StartForDiagnosticWithDisks attaches the labeled disk, fstab mounts it, seed unit sees mountpoint + only lost+found + extracts. Assert /data/yolean is a mountpoint, sentinel restored, marker present, NO bypass sentinel, k3s reaches Ready via Requires=. * Boot 2 (state 5): same disk, marker now present from boot 1. Assert seed unit reaches active via the marker-respect path (journal mentions "marker present", does NOT mention "extracting"), sentinel unchanged, k3s reaches Ready. Supporting plumbing: * `qemu.StartForDiagnosticWithDisks(ctx, cacheDir, name, extraDisks, logger)` -- the diagnostic boot path with extra qcow2/raw drives appended after the boot+seed disks. The qemu provisioner deliberately doesn't manage data disks in production (the appliance contract is "customer attaches a labeled volume"), so this stays test-facing for now; the StartForDiagnostic non-Disks variant is unchanged. * `Cluster.extraDisks` is the unexported field threaded into `startVM`; production calls leave it nil. * Test helpers `makeLabeledDataDisk` (qemu-img + virt-format) and `waitForK3sReady` (in-VM kubectl polling, since diagnostic boots don't import the kubeconfig host-side). Runtime: ~3 min on a tuned local box, ~150 MB extra disk file in t.TempDir() (cleaned up at test exit). Skip-on-prereq for /dev/kvm, virt-customize, virt-format. Now states 1, 3, 4, 5, 7 of the seed matrix are deterministic-locally; states 2 and 6 stay unit-tested via the embedded shell script under pkg/provision/qemu. Co-Authored-By: Claude Opus 4.7 (1M context) --- e2e/qemu_test.go | 207 ++++++++++++++++++++++++++++++++ pkg/provision/qemu/lifecycle.go | 23 +++- pkg/provision/qemu/qemu.go | 7 ++ 3 files changed, 233 insertions(+), 4 deletions(-) diff --git a/e2e/qemu_test.go b/e2e/qemu_test.go index 5c04473..87dec52 100644 --- a/e2e/qemu_test.go +++ b/e2e/qemu_test.go @@ -565,6 +565,213 @@ func TestQemu_Seed_GateAndBypass(t *testing.T) { t.Fatalf("k3s never reached Ready after bypass+restart\nk3s journal tail:\n%s", out) } +// TestQemu_Seed_VolumeAttached exercises the production-shape happy +// path that TestQemu_Seed_GateAndBypass deliberately doesn't: +// +// * State 1 -- a labeled `y-cluster-data` ext4 volume is attached +// at boot, the pre-baked LABEL fstab entry mounts it, the seed +// unit sees a mountpoint with only lost+found, extracts the +// seed tarball, writes the marker, k3s starts via Requires=. +// * State 5 -- the same disk on the next boot has a marker; the +// seed unit hits the marker-respect no-op path; k3s starts +// without re-extract. +// +// Combined into one test function so we pay the provision + +// prepare-export cost once and stop / re-boot the same prepared +// disk twice. Without this coverage state 1 + 5 are only exercised +// by the manual GCP run, where a single flaky symptom is +// expensive to reproduce -- the local form takes ~3 min and is +// deterministic. 
+func TestQemu_Seed_VolumeAttached(t *testing.T) { + if _, err := os.Stat("/dev/kvm"); err != nil { + t.Skip("QEMU tests require /dev/kvm") + } + if err := qemu.CheckPrerequisites(); err != nil { + t.Skip(err) + } + if _, err := exec.LookPath("virt-customize"); err != nil { + t.Skip("virt-customize not on PATH; install libguestfs-tools") + } + if _, err := exec.LookPath("virt-format"); err != nil { + t.Skip("virt-format not on PATH; install libguestfs-tools") + } + + logger, _ := zap.NewDevelopment() + cfg := e2eQEMURuntime() + cfg.Name = "y-cluster-e2e-seed-volume" + cfg.Context = "y-cluster-e2e-seed-volume" + cfg.CacheDir = t.TempDir() + cfg.Memory = "4096" + cfg.CPUs = "2" + cfg.SSHPort = "2228" + cfg.PortForwards = e2eUniqueForwards("26448", "28448") + cfg.Kubeconfig = os.Getenv("KUBECONFIG") + if cfg.Kubeconfig == "" { + t.Skip("KUBECONFIG must be set") + } + t.Setenv("Y_CLUSTER_QEMU_CACHE_DIR", cfg.CacheDir) + + ctx := context.Background() + + // Build the appliance + plant a sentinel under /data/yolean so + // state 1's extract has something verifiable when we read back + // the customer-side mount. + cluster, err := qemu.Provision(ctx, cfg, logger) + if err != nil { + t.Fatal(err) + } + t.Cleanup(func() { _ = cluster.Teardown(false) }) + + if out, err := cluster.SSH(ctx, "sudo mkdir -p /data/yolean && echo seed-volume-v1 | sudo tee /data/yolean/sentinel.txt >/dev/null"); err != nil { + t.Fatalf("plant sentinel: %s: %v", out, err) + } + + if err := qemu.Stop(cfg.CacheDir, cfg.Name, logger); err != nil { + t.Fatalf("Stop: %v", err) + } + if err := qemu.PrepareExport(ctx, cfg.CacheDir, cfg.Name, logger); err != nil { + t.Fatalf("PrepareExport: %v", err) + } + + // Build a labeled ext4 qcow2 to act as the customer's persistent + // /data/yolean volume. Filesystem label matches the LABEL fstab + // entry prepare-export pre-baked into the appliance. + dataDisk := filepath.Join(cfg.CacheDir, cfg.Name+"-data.qcow2") + makeLabeledDataDisk(t, dataDisk, "y-cluster-data", "1G") + + // === Boot 1: state 1 -- empty volume, extract === + cluster1, err := qemu.StartForDiagnosticWithDisks(ctx, cfg.CacheDir, cfg.Name, []string{dataDisk}, logger) + if err != nil { + t.Fatalf("StartForDiagnosticWithDisks (boot 1): %v", err) + } + + // Seed unit must reach `active` once it sees the mountpoint + + // empty mount + extracts. + if state := waitForSeedState(t, ctx, cluster1, "active", 90*time.Second); state != "active" { + out, _ := cluster1.SSH(ctx, "sudo journalctl -u y-cluster-data-seed.service -b --no-pager") + t.Fatalf("seed unit never reached active on first boot; last=%q\njournal:\n%s", state, out) + } + + // /data/yolean is the labeled volume now (not the boot disk). + if out, err := cluster1.SSH(ctx, "mountpoint -q /data/yolean && echo mounted"); err != nil { + t.Fatalf("mountpoint check: %s: %v", out, err) + } else if !strings.Contains(string(out), "mounted") { + t.Errorf("expected /data/yolean to be a mountpoint, got: %s", out) + } + + // Sentinel restored from the seed tarball. + if out, err := cluster1.SSH(ctx, "cat /data/yolean/sentinel.txt"); err != nil { + t.Fatalf("sentinel read: %v", err) + } else if !strings.Contains(string(out), "seed-volume-v1") { + t.Errorf("seed extract did not restore sentinel; got: %s", out) + } + + // Marker present on the customer volume. 
+ if out, err := cluster1.SSH(ctx, "sudo cat /data/yolean/.y-cluster-seeded"); err != nil { + t.Fatalf("marker read: %v", err) + } else if !strings.Contains(string(out), "seed_sha256") { + t.Errorf("marker should contain seed_sha256: %s", out) + } + + // Bypass sentinel must NOT exist -- we went the production path, + // not the bypass path. This distinguishes state 1 from state 4. + if out, err := cluster1.SSH(ctx, "test -f /data/yolean/.y-cluster-seeded-via-bypass && echo present || echo absent"); err != nil { + t.Fatalf("bypass-sentinel check: %v", err) + } else if !strings.Contains(string(out), "absent") { + t.Errorf("bypass sentinel should not exist on a state-1 boot: %s", out) + } + + // k3s should come up via Requires=y-cluster-data-seed.service + // without any manual restart, since the seed unit is `active`. + if !waitForK3sReady(t, ctx, cluster1, 3*time.Minute) { + out, _ := cluster1.SSH(ctx, "sudo journalctl -u k3s.service -b --no-pager | tail -50") + t.Fatalf("k3s never reached Ready on first boot\nk3s journal tail:\n%s", out) + } + + // === Boot 2: state 5 -- marker present, no-op === + if err := qemu.Stop(cfg.CacheDir, cfg.Name, logger); err != nil { + t.Fatalf("Stop between boots: %v", err) + } + cluster2, err := qemu.StartForDiagnosticWithDisks(ctx, cfg.CacheDir, cfg.Name, []string{dataDisk}, logger) + if err != nil { + t.Fatalf("StartForDiagnosticWithDisks (boot 2): %v", err) + } + + if state := waitForSeedState(t, ctx, cluster2, "active", 60*time.Second); state != "active" { + out, _ := cluster2.SSH(ctx, "sudo journalctl -u y-cluster-data-seed.service -b --no-pager") + t.Fatalf("seed unit not active on second boot; last=%q\njournal:\n%s", state, out) + } + + // Journal must indicate the marker-respect no-op path -- not the + // extract path -- otherwise we'd be silently re-extracting on + // every boot, which would clobber any customer changes. + journalOut, err := cluster2.SSH(ctx, "sudo journalctl -u y-cluster-data-seed.service -b --no-pager") + if err != nil { + t.Fatalf("journalctl on boot 2: %v", err) + } + if !strings.Contains(string(journalOut), "marker present") { + t.Errorf("boot 2 should hit the marker-respect path; journal:\n%s", journalOut) + } + if strings.Contains(string(journalOut), "extracting") { + t.Errorf("boot 2 should NOT re-extract; journal mentions extracting:\n%s", journalOut) + } + + // Sentinel content unchanged across the two boots (i.e., we did + // not silently re-extract over customer state). + if out, err := cluster2.SSH(ctx, "cat /data/yolean/sentinel.txt"); err != nil { + t.Fatalf("sentinel read on boot 2: %v", err) + } else if !strings.Contains(string(out), "seed-volume-v1") { + t.Errorf("sentinel mutated across boots: %s", out) + } + + if !waitForK3sReady(t, ctx, cluster2, 3*time.Minute) { + out, _ := cluster2.SSH(ctx, "sudo journalctl -u k3s.service -b --no-pager | tail -50") + t.Fatalf("k3s never reached Ready on boot 2\nk3s journal tail:\n%s", out) + } +} + +// makeLabeledDataDisk creates a qcow2 file at path with a single +// ext4 filesystem labeled `label`, sized `size` (a qemu-img-style +// string like "1G"). Uses libguestfs's virt-format so the test +// doesn't need root + losetup; libguestfs is already a hard prereq +// of prepare-export, so anything that runs the rest of this file +// has it. 
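+//
+// Roughly the by-hand equivalent (file name illustrative):
+//
+//	qemu-img create -f qcow2 data.qcow2 1G
+//	virt-format -a data.qcow2 --filesystem=ext4 --label=y-cluster-data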
+func makeLabeledDataDisk(t *testing.T, path, label, size string) { + t.Helper() + if out, err := exec.Command("qemu-img", "create", "-f", "qcow2", path, size).CombinedOutput(); err != nil { + t.Fatalf("qemu-img create %s: %s: %v", path, out, err) + } + // virt-format with --filesystem makes the WHOLE disk one ext4 + // filesystem (no partition table). The kernel + LABEL fstab + // match by filesystem label regardless of partitioning, so this + // is the simplest shape that satisfies the appliance contract. + if out, err := exec.Command("virt-format", + "-a", path, + "--filesystem=ext4", + "--label="+label, + ).CombinedOutput(); err != nil { + t.Fatalf("virt-format %s: %s: %v", path, out, err) + } +} + +// waitForK3sReady polls in-VM `k3s kubectl get nodes` for a Ready +// node up to timeout. Returns true on success, false on timeout. +// Used by the seed-volume tests since they boot via +// StartForDiagnosticWithDisks and don't import the kubeconfig +// host-side. +func waitForK3sReady(t *testing.T, ctx context.Context, cluster *qemu.Cluster, timeout time.Duration) bool { + t.Helper() + deadline := time.Now().Add(timeout) + for time.Now().Before(deadline) { + out, _ := cluster.SSH(ctx, "sudo k3s kubectl get nodes --no-headers 2>/dev/null || true") + if strings.Contains(string(out), "Ready") { + return true + } + time.Sleep(3 * time.Second) + } + return false +} + // waitForSeedState polls `systemctl is-active y-cluster-data-seed.service` // against the VM until it reports `want` or the timeout fires. Returns // the last observed state so the caller can include it in the failure diff --git a/pkg/provision/qemu/lifecycle.go b/pkg/provision/qemu/lifecycle.go index 01f4498..6a721bf 100644 --- a/pkg/provision/qemu/lifecycle.go +++ b/pkg/provision/qemu/lifecycle.go @@ -154,7 +154,7 @@ func guestPoweroff(cacheDir, name string, pid int, logger *zap.Logger) error { // then re-imports the kubeconfig so the host-side context is // fresh even if it was cleaned while the cluster was down. func Start(ctx context.Context, cacheDir, name string, logger *zap.Logger) (*Cluster, error) { - c, err := startVMReady(ctx, cacheDir, name, logger) + c, err := startVMReady(ctx, cacheDir, name, nil, logger) if err != nil { return nil, err } @@ -186,13 +186,27 @@ func Start(ctx context.Context, cacheDir, name string, logger *zap.Logger) (*Clu // SCP wired up; Kubeconfig is initialised but no kubeconfig has // been imported. func StartForDiagnostic(ctx context.Context, cacheDir, name string, logger *zap.Logger) (*Cluster, error) { - return startVMReady(ctx, cacheDir, name, logger) + return startVMReady(ctx, cacheDir, name, nil, logger) +} + +// StartForDiagnosticWithDisks is StartForDiagnostic with extra qcow2 +// or raw disks attached as additional virtio drives. Used by tests +// that exercise the appliance's pre-baked LABEL fstab against a +// real labeled data volume -- the qemu provisioner itself doesn't +// manage data disks (the appliance contract is "customer attaches a +// labeled volume", not "y-cluster wires a fleet of disks"), so this +// API surface is deliberately minimal: the caller owns the disk +// files' lifecycle and just passes their paths. +func StartForDiagnosticWithDisks(ctx context.Context, cacheDir, name string, extraDisks []string, logger *zap.Logger) (*Cluster, error) { + return startVMReady(ctx, cacheDir, name, extraDisks, logger) } // startVMReady is the prefix shared by Start and StartForDiagnostic: // load state, boot the VM, wait for SSH. 
Anything that requires k3s -// to be up belongs in Start, not here. -func startVMReady(ctx context.Context, cacheDir, name string, logger *zap.Logger) (*Cluster, error) { +// to be up belongs in Start, not here. extraDisks is appended after +// the boot disk + cidata seed; nil/empty means "boot disk + seed +// only", the default. +func startVMReady(ctx context.Context, cacheDir, name string, extraDisks []string, logger *zap.Logger) (*Cluster, error) { if logger == nil { logger = zap.NewNop() } @@ -224,6 +238,7 @@ func startVMReady(ctx context.Context, cacheDir, name string, logger *zap.Logger pidFile: pidFilePath(cfg.CacheDir, cfg.Name), logger: logger, Kubeconfig: kubecfg, + extraDisks: extraDisks, } if err := c.startVM(ctx, diskPath, ""); err != nil { diff --git a/pkg/provision/qemu/qemu.go b/pkg/provision/qemu/qemu.go index 602948b..6989e6b 100644 --- a/pkg/provision/qemu/qemu.go +++ b/pkg/provision/qemu/qemu.go @@ -168,6 +168,10 @@ type Cluster struct { pidFile string logger *zap.Logger Kubeconfig *kubeconfig.Manager + // extraDisks is appended after boot+seed in startVM. Set by + // StartForDiagnosticWithDisks for tests that need a labeled + // data volume attached; production code paths leave it nil. + extraDisks []string } // CheckPrerequisites verifies that required binaries and /dev/kvm exist. @@ -715,6 +719,9 @@ func (c *Cluster) startVM(ctx context.Context, diskPath, seedPath string) error if seedPath != "" { args = append(args, "-drive", fmt.Sprintf("file=%s,format=raw,if=virtio", seedPath)) } + for _, d := range c.extraDisks { + args = append(args, "-drive", fmt.Sprintf("file=%s,format=qcow2,if=virtio", d)) + } args = append(args, "-netdev", c.buildNetdev(), "-device", "virtio-net-pci,netdev=net0", From 3826306d198bae8a8db610b435626c8835105422 Mon Sep 17 00:00:00 2001 From: Yolean k8s-qa Date: Wed, 6 May 2026 05:00:45 +0000 Subject: [PATCH 6/7] test(e2e/qemu): cover prepare-export graceful-shutdown signaling Synthetic Pod sleeps 15s in its SIGTERM handler while writing one marker file per second to a local-path PVC, ending with done.txt. Local-path PVs land under /data/yolean which prepare-export packs into /var/lib/y-cluster/data-seed.tar.zst. The test cracks the tarball open via guestfish + zstd + tar and asserts step-15.txt + done.txt are present in the seed bundle, which proves the kubelet honoured the full 30s terminationGracePeriodSeconds across the qemu shutdown path. Catches: - SIGTERM not delivered to pods on `y-cluster stop` -> no markers past started.txt. - Grace period cut short (kubelet kill at <15s) -> step-N.txt for some N<15, no step-15.txt, no done.txt. - PVC didn't reach /data/yolean -> started.txt timeout in the pre-stop wait loop, fail before stop. Why it matters: mariadb's grastate.dat write happens inside its own SIGTERM handler. A torn shutdown means grastate.dat is not in the seed; the customer-side first boot (fresh disk + extracted seed) ends up in galera force-bootstrap with a missing-file sed crash, and keycloak-admin returns the envoy `UD` response flag ("unconditional drop overload") because no upstream is healthy. This synthetic workload tests the same property (full grace period elapsed) without depending on mariadb specifically. Path parity with appliance-qemu-to-gcp.sh: the script's local stages 1-3 invoke `y-cluster provision/stop/prepare-export` over the CLI; those subcommands call qemu.Provision/Stop/PrepareExport from pkg/provision/qemu, which is what this test calls directly. No CLI vs library divergence -- it's the same code path. 
Build-tagged `e2e && kvm` to match the rest of the qemu e2e suite. Adds ~5 min to a serial run of the e2e suite (in line with what other qemu tests cost). Co-Authored-By: Claude Opus 4.7 (1M context) --- e2e/qemu_prepare_export_signaling_test.go | 232 ++++++++++++++++++ .../prepare-export-signaling/deployment.yaml | 89 +++++++ 2 files changed, 321 insertions(+) create mode 100644 e2e/qemu_prepare_export_signaling_test.go create mode 100644 testdata/prepare-export-signaling/deployment.yaml diff --git a/e2e/qemu_prepare_export_signaling_test.go b/e2e/qemu_prepare_export_signaling_test.go new file mode 100644 index 0000000..e4a638f --- /dev/null +++ b/e2e/qemu_prepare_export_signaling_test.go @@ -0,0 +1,232 @@ +//go:build e2e && kvm + +package e2e + +import ( + "context" + "fmt" + "os" + "os/exec" + "path/filepath" + "regexp" + "strings" + "testing" + "time" + + "go.uber.org/zap" + + "github.com/Yolean/y-cluster/pkg/provision/qemu" +) + +// TestQemu_PrepareExport_GracefulShutdown covers the "y-cluster +// stop -> prepare-export gives workloads their full +// terminationGracePeriodSeconds before snapshotting" property. +// +// Without this, a workload whose final-state write happens in its +// SIGTERM handler (mariadb's grastate.dat being the canonical +// example -- a missing grastate.dat in the seed bundle puts the +// customer's first boot into Galera force-bootstrap and +// CrashLoopBackOff) loses that final state from the seed bundle, +// and a customer-side first boot from the seed misses it. +// +// The synthetic workload sleeps 15s in its SIGTERM handler while +// writing one marker per second to a local-path PVC. Local-path +// PVs land under /data/yolean, which prepare-export packs into +// /var/lib/y-cluster/data-seed.tar.zst. The test cracks open the +// tarball post-export via guestfish and asserts step-15.txt + +// done.txt are present, which proves the kubelet honored the full +// 30s terminationGracePeriodSeconds across the cluster shutdown. +// +// Failure modes the test surfaces: +// - SIGTERM not delivered to pods on `y-cluster stop` -> +// no markers past started.txt. +// - Grace period cut short (kubelet kill at <15s) -> +// step-N.txt for some N<15, no step-15.txt, no done.txt. +// - Test workload's PVC didn't reach /data/yolean -> +// started.txt timeout in the wait loop, fail before stop. 
+func TestQemu_PrepareExport_GracefulShutdown(t *testing.T) { + if _, err := os.Stat("/dev/kvm"); err != nil { + t.Skip("QEMU tests require /dev/kvm") + } + if err := qemu.CheckPrerequisites(); err != nil { + t.Skip(err) + } + if _, err := exec.LookPath("virt-customize"); err != nil { + t.Skip("virt-customize not on PATH; install libguestfs-tools") + } + if _, err := exec.LookPath("guestfish"); err != nil { + t.Skip("guestfish not on PATH; install libguestfs-tools") + } + if _, err := exec.LookPath("zstd"); err != nil { + t.Skip("zstd not on PATH") + } + + logger, _ := zap.NewDevelopment() + cfg := e2eQEMURuntime() + cfg.Name = "y-cluster-e2e-graceful" + cfg.Context = "y-cluster-e2e-graceful" + cfg.CacheDir = t.TempDir() + cfg.Memory = "4096" + cfg.CPUs = "2" + cfg.SSHPort = "2229" + cfg.PortForwards = e2eUniqueForwards("26449", "28449") + cfg.Kubeconfig = os.Getenv("KUBECONFIG") + if cfg.Kubeconfig == "" { + t.Skip("KUBECONFIG must be set") + } + t.Setenv("Y_CLUSTER_QEMU_CACHE_DIR", cfg.CacheDir) + + ctx := context.Background() + + cluster, err := qemu.Provision(ctx, cfg, logger) + if err != nil { + t.Fatal(err) + } + t.Cleanup(func() { _ = cluster.Teardown(false) }) + + // Apply the test workload manifest. Bundled + // local-path-provisioner creates the PV under + // /data/yolean/_-/, and the Pod's marker writes + // land there. + if err := kubectlApply(ctx, cfg.Context, cfg.Kubeconfig, + "../testdata/prepare-export-signaling/deployment.yaml"); err != nil { + t.Fatalf("kubectl apply: %v", err) + } + + // Wait for the workload to be Available. kubectl wait short- + // circuits the existing kubelet ordering: by the time it + // returns, the Pod is Ready and the trap is installed. + if err := kubectlWaitAvailable(ctx, cfg.Context, cfg.Kubeconfig, + "prepare-export-signaling", "deployment/shutdown-tester", + 3*time.Minute); err != nil { + out, _ := cluster.SSH(ctx, + "sudo k3s kubectl -n prepare-export-signaling get pods,pvc,events 2>&1") + t.Fatalf("workload not Available: %v\ncluster state:\n%s", err, out) + } + + // Wait for the workload to actually write its startup marker. + // Available != bytes-on-disk; the trap-arming + first + // `date > started.txt` runs after the Ready probe passes. + startedPath := waitForStartedMarker(t, ctx, cluster, 60*time.Second) + if startedPath == "" { + out, _ := cluster.SSH(ctx, "sudo find /data/yolean -path '*prepare-export-signaling*' -ls 2>&1") + t.Fatalf("started.txt never appeared after 60s\nfind output:\n%s", out) + } + pvDir := filepath.Dir(startedPath) + t.Logf("workload PV dir on appliance: %s", pvDir) + + // Stop -- the path under test. systemd shuts down the + // kubelet/containerd/k3s, which in turn SIGTERMs running + // pods. The trap should run for 15s and write step-{1..15}.txt + // + done.txt before the 30s terminationGracePeriodSeconds + // expires. + stopStart := time.Now() + if err := qemu.Stop(cfg.CacheDir, cfg.Name, logger); err != nil { + t.Fatalf("Stop: %v", err) + } + t.Logf("Stop elapsed: %s", time.Since(stopStart)) + + // prepare-export packs /data/yolean (which now contains the + // post-shutdown markers) into /var/lib/y-cluster/data-seed.tar.zst. + if err := qemu.PrepareExport(ctx, cfg.CacheDir, cfg.Name, logger); err != nil { + t.Fatalf("PrepareExport: %v", err) + } + + // Crack open the seed tarball via libguestfs. The disk has + // been prepared (cluster is offline at this point), so we + // copy the tarball out via guestfish directly. 
+ seedDest := t.TempDir() + if out, err := exec.Command("guestfish", + "--ro", + "-a", cluster.DiskPath(), + "-i", + "copy-out", "/var/lib/y-cluster/data-seed.tar.zst", seedDest, + ).CombinedOutput(); err != nil { + t.Fatalf("guestfish copy-out: %s: %v", out, err) + } + seedTarball := filepath.Join(seedDest, "data-seed.tar.zst") + if _, err := os.Stat(seedTarball); err != nil { + t.Fatalf("seed tarball not extracted: %v", err) + } + + listCmd := exec.Command("sh", "-c", + fmt.Sprintf("zstd -d --stdout %q | tar tf -", + seedTarball)) + listOut, err := listCmd.CombinedOutput() + if err != nil { + t.Fatalf("inspect seed: %s: %v", listOut, err) + } + listing := string(listOut) + + // Required markers. step-15.txt missing means the trap got + // killed before its loop completed; done.txt missing means + // the trap ran but didn't reach its final write. + for _, want := range []string{ + "prepare-export-signaling_markers", + "started.txt", + "step-15.txt", + "done.txt", + } { + if !strings.Contains(listing, want) { + t.Errorf("seed bundle missing %q\nfull listing:\n%s", want, listing) + } + } + + // Diagnostic: how many step markers actually made it. A + // healthy run logs 15/15. A truncated run shows where the + // kubelet pulled the rug. + stepRE := regexp.MustCompile(`/step-(\d+)\.txt`) + matches := stepRE.FindAllStringSubmatch(listing, -1) + t.Logf("step markers in seed: %d/15", len(matches)) +} + +// kubectlApply runs `kubectl --context= apply -f ` +// against the e2e cluster, with KUBECONFIG passed via env (the +// rest of qemu_test.go uses that pattern; mirror it here). +func kubectlApply(ctx context.Context, ctxName, kcfgPath, file string) error { + cmd := exec.CommandContext(ctx, "kubectl", + "--context="+ctxName, + "apply", "-f", file) + cmd.Env = append(os.Environ(), "KUBECONFIG="+kcfgPath) + if out, err := cmd.CombinedOutput(); err != nil { + return fmt.Errorf("%s: %w", out, err) + } + return nil +} + +// kubectlWaitAvailable blocks until the named Deployment reports +// condition=Available or the timeout fires. +func kubectlWaitAvailable(ctx context.Context, ctxName, kcfgPath, namespace, target string, timeout time.Duration) error { + cmd := exec.CommandContext(ctx, "kubectl", + "--context="+ctxName, + "-n", namespace, + "wait", "--for=condition=available", + "--timeout="+timeout.String(), + target) + cmd.Env = append(os.Environ(), "KUBECONFIG="+kcfgPath) + if out, err := cmd.CombinedOutput(); err != nil { + return fmt.Errorf("%s: %w", out, err) + } + return nil +} + +// waitForStartedMarker polls in-VM for the test workload's +// /markers/started.txt until it appears or timeout. Returns the +// absolute on-disk path to the marker (under /data/yolean), or +// empty on timeout. +func waitForStartedMarker(t *testing.T, ctx context.Context, cluster *qemu.Cluster, timeout time.Duration) string { + t.Helper() + deadline := time.Now().Add(timeout) + for time.Now().Before(deadline) { + out, err := cluster.SSH(ctx, + "sudo find /data/yolean -name started.txt -path '*prepare-export-signaling*' 2>/dev/null | head -1") + if err == nil { + line := strings.TrimSpace(string(out)) + if line != "" { + return line + } + } + time.Sleep(2 * time.Second) + } + return "" +} diff --git a/testdata/prepare-export-signaling/deployment.yaml b/testdata/prepare-export-signaling/deployment.yaml new file mode 100644 index 0000000..f95312a --- /dev/null +++ b/testdata/prepare-export-signaling/deployment.yaml @@ -0,0 +1,89 @@ +# Test workload for TestQemu_PrepareExport_GracefulShutdown. 
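+#
+# Fixture only: the Go test applies this manifest with `kubectl apply
+# -f` against the throwaway e2e cluster; nothing in it is staged into
+# the appliance image.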
+# +# The Pod's trap runs for ~15 seconds in response to SIGTERM, +# writing one file per second to a local-path PVC. local-path +# PVs land under /data/yolean (the y-cluster bundled storage +# root), which `y-cluster prepare-export` packs into the seed +# tarball. The Go test cracks the tarball open afterwards and +# asserts step-15.txt + done.txt are present, which proves +# the kubelet honored the full terminationGracePeriodSeconds +# during cluster shutdown. +# +# Failure modes the test catches via this workload: +# - SIGTERM not delivered to pods on `y-cluster stop` -> +# no markers past started.txt. +# - Grace period cut short (e.g. kubelet killed pods at 5s) -> +# step-N.txt for some N<15, no done.txt, no step-15.txt. +# - Healthy path -> all 15 step markers + done.txt. +--- +apiVersion: v1 +kind: Namespace +metadata: + name: prepare-export-signaling +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: markers + namespace: prepare-export-signaling +spec: + accessModes: [ReadWriteOnce] + storageClassName: local-path + resources: + requests: + storage: 10Mi +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: shutdown-tester + namespace: prepare-export-signaling +spec: + replicas: 1 + selector: + matchLabels: + app: shutdown-tester + strategy: + type: Recreate # PVC is RWO; rolling update would deadlock + template: + metadata: + labels: + app: shutdown-tester + spec: + # 30s budget: 15s for the trap + slack for the kubelet to + # propagate SIGTERM and reap. Production-realistic: matches + # the grace period y-cluster's spot-node Pods rely on. + terminationGracePeriodSeconds: 30 + containers: + - name: tester + image: alpine:3 + command: ["/bin/sh", "-c"] + args: + - | + set -eu + mkdir -p /markers + # Startup marker: proves the Pod ran + reached the PV + # before stop fired. + date -u +%FT%T.%NZ > /markers/started.txt + sync + # SIGTERM handler: 15 step markers (one per second) + + # a final done.txt. The trap runs in the foreground, + # so the parent process exits with code 0 only after + # done.txt is on disk + sync'd. + trap 'for i in $(seq 1 15); do + date -u +%FT%T.%NZ > /markers/step-$i.txt + sync + sleep 1 + done + date -u +%FT%T.%NZ > /markers/done.txt + sync + exit 0' TERM + # Block forever; only the trap exits. + while true; do sleep 60 & wait; done + volumeMounts: + - name: markers + mountPath: /markers + volumes: + - name: markers + persistentVolumeClaim: + claimName: markers From 5ada0ce3b84baeb9f429014c5d326544edeb9218 Mon Sep 17 00:00:00 2001 From: Yolean k8s-qa Date: Wed, 6 May 2026 16:13:28 +0000 Subject: [PATCH 7/7] fix(provision/qemu): teardown logs reflect actual file ops Previously teardown unconditionally logged "teardown complete, disk and keypair deleted" regardless of what actually happened. The os.Remove calls swallowed all errors (including IsNotExist) with `_ =`, so a teardown against an already-empty cache dir -- common after prepare-export consumed the qcow2, or when an operator pointed at the wrong --cacheDir -- still claimed deletion that never occurred. Now we track which artefacts the loop actually removed, log "teardown complete, no artefacts found to delete" when the list is empty, and emit a `removed` field listing the basenames when it isn't. Real os.Remove failures (anything other than IsNotExist) surface as Warn lines instead of disappearing. Two new unit tests pin both halves of the contract via zaptest/observer: empty-cache must NOT claim deletion; a cache with disk + keypair must list both in the `removed` field. 
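With the change in place the two completion shapes read, roughly (the
values below are illustrative; the real list comes from perVMArtefacts
plus the state file, and rendering depends on the caller's zap
encoder):

    teardown complete, no artefacts found to delete    cacheDir=<dir> name=<vm>
    teardown complete    removed=["<vm>.qcow2", "<vm>-ssh", ...]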
Co-Authored-By: Claude Opus 4.7 (1M context) --- pkg/provision/qemu/qemu.go | 37 ++++++++++- pkg/provision/qemu/qemu_test.go | 105 ++++++++++++++++++++++++++++++++ 2 files changed, 139 insertions(+), 3 deletions(-) diff --git a/pkg/provision/qemu/qemu.go b/pkg/provision/qemu/qemu.go index 6989e6b..4f33b2f 100644 --- a/pkg/provision/qemu/qemu.go +++ b/pkg/provision/qemu/qemu.go @@ -402,11 +402,42 @@ func TeardownConfig(cfg Config, keepDisk bool, logger *zap.Logger) error { logger.Info("teardown complete, disk preserved", zap.String("disk", diskPath)) return nil } + var removed []string for _, p := range perVMArtefacts(cfg.CacheDir, cfg.Name) { - _ = os.Remove(p) + err := os.Remove(p) + switch { + case err == nil: + removed = append(removed, filepath.Base(p)) + case os.IsNotExist(err): + // Artefact already gone -- idempotent teardown, no + // log spam. + default: + logger.Warn("teardown could not remove artefact", + zap.String("path", p), zap.Error(err)) + } + } + stateFile := statePath(cfg.CacheDir, cfg.Name) + hadState := false + if _, err := os.Stat(stateFile); err == nil { + hadState = true + } + if err := removeState(cfg.CacheDir, cfg.Name); err != nil { + logger.Warn("teardown could not remove state file", zap.Error(err)) + } else if hadState { + removed = append(removed, filepath.Base(stateFile)) + } + if len(removed) == 0 { + // Nothing to do means the cache dir didn't have anything for + // this name -- a previous teardown ran, or provision never + // ran. Either way, lying with a "deleted" log would mask + // real bugs (e.g. wrong --cacheDir). + logger.Info("teardown complete, no artefacts found to delete", + zap.String("cacheDir", cfg.CacheDir), + zap.String("name", cfg.Name)) + } else { + logger.Info("teardown complete", + zap.Strings("removed", removed)) } - _ = removeState(cfg.CacheDir, cfg.Name) - logger.Info("teardown complete, disk and keypair deleted") return nil } diff --git a/pkg/provision/qemu/qemu_test.go b/pkg/provision/qemu/qemu_test.go index d637524..ef7774e 100644 --- a/pkg/provision/qemu/qemu_test.go +++ b/pkg/provision/qemu/qemu_test.go @@ -6,6 +6,10 @@ import ( "strings" "testing" + "go.uber.org/zap" + "go.uber.org/zap/zapcore" + "go.uber.org/zap/zaptest/observer" + "github.com/Yolean/y-cluster/pkg/provision/config" ) @@ -165,6 +169,107 @@ func TestTeardownConfig_DeleteDisk(t *testing.T) { } } +// TestTeardownConfig_LogsTruthfullyWhenNothingToDelete pins +// the truthful-logging contract: a teardown against a cache +// dir that holds no artefacts (already torn down, or the +// operator pointed at the wrong --cacheDir) must NOT log +// "deleted". Lying with an "X deleted" line masks real bugs +// like a wrong cache path. The previous shape unconditionally +// logged "teardown complete, disk and keypair deleted" even +// when os.Remove returned IsNotExist on every artefact. +func TestTeardownConfig_LogsTruthfullyWhenNothingToDelete(t *testing.T) { + core, recorded := observer.New(zapcore.InfoLevel) + logger := zap.New(core) + + cfg := defaultedRuntimeConfig(t) + cfg.CacheDir = t.TempDir() + cfg.Kubeconfig = "" + if err := TeardownConfig(cfg, false, logger); err != nil { + t.Fatal(err) + } + // Walk the recorded entries; we want exactly one info-level + // completion line and it must NOT claim anything was deleted. 
+ var completion observer.LoggedEntry + for _, e := range recorded.All() { + if strings.HasPrefix(e.Message, "teardown complete") { + completion = e + } + } + if completion.Message == "" { + t.Fatalf("expected a teardown-complete log line, got: %+v", recorded.All()) + } + if !strings.Contains(completion.Message, "no artefacts found") { + t.Errorf("teardown-complete log must say nothing was deleted, got %q", completion.Message) + } + for _, f := range completion.Context { + if f.Key == "removed" { + t.Errorf("removed field should be absent on the empty-cache path, got %v", f) + } + } +} + +// TestTeardownConfig_LogsRemovedArtefacts pins the inverse: +// when artefacts exist on disk, the completion log must list +// them in a `removed` field. Consumers (the appliance build +// script in particular) rely on that signal to confirm the +// teardown actually freed the disk before they re-provision. +func TestTeardownConfig_LogsRemovedArtefacts(t *testing.T) { + core, recorded := observer.New(zapcore.InfoLevel) + logger := zap.New(core) + + cfg := defaultedRuntimeConfig(t) + cfg.CacheDir = t.TempDir() + cfg.Kubeconfig = "" + diskPath := filepath.Join(cfg.CacheDir, cfg.Name+".qcow2") + if err := os.WriteFile(diskPath, []byte("fake"), 0o644); err != nil { + t.Fatal(err) + } + keyPath := filepath.Join(cfg.CacheDir, cfg.Name+"-ssh") + if err := os.WriteFile(keyPath, []byte("fake"), 0o600); err != nil { + t.Fatal(err) + } + + if err := TeardownConfig(cfg, false, logger); err != nil { + t.Fatal(err) + } + var completion observer.LoggedEntry + for _, e := range recorded.All() { + if strings.HasPrefix(e.Message, "teardown complete") { + completion = e + } + } + if completion.Message == "" { + t.Fatalf("expected a teardown-complete log line, got: %+v", recorded.All()) + } + if strings.Contains(completion.Message, "no artefacts found") { + t.Errorf("non-empty teardown should not log 'no artefacts found': %q", completion.Message) + } + // zap's ArrayMarshaler types aren't trivially assertable + // off Field.Interface; ContextMap walks the encoder so we + // get []any for a Strings field. + ctxMap := completion.ContextMap() + rawRemoved, ok := ctxMap["removed"].([]any) + if !ok { + t.Fatalf("removed field missing or wrong shape on completion log: %v", ctxMap) + } + wantPresent := map[string]bool{ + filepath.Base(diskPath): false, + filepath.Base(keyPath): false, + } + for _, item := range rawRemoved { + if name, _ := item.(string); name != "" { + if _, want := wantPresent[name]; want { + wantPresent[name] = true + } + } + } + for name, seen := range wantPresent { + if !seen { + t.Errorf("removed list missing %q (got %v)", name, rawRemoved) + } + } +} + // TestTeardownConfig_DeletesKeypair pins the no-key-reuse contract: // teardown must remove the SSH keypair (and the other per-VM // artefacts) so the next provision generates a fresh one. Reusing