cachebox-project · EdHasNoLife · May 27, 2026 · May 27, 2026 · May 27, 2026 · May 27, 2026
diff --git a/.github/workflows/c2-reconciler-canary.yml b/.github/workflows/c2-reconciler-canary.yml
@@ -0,0 +1,57 @@
+# On-demand / scheduled CPU canary for the C2 CacheBackend reconciler.
+#
+# Runs docs/reference-stack/scripts/canary_c2_reconcile.sh: brings up a kind
+# cluster, runs the controller, applies a CPU-profile CacheBackend, and asserts
+# the reconciler stands up a healthy serving backend (status.health=Ready,
+# endpoint published), an engine prefix-cache hit, and owner-ref GC on delete.
+# GPU-free.
+#
+# This is NOT a per-PR gate (it pulls a multi-GB image, needs Docker + kind, and
+# ~10 GiB RAM); it runs on a schedule and on manual dispatch.
+name: c2-reconciler-canary
+
+on:
+  workflow_dispatch:
+    inputs:
+      runner:
+        description: "Runner label (override to target a self-hosted Docker host)"
+        default: ubuntu-latest
+        required: false
+  schedule:
+    - cron: "30 7 * * *" # nightly 07:30 UTC
+
+permissions:
+  contents: read
+
+concurrency:
+  group: c2-reconciler-canary
+  cancel-in-progress: false
+
+jobs:
+  canary:
+    runs-on: ${{ github.event.inputs.runner || 'ubuntu-latest' }}
+    timeout-minutes: 40
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: actions/setup-go@v5
+        with:
+          go-version-file: go.mod
+
+      - name: Install kind
+        uses: helm/kind-action@v1
+        with:
+          install_only: true
+
+      - name: Run C2 reconciler CPU canary
+        run: docs/reference-stack/scripts/canary_c2_reconcile.sh
+
+      - name: Upload canary logs on failure
+        if: failure()
+        uses: actions/upload-artifact@v4
+        with:
+          name: c2-canary-logs
+          path: |
+            /tmp/c2-canary-controller.log
+            /tmp/c2-canary-pf.log
+          if-no-files-found: ignore
diff --git a/config/samples/cachebackend-lmcache-cpu.yaml b/config/samples/cachebackend-lmcache-cpu.yaml
@@ -0,0 +1,30 @@
+# GPU-free LMCache backend for substrate validation (backendConfig profile=cpu).
+#
+# Renders a vLLM CPU engine with prefix caching + the KV-cache event publisher,
+# but WITHOUT the LMCache connector and WITHOUT a GPU resource request — real
+# LMCache offload needs a GPU (use the default gpu profile for that). This lets
+# the reconciler stand up a healthy, serving backend on a CPU-only cluster (e.g.
+# kind) to validate the engine-config + KV-event path end to end.
+#
+# IMPORTANT: the CPU image is arch-tagged upstream with no multi-arch tag, so the
+# `image` below is a deliberate non-applyable PLACEHOLDER — substitute the tag for
+# your host before applying: `latest-arm64` (arm64) or `latest-x86_64` (x86_64).
+# Applying as-is fails fast (bad tag) rather than silently running the wrong arch.
+apiVersion: inferencecache.io/v1alpha1
+kind: CacheBackend
+metadata:
+  labels:
+    app.kubernetes.io/name: inference-cache
+  name: cachebackend-lmcache-cpu
+spec:
+  type: LMCache
+  deploymentKind: Deployment
+  replicas: 1
+  integration:
+    engine: vLLM
+    role: ReadWrite
+  backendConfig:
+    profile: cpu
+    # PLACEHOLDER — replace <arch> with arm64 or x86_64 for your host before applying.
+    image: vllm/vllm-openai-cpu:latest-<arch>
+    model: Qwen/Qwen2.5-0.5B-Instruct
diff --git a/docs/design/cachebackend-api.md b/docs/design/cachebackend-api.md
@@ -52,6 +52,17 @@ The `v1alpha1` contract must remain backward-compatible where possible. New fiel
 
 It intentionally does not expose `containers`; requiring users to provide containers would conflict with managed backend defaults and would make simple scheduling overrides unnecessarily large.
 
+### backendConfig keys (managed LMCache)
+
+`spec.backendConfig` is a free-form string map; the managed LMCache builder recognizes a few keys as overrides until they are promoted to first-class spec fields:
+
+| Key | Default | Purpose |
+|---|---|---|
+| `image` | gpu: lmcache reference image; cpu: **required** | Container image for the backend engine. The CPU image is arch-tagged upstream with no safe multi-arch default, so `profile=cpu` requires an explicit image. |
+| `model` | profile-dependent | Model the engine serves (`vllm serve <model>`). |
+| `hfTokenSecret` | `hf-token` | Name of the Secret (key `token`) injected as `HF_TOKEN` for gated model pulls. The reference is optional, so ungated models run without it. |
+| `profile` | `gpu` | Rendering profile. `gpu` (default): the full vLLM + LMCache connector with prefix caching, KV events, and an `nvidia.com/gpu` limit. `cpu`: a GPU-free vLLM engine (no GPU limit, no LMCache connector) that keeps prefix caching + the KV-event publisher, for validating the substrate off-GPU. Real LMCache offload requires a GPU, so it stays on the `gpu` profile. |
+
 ## Status
 
 | Field | Type | Purpose |

diff --git a/docs/reference-stack/README.md b/docs/reference-stack/README.md
@@ -142,6 +142,27 @@ run it after changing the subscriber.
 
 ---
 
+## CacheBackend reconciler canary (CPU)
+
+[`scripts/canary_c2_reconcile.sh`](scripts/canary_c2_reconcile.sh) is a GPU-free,
+on-demand canary for the **C2 reconciler**: it brings up a kind cluster, runs the
+controller, applies a `CacheBackend` with `backendConfig.profile: cpu`, and asserts
+the controller stands up a healthy serving backend (`status.health=Ready`, endpoint
+published), an engine prefix-cache hit through the Service, and owner-ref garbage
+collection when the CR is deleted. It exercises the reconciler against real pods —
+the gap the envtest unit tests can't cover.
+
+```bash
+docs/reference-stack/scripts/canary_c2_reconcile.sh
+```
+
+Like the full-chain canary it is **on-demand**, not a blocking gate — CI runs it only
+nightly and on manual dispatch. It needs Docker + kind, pulls the vLLM CPU image, and wants
+~10+ GiB of Docker VM RAM. The `cpu` profile runs a GPU-free vLLM engine (prefix caching +
+KV events, no LMCache offload); real LMCache offload still needs a GPU (the default `gpu` profile).
+
+---
+
 ## Teardown
 
 ```bash

diff --git a/docs/reference-stack/scripts/canary_c2_reconcile.sh b/docs/reference-stack/scripts/canary_c2_reconcile.sh
@@ -0,0 +1,148 @@
+#!/usr/bin/env bash
+# CPU canary for the C2 CacheBackend reconciler. Proves the controller stands up a
+# healthy, serving backend from a CR on a GPU-free cluster (kind):
+#
+#   kubectl apply CacheBackend(profile=cpu) --> controller --> Deployment + Service
+#     --> CPU vLLM pods become Ready --> status.health=Ready, status.endpoint set
+#
+# Optionally drives prefix traffic through the Service and checks an engine
+# prefix-cache hit. Deleting the CR garbage-collects the children via owner refs.
+#
+# This exercises the reconciler end to end against real pods — the gap envtest
+# can't cover. It uses the CPU profile (no GPU, no LMCache offload); real LMCache
+# offload needs a GPU (default profile).
+#
+# On-demand canary (NOT a per-PR gate): needs Docker + kind + kubectl, pulls the
+# multi-GB vLLM CPU image, and a Docker VM with ~10+ GiB RAM (CPU runtime baseline
+# ~5 GiB + KV cache). See docs/reference-stack/VERSIONS.md.
+#
+# Usage:  docs/reference-stack/scripts/canary_c2_reconcile.sh
+# Tunables via env: IMAGE (or IMAGE_TAG), MODEL, KIND, KIND_CLUSTER, NAMESPACE, CR_NAME, READY_TIMEOUT, SKIP_TRAFFIC.
+set -euo pipefail
+
+# Image tag defaults to the host architecture; IMAGE_TAG or IMAGE from the environment wins.
+host_arch="$(uname -m)"
+case "$host_arch" in
+  arm64 | aarch64) : "${IMAGE_TAG:=latest-arm64}" ;;
+  *) : "${IMAGE_TAG:=latest-x86_64}" ;;
+esac
+: "${IMAGE:=vllm/vllm-openai-cpu:$IMAGE_TAG}"
+: "${MODEL:=Qwen/Qwen2.5-0.5B-Instruct}"
+: "${KIND_CLUSTER:=ic-c2-canary}"
+: "${NAMESPACE:=c2-canary}"
+: "${CR_NAME:=canary}"
+: "${READY_TIMEOUT:=900}" # seconds allowed for the CPU model to load and report Ready
+: "${SKIP_TRAFFIC:=0}"
+
+# Work from the repository root so the relative paths used below resolve.
+REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../.." && pwd)"
+cd "$REPO_ROOT"
+# Prefer a repo-local ./bin/kind when KIND is not provided.
+if [ -z "${KIND:-}" ]; then
+  if [ -x ./bin/kind ]; then KIND=./bin/kind; else KIND=kind; fi
+fi
+controller_pid="" pf_pid="" # background PIDs, filled in later and killed by cleanup
+log() { echo "[c2-canary] $*"; }
+fail() { log "FAIL: $*" >&2; exit 1; }
+
+cleanup() {
+  [ -n "$pf_pid" ] && kill "$pf_pid" 2>/dev/null || true
+  [ -n "$controller_pid" ] && kill "$controller_pid" 2>/dev/null || true
+  "$KIND" delete cluster --name "$KIND_CLUSTER" >/dev/null 2>&1 || true
+}
+trap cleanup EXIT
+
+# --- cluster ----------------------------------------------------------------
+log "creating kind cluster $KIND_CLUSTER"
+"$KIND" create cluster --name "$KIND_CLUSTER" --wait 120s
+KUBECONFIG_ARGS=(--context "kind-$KIND_CLUSTER") # prepended to every kubectl call below
+# Pull on the host first, then copy the image into the kind cluster's node(s).
+log "pulling CPU image and loading it into the node ($IMAGE)"
+docker pull "$IMAGE"
+"$KIND" load docker-image "$IMAGE" --name "$KIND_CLUSTER"
+
+# --- controller -------------------------------------------------------------
+log "installing CRD"
+kubectl "${KUBECONFIG_ARGS[@]}" apply -f config/crd/bases/inferencecache.io_cachebackends.yaml
+# Wait for Established so neither the controller start nor the CR apply races API discovery.
+kubectl "${KUBECONFIG_ARGS[@]}" wait --for=condition=Established --timeout=60s -f config/crd/bases/inferencecache.io_cachebackends.yaml
+log "building + starting the controller"
+go build -o bin/controller ./cmd/controller
+./bin/controller --leader-elect=false >/tmp/c2-canary-controller.log 2>&1 &
+controller_pid=$!
+kubectl "${KUBECONFIG_ARGS[@]}" create namespace "$NAMESPACE"
+
+# --- apply the CacheBackend (CPU profile) -----------------------------------
+log "applying CacheBackend $NAMESPACE/$CR_NAME (profile=cpu, image=$IMAGE)" # the unquoted heredoc below expands $CR_NAME, $NAMESPACE, $IMAGE and $MODEL
+kubectl "${KUBECONFIG_ARGS[@]}" apply -f - <<EOF
+apiVersion: inferencecache.io/v1alpha1
+kind: CacheBackend
+metadata:
+  name: $CR_NAME
+  namespace: $NAMESPACE
+spec:
+  type: LMCache
+  deploymentKind: Deployment
+  replicas: 1
+  backendConfig:
+    profile: cpu
+    image: $IMAGE
+    model: $MODEL
+EOF
+
+# --- wait for the reconciler to report Ready --------------------------------
+log "waiting up to ${READY_TIMEOUT}s for status.health=Ready (CPU model load is slow)"
+deadline=$(($(date +%s) + READY_TIMEOUT))
+while :; do
+  health="$(kubectl "${KUBECONFIG_ARGS[@]}" -n "$NAMESPACE" get cachebackend "$CR_NAME" -o jsonpath='{.status.health}' 2>/dev/null || true)"
+  if [ "$health" = "Ready" ]; then break; fi # tested before the deadline: a Ready seen on the final poll still passes
+  if [ "$(date +%s)" -ge "$deadline" ]; then
+    kubectl "${KUBECONFIG_ARGS[@]}" -n "$NAMESPACE" get pods -o wide || true
+    kubectl "${KUBECONFIG_ARGS[@]}" -n "$NAMESPACE" describe deployment "$CR_NAME" || true
+    fail "backend did not become Ready within ${READY_TIMEOUT}s (last health='$health')"
+  fi
+  sleep 5
+done
+log "status.health=Ready"
+
+endpoint="$(kubectl "${KUBECONFIG_ARGS[@]}" -n "$NAMESPACE" get cachebackend "$CR_NAME" -o jsonpath='{.status.endpoint}')"
+[ -n "$endpoint" ] || fail "status.endpoint was not published"
+log "status.endpoint=$endpoint"
+
+avail="$(kubectl "${KUBECONFIG_ARGS[@]}" -n "$NAMESPACE" get deployment "$CR_NAME" -o jsonpath='{.status.availableReplicas}')"
+[ "${avail:-0}" -ge 1 ] || fail "deployment has no available replicas" # an empty jsonpath result counts as 0
+
+# --- optional: drive prefix traffic + check a cache hit ---------------------
+if [ "$SKIP_TRAFFIC" != "1" ]; then
+  log "port-forwarding the Service to drive prefix traffic"
+  kubectl "${KUBECONFIG_ARGS[@]}" -n "$NAMESPACE" port-forward "svc/$CR_NAME" 18000:8000 >/tmp/c2-canary-pf.log 2>&1 &
+  pf_pid=$!
+  for attempt in $(seq 1 30); do # ~30s for the port-forward to answer; fail with the real cause if it never does
+    curl -sf -o /dev/null "http://localhost:18000/health" && break
+    [ "$attempt" -lt 30 ] || fail "port-forward to svc/$CR_NAME never became healthy (see /tmp/c2-canary-pf.log)"
+    sleep 1
+  done
+  hits() { curl -s "http://localhost:18000/metrics" | awk '/^vllm:prefix_cache_hits_total/{s+=$2} END{print s+0}'; }
+  PREFIX="$(python3 -c 'print(("You are a meticulous canary assistant. Follow the rules precisely. " * 200).strip())')"
+  fire() { # POST one chat completion ($1 = user message) behind the shared system PREFIX; prints the HTTP status code
+    curl -s -o /dev/null -w '%{http_code}' -H 'Content-Type: application/json' "http://localhost:18000/v1/chat/completions" \
+      -d "$(python3 -c 'import json,sys;print(json.dumps({"model":sys.argv[3],"max_tokens":8,"temperature":0,"messages":[{"role":"system","content":sys.argv[1]},{"role":"user","content":sys.argv[2]}]}))' "$PREFIX" "$1" "$MODEL")"
+  }
+  h0=$(hits)
+  log "request 1 (cold prefix): HTTP $(fire 'summarize in one word')"
+  log "request 2 (same prefix):  HTTP $(fire 'summarize in two words')"
+  h1=$(hits)
+  log "prefix_cache_hits: $h0 -> $h1"
+  [ "$h1" -gt "$h0" ] || fail "no engine prefix-cache hit (hits did not increase)"
+fi
+
+# --- delete the CR -> owner-ref GC ------------------------------------------
+log "deleting the CR; expecting owner-ref GC of the Deployment + Service"
+kubectl "${KUBECONFIG_ARGS[@]}" -n "$NAMESPACE" delete cachebackend "$CR_NAME" --wait=true
+gc_deadline=$(($(date +%s) + 60)) # a failed listing below is retried, never read as "no children left"
+until children="$(kubectl "${KUBECONFIG_ARGS[@]}" -n "$NAMESPACE" get deploy,svc -o name 2>/dev/null)" && [ -z "$children" ]; do
+  [ "$(date +%s)" -lt "$gc_deadline" ] || fail "children were not garbage-collected after CR deletion"
+  sleep 2
+done
+
+log "PASS — reconciler stood up a healthy CPU backend, published its endpoint, and cleaned up on delete"
diff --git a/internal/controller/cachebackend_controller.go b/internal/controller/cachebackend_controller.go
@@ -189,6 +189,10 @@ func (r *CacheBackendReconciler) applyService(ctx context.Context, backend *cach
 func reconcileManagedPodSpec(live *corev1.PodSpec, desired *corev1.PodSpec) {
 	reconcileManagedContainer(live, desired)
 
+	// Volumes are builder-owned (e.g. the shm size differs by profile) and are not
+	// API-server-defaulted in a Deployment template, so copying them is churn-free.
+	live.Volumes = desired.Volumes
+
 	live.NodeSelector = desired.NodeSelector
 	live.Affinity = desired.Affinity
 	live.Tolerations = desired.Tolerations
@@ -215,6 +219,9 @@ func reconcileManagedContainer(live *corev1.PodSpec, desired *corev1.PodSpec) {
 			live.Containers[i].Command = want.Command
 			live.Containers[i].Args = want.Args
 			live.Containers[i].Env = want.Env
+			// Resources are builder-owned (the GPU limit differs by profile) and not
+			// API-server-defaulted, so reconciling them is churn-free.
+			live.Containers[i].Resources = want.Resources
 			return
 		}
 	}

diff --git a/internal/controller/cachebackend_controller_test.go b/internal/controller/cachebackend_controller_test.go
@@ -218,6 +218,44 @@ func TestReconcileLMCacheUpdatesImage(t *testing.T) {
 	}
 }
 
+// TestReconcileLMCacheProfileSwitchGPUToCPU flips an already-reconciled backend to
+// profile=cpu and checks the live Deployment's GPU limit, image, and shm volume.
+func TestReconcileLMCacheProfileSwitchGPUToCPU(t *testing.T) {
+	const gpuResource, cpuImage = "nvidia.com/gpu", "vllm/vllm-openai-cpu:latest-arm64"
+	r := newReconciler(newScheme(t), lmcacheBackend("cache", "ns1"))
+	// First pass: the default profile must put a GPU limit on the container.
+	reconcile(t, r, "cache", "ns1")
+	before := getDeployment(t, r, "cache", "ns1").Spec.Template.Spec.Containers[0]
+	if _, hasGPU := before.Resources.Limits[gpuResource]; !hasGPU {
+		t.Fatalf("default profile should request a GPU")
+	}
+	// Rewrite the stored CR's backendConfig to the cpu profile and reconcile again.
+	backend := getBackend(t, r, "cache", "ns1")
+	backend.Spec.BackendConfig = map[string]string{"profile": "cpu", "image": cpuImage}
+	if err := r.Update(context.Background(), backend); err != nil {
+		t.Fatalf("switch to cpu profile: %v", err)
+	}
+	reconcile(t, r, "cache", "ns1")
+	// The switch must reach the live Deployment: no GPU limit, cpu image, 4Gi shm.
+	podSpec := getDeployment(t, r, "cache", "ns1").Spec.Template.Spec
+	after := podSpec.Containers[0]
+	if _, hasGPU := after.Resources.Limits[gpuResource]; hasGPU {
+		t.Fatalf("GPU limit should be removed after switching to cpu profile")
+	}
+	if after.Image != cpuImage {
+		t.Fatalf("image = %q, want cpu image after profile switch", after.Image)
+	}
+	var shm *corev1.Volume
+	for i := range podSpec.Volumes {
+		if podSpec.Volumes[i].Name == "shm" {
+			shm = &podSpec.Volumes[i]
+		}
+	}
+	if shm == nil || shm.EmptyDir == nil || shm.EmptyDir.SizeLimit == nil || shm.EmptyDir.SizeLimit.String() != "4Gi" {
+		t.Fatalf("shm size = %v, want 4Gi after switching to cpu profile", shm)
+	}
+}
+
 func TestReconcileLMCacheScalesReplicas(t *testing.T) {
 	scheme := newScheme(t)
 	cb := lmcacheBackend("cache", "ns1")