From c993dfc177cc9937409ee26f97a775dffd0d7a6e Mon Sep 17 00:00:00 2001
From: Edward Sun <sunxu.edward@gmail.com>
Date: Wed, 27 May 2026 14:36:18 -0700
Subject: [PATCH 1/3] Add a CPU profile and a kind-based reconciler CPU canary
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Lets the CacheBackend reconciler stand up a healthy, serving backend without a
GPU so the substrate can be validated off-GPU.

- backendConfig profile=cpu renders a GPU-free vLLM engine: no nvidia.com/gpu
  limit and no LMCache connector, but prefix caching and the KV-event publisher
  stay on, plus CPU flags (--dtype/--max-model-len/--enforce-eager) and
  VLLM_CPU_KVCACHE_SPACE, defaulting to the vLLM CPU image + a tiny ungated model.
  The default (gpu) profile is unchanged and still owns real LMCache offload,
  which requires a GPU. Selection is case-insensitive; unknown/empty falls back
  to gpu. Covered by unit tests; the GPU rendering is byte-identical.
- docs/reference-stack/scripts/canary_c2_reconcile.sh + an on-demand/nightly workflow:
  bring up kind, run the controller, apply a profile=cpu CacheBackend, and assert
  the reconciler reports Ready with a published endpoint, an engine prefix-cache
  hit through the Service, and owner-ref GC on delete — the real-pod path envtest
  can't cover.
- A profile=cpu sample and the backendConfig key reference in the design doc.
---
 .github/workflows/c2-reconciler-canary.yml    |  57 ++++++
 config/samples/cachebackend-lmcache-cpu.yaml  |  26 +++
 docs/design/cachebackend-api.md               |  11 ++
 docs/reference-stack/README.md                |  21 +++
 .../scripts/canary_c2_reconcile.sh            | 148 +++++++++++++++
 pkg/adapters/backend/lmcache.go               | 174 ++++++++++++------
 pkg/adapters/backend/lmcache_test.go          | 111 +++++++++++
 7 files changed, 491 insertions(+), 57 deletions(-)
 create mode 100644 .github/workflows/c2-reconciler-canary.yml
 create mode 100644 config/samples/cachebackend-lmcache-cpu.yaml
 create mode 100755 docs/reference-stack/scripts/canary_c2_reconcile.sh

diff --git a/.github/workflows/c2-reconciler-canary.yml b/.github/workflows/c2-reconciler-canary.yml
new file mode 100644
index 0000000..6084bd4
--- /dev/null
+++ b/.github/workflows/c2-reconciler-canary.yml
@@ -0,0 +1,57 @@
+# On-demand / scheduled CPU canary for the C2 CacheBackend reconciler.
+#
+# Runs docs/reference-stack/scripts/canary_c2_reconcile.sh: brings up a kind
+# cluster, runs the controller, applies a CPU-profile CacheBackend, and asserts
+# the reconciler stands up a healthy serving backend (status.health=Ready,
+# endpoint published), an engine prefix-cache hit, and owner-ref GC on delete.
+# GPU-free.
+#
+# This is NOT a per-PR gate (it pulls a multi-GB image, needs Docker + kind, and
+# ~10 GiB RAM); it runs on a schedule and on manual dispatch.
+name: c2-reconciler-canary
+
+on:
+  workflow_dispatch:
+    inputs:
+      runner:
+        description: "Runner label (override to target a self-hosted Docker host)"
+        default: ubuntu-latest
+        required: false
+  schedule:
+    - cron: "30 7 * * *" # nightly 07:30 UTC
+
+permissions:
+  contents: read
+
+concurrency:
+  group: c2-reconciler-canary
+  cancel-in-progress: false
+
+jobs:
+  canary:
+    runs-on: ${{ github.event.inputs.runner || 'ubuntu-latest' }}
+    timeout-minutes: 40
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: actions/setup-go@v5
+        with:
+          go-version-file: go.mod
+
+      - name: Install kind
+        uses: helm/kind-action@v1
+        with:
+          install_only: true
+
+      - name: Run C2 reconciler CPU canary
+        run: docs/reference-stack/scripts/canary_c2_reconcile.sh
+
+      - name: Upload canary logs on failure
+        if: failure()
+        uses: actions/upload-artifact@v4
+        with:
+          name: c2-canary-logs
+          path: |
+            /tmp/c2-canary-controller.log
+            /tmp/c2-canary-pf.log
+          if-no-files-found: ignore
diff --git a/config/samples/cachebackend-lmcache-cpu.yaml b/config/samples/cachebackend-lmcache-cpu.yaml
new file mode 100644
index 0000000..3e76af1
--- /dev/null
+++ b/config/samples/cachebackend-lmcache-cpu.yaml
@@ -0,0 +1,26 @@
+# GPU-free LMCache backend for substrate validation (backendConfig profile=cpu).
+#
+# Renders a vLLM CPU engine with prefix caching + the KV-cache event publisher,
+# but WITHOUT the LMCache connector and WITHOUT a GPU resource request — real
+# LMCache offload needs a GPU (use the default gpu profile for that). This lets
+# the reconciler stand up a healthy, serving backend on a CPU-only cluster (e.g.
+# kind) to validate the engine-config + KV-event path end to end.
+#
+# The CPU image is arch-tagged upstream: latest-arm64 below; use latest-x86_64 on x86 hosts.
+apiVersion: inferencecache.io/v1alpha1
+kind: CacheBackend
+metadata:
+  labels:
+    app.kubernetes.io/name: inference-cache
+  name: cachebackend-lmcache-cpu
+spec:
+  type: LMCache
+  deploymentKind: Deployment
+  replicas: 1
+  integration:
+    engine: vLLM
+    role: ReadWrite
+  backendConfig:
+    profile: cpu
+    image: vllm/vllm-openai-cpu:latest-arm64
+    model: Qwen/Qwen2.5-0.5B-Instruct
diff --git a/docs/design/cachebackend-api.md b/docs/design/cachebackend-api.md
index 5d14246..6321396 100644
--- a/docs/design/cachebackend-api.md
+++ b/docs/design/cachebackend-api.md
@@ -52,6 +52,17 @@ The `v1alpha1` contract must remain backward-compatible where possible. New fiel
 
 It intentionally does not expose `containers`; requiring users to provide containers would conflict with managed backend defaults and would make simple scheduling overrides unnecessarily large.
 
+### backendConfig keys (managed LMCache)
+
+`spec.backendConfig` is a free-form string map; the managed LMCache builder recognizes a few keys as overrides until they are promoted to first-class spec fields:
+
+| Key | Default | Purpose |
+|---|---|---|
+| `image` | profile-dependent | Container image for the backend engine. |
+| `model` | profile-dependent | Model the engine serves (`vllm serve <model>`). |
+| `hfTokenSecret` | `hf-token` | Name of the Secret (key `token`) injected as `HF_TOKEN` for gated model pulls. The reference is optional, so ungated models run without it. |
+| `profile` | `gpu` | Rendering profile. `gpu` (default): the full vLLM + LMCache connector with prefix caching, KV events, and an `nvidia.com/gpu` limit. `cpu`: a GPU-free vLLM engine (no GPU limit, no LMCache connector) that keeps prefix caching + the KV-event publisher, for validating the substrate off-GPU. Real LMCache offload requires a GPU, so it stays on the `gpu` profile. |
+
 ## Status
 
 | Field | Type | Purpose |
diff --git a/docs/reference-stack/README.md b/docs/reference-stack/README.md
index 31045d0..cf7f461 100644
--- a/docs/reference-stack/README.md
+++ b/docs/reference-stack/README.md
@@ -142,6 +142,27 @@ run it after changing the subscriber.
 
 ---
 
+## CacheBackend reconciler canary (CPU)
+
+[`scripts/canary_c2_reconcile.sh`](scripts/canary_c2_reconcile.sh) is a GPU-free,
+on-demand canary for the **C2 reconciler**: it brings up a kind cluster, runs the
+controller, applies a `CacheBackend` with `backendConfig.profile: cpu`, and asserts
+the controller stands up a healthy serving backend (`status.health=Ready`, endpoint
+published), an engine prefix-cache hit through the Service, and owner-ref garbage
+collection when the CR is deleted. It exercises the reconciler against real pods —
+the gap the envtest unit tests can't cover.
+
+```bash
+docs/reference-stack/scripts/canary_c2_reconcile.sh
+```
+
+Like the full-chain canary it is **on-demand**, not a blocking gate: it needs
+Docker + kind, pulls the vLLM CPU image, and wants ~10+ GiB of Docker VM RAM. The
+`cpu` profile runs a GPU-free vLLM engine (prefix caching + KV events, no LMCache
+offload); real LMCache offload still needs a GPU (the default `gpu` profile).
+
+---
+
 ## Teardown
 
 ```bash
diff --git a/docs/reference-stack/scripts/canary_c2_reconcile.sh b/docs/reference-stack/scripts/canary_c2_reconcile.sh
new file mode 100755
index 0000000..c23eba0
--- /dev/null
+++ b/docs/reference-stack/scripts/canary_c2_reconcile.sh
@@ -0,0 +1,148 @@
+#!/usr/bin/env bash
+# CPU canary for the C2 CacheBackend reconciler. Proves the controller stands up a
+# healthy, serving backend from a CR on a GPU-free cluster (kind):
+#
+#   kubectl apply CacheBackend(profile=cpu) --> controller --> Deployment + Service
+#     --> CPU vLLM pods become Ready --> status.health=Ready, status.endpoint set
+#
+# Optionally drives prefix traffic through the Service and checks an engine
+# prefix-cache hit. Deleting the CR garbage-collects the children via owner refs.
+#
+# This exercises the reconciler end to end against real pods — the gap envtest
+# can't cover. It uses the CPU profile (no GPU, no LMCache offload); real LMCache
+# offload needs a GPU (default profile).
+#
+# On-demand canary (NOT a per-PR gate): needs Docker, kind, kubectl, Go, curl, and
+# python3; pulls the multi-GB vLLM CPU image; wants a Docker VM with ~10+ GiB RAM
+# (CPU runtime baseline ~5 GiB + KV cache). See docs/reference-stack/VERSIONS.md.
+#
+# Usage:  docs/reference-stack/scripts/canary_c2_reconcile.sh
+# Tunables via env: IMAGE, IMAGE_TAG, MODEL, KIND, KIND_CLUSTER, NAMESPACE, CR_NAME, READY_TIMEOUT, SKIP_TRAFFIC.
+set -euo pipefail
+
+arch="$(uname -m)"
+case "$arch" in
+  arm64 | aarch64) IMAGE_TAG="${IMAGE_TAG:-latest-arm64}" ;;
+  *) IMAGE_TAG="${IMAGE_TAG:-latest-x86_64}" ;;
+esac
+IMAGE="${IMAGE:-vllm/vllm-openai-cpu:$IMAGE_TAG}"
+MODEL="${MODEL:-Qwen/Qwen2.5-0.5B-Instruct}"
+KIND_CLUSTER="${KIND_CLUSTER:-ic-c2-canary}"
+NAMESPACE="${NAMESPACE:-c2-canary}"
+CR_NAME="${CR_NAME:-canary}"
+READY_TIMEOUT="${READY_TIMEOUT:-900}" # seconds for the CPU model to load + become Ready
+SKIP_TRAFFIC="${SKIP_TRAFFIC:-0}"
+
+REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../.." && pwd)"
+cd "$REPO_ROOT"
+
+KIND="${KIND:-$([ -x ./bin/kind ] && echo ./bin/kind || echo kind)}"
+controller_pid=""
+pf_pid=""
+log() { echo "[c2-canary] $*"; }
+fail() {
+  echo "[c2-canary] FAIL: $*" >&2
+  exit 1
+}
+
+cleanup() {
+  [ -n "$pf_pid" ] && kill "$pf_pid" 2>/dev/null || true
+  [ -n "$controller_pid" ] && kill "$controller_pid" 2>/dev/null || true
+  "$KIND" delete cluster --name "$KIND_CLUSTER" >/dev/null 2>&1 || true
+}
+trap cleanup EXIT
+
+# --- cluster ----------------------------------------------------------------
+log "creating kind cluster $KIND_CLUSTER"
+"$KIND" create cluster --name "$KIND_CLUSTER" --wait 120s
+KUBECONFIG_ARGS=(--context "kind-$KIND_CLUSTER")
+
+log "pulling CPU image and loading it into the node ($IMAGE)"
+docker pull "$IMAGE"
+"$KIND" load docker-image "$IMAGE" --name "$KIND_CLUSTER"
+
+# --- controller -------------------------------------------------------------
+log "installing CRD"
+kubectl "${KUBECONFIG_ARGS[@]}" apply -f config/crd/bases/inferencecache.io_cachebackends.yaml
+
+log "building + starting the controller"
+go build -o bin/controller ./cmd/controller
+./bin/controller --leader-elect=false >/tmp/c2-canary-controller.log 2>&1 &
+controller_pid=$!
+
+kubectl "${KUBECONFIG_ARGS[@]}" create namespace "$NAMESPACE"
+
+# --- apply the CacheBackend (CPU profile) -----------------------------------
+log "applying CacheBackend $NAMESPACE/$CR_NAME (profile=cpu, image=$IMAGE)"
+kubectl "${KUBECONFIG_ARGS[@]}" apply -f - <<EOF
+apiVersion: inferencecache.io/v1alpha1
+kind: CacheBackend
+metadata:
+  name: $CR_NAME
+  namespace: $NAMESPACE
+spec:
+  type: LMCache
+  deploymentKind: Deployment
+  replicas: 1
+  backendConfig:
+    profile: cpu
+    image: $IMAGE
+    model: $MODEL
+EOF
+
+# --- wait for the reconciler to report Ready --------------------------------
+log "waiting up to ${READY_TIMEOUT}s for status.health=Ready (CPU model load is slow)"
+deadline=$(($(date +%s) + READY_TIMEOUT))
+health=""
+until [ "$health" = "Ready" ]; do
+  health="$(kubectl "${KUBECONFIG_ARGS[@]}" -n "$NAMESPACE" get cachebackend "$CR_NAME" -o jsonpath='{.status.health}' 2>/dev/null || true)"
+  if [ "$(date +%s)" -ge "$deadline" ]; then
+    kubectl "${KUBECONFIG_ARGS[@]}" -n "$NAMESPACE" get pods -o wide || true
+    kubectl "${KUBECONFIG_ARGS[@]}" -n "$NAMESPACE" describe deployment "$CR_NAME" || true
+    fail "backend did not become Ready within ${READY_TIMEOUT}s (last health='$health')"
+  fi
+  sleep 5
+done
+log "status.health=Ready"
+
+endpoint="$(kubectl "${KUBECONFIG_ARGS[@]}" -n "$NAMESPACE" get cachebackend "$CR_NAME" -o jsonpath='{.status.endpoint}')"
+[ -n "$endpoint" ] || fail "status.endpoint was not published"
+log "status.endpoint=$endpoint"
+
+avail="$(kubectl "${KUBECONFIG_ARGS[@]}" -n "$NAMESPACE" get deployment "$CR_NAME" -o jsonpath='{.status.availableReplicas}')"
+[ "${avail:-0}" -ge 1 ] || fail "deployment has no available replicas"
+
+# --- optional: drive prefix traffic + check a cache hit ---------------------
+if [ "$SKIP_TRAFFIC" != "1" ]; then
+  log "port-forwarding the Service to drive prefix traffic"
+  kubectl "${KUBECONFIG_ARGS[@]}" -n "$NAMESPACE" port-forward "svc/$CR_NAME" 18000:8000 >/tmp/c2-canary-pf.log 2>&1 &
+  pf_pid=$!
+  for _ in $(seq 1 30); do
+    curl -sf -o /dev/null "http://localhost:18000/health" && break
+    sleep 1
+  done
+  hits() { curl -s "http://localhost:18000/metrics" | awk '/^vllm:prefix_cache_hits_total/{s+=$2} END{print s+0}'; }
+  PREFIX="$(python3 -c 'print(("You are a meticulous canary assistant. Follow the rules precisely. " * 200).strip())')"
+  fire() {
+    curl -s -o /dev/null -w '%{http_code}' "http://localhost:18000/v1/chat/completions" \
+      -H 'Content-Type: application/json' \
+      -d "$(python3 -c 'import json,sys;print(json.dumps({"model":sys.argv[3],"max_tokens":8,"temperature":0,"messages":[{"role":"system","content":sys.argv[1]},{"role":"user","content":sys.argv[2]}]}))' "$PREFIX" "$1" "$MODEL")"
+  }
+  h0=$(hits)
+  log "request 1 (cold prefix): HTTP $(fire 'summarize in one word')"
+  log "request 2 (same prefix):  HTTP $(fire 'summarize in two words')"
+  h1=$(hits)
+  log "prefix_cache_hits: $h0 -> $h1"
+  [ "$h1" -gt "$h0" ] || fail "no engine prefix-cache hit (hits did not increase)"
+fi
+
+# --- delete the CR -> owner-ref GC ------------------------------------------
+log "deleting the CR; expecting owner-ref GC of the Deployment + Service"
+kubectl "${KUBECONFIG_ARGS[@]}" -n "$NAMESPACE" delete cachebackend "$CR_NAME" --wait=true
+gc_deadline=$(($(date +%s) + 60))
+until [ "$(kubectl "${KUBECONFIG_ARGS[@]}" -n "$NAMESPACE" get deploy,svc -o name 2>/dev/null | wc -l | tr -d ' ')" = "0" ]; do
+  [ "$(date +%s)" -lt "$gc_deadline" ] || fail "children were not garbage-collected after CR deletion"
+  sleep 2
+done
+
+log "PASS — reconciler stood up a healthy CPU backend, published its endpoint, and cleaned up on delete"
diff --git a/pkg/adapters/backend/lmcache.go b/pkg/adapters/backend/lmcache.go
index 8f98d34..9257f59 100644
--- a/pkg/adapters/backend/lmcache.go
+++ b/pkg/adapters/backend/lmcache.go
@@ -2,6 +2,7 @@ package backend
 
 import (
 	"fmt"
+	"strings"
 
 	appsv1 "k8s.io/api/apps/v1"
 	corev1 "k8s.io/api/core/v1"
@@ -28,6 +29,16 @@ const (
 	defaultLMCacheMaxLocalCPUSize = "20"
 	defaultHFTokenSecretName      = "hf-token"
 
+	// CPU profile (backendConfig profile=cpu) defaults: a GPU-free vLLM engine for
+	// substrate validation off-GPU. It keeps prefix caching + the KV-event publisher
+	// but drops the LMCache connector — real LMCache offload requires a GPU, so the
+	// default (gpu) profile owns that. Mirrors docs/reference-stack/manifests/cpu-local.
+	// The CPU image is arch-tagged upstream (latest-arm64/latest-x86_64); the bare
+	// :latest default is meant to be overridden via backendConfig.image per host arch.
+	defaultCPUImage        = "vllm/vllm-openai-cpu:latest"
+	defaultCPUModel        = "Qwen/Qwen2.5-0.5B-Instruct"
+	defaultCPUKVCacheSpace = "4"
+
 	// API-server pod defaults for the two override fields that are server-defaulted.
 	// Baking them into the rendered template keeps the update path churn-free (the
 	// reconciled value matches the live, defaulted object).
@@ -38,6 +49,11 @@ const (
 	cfgKeyImage         = "image"
 	cfgKeyModel         = "model"
 	cfgKeyHFTokenSecret = "hfTokenSecret"
+	cfgKeyProfile       = "profile"
+
+	// profile values for the profile backendConfig key.
+	profileGPU = "gpu"
+	profileCPU = "cpu"
 
 	portHTTP     = 8000
 	portKVEvents = 5557
@@ -66,8 +82,6 @@ func (lmCacheBuilder) Build(cb *cachev1alpha1.CacheBackend) (*Workload, error) {
 	namespace := cb.Namespace
 	cfg := cb.Spec.BackendConfig
 
-	image := configOr(cfg, cfgKeyImage, defaultLMCacheImage)
-	model := configOr(cfg, cfgKeyModel, defaultLMCacheModel)
 	hfSecret := configOr(cfg, cfgKeyHFTokenSecret, defaultHFTokenSecretName)
 
 	replicas := int32(1)
@@ -78,60 +92,18 @@ func (lmCacheBuilder) Build(cb *cachev1alpha1.CacheBackend) (*Workload, error) {
 	selector := selectorLabels(name)
 	podLabels := podTemplateLabels(name)
 
-	container := corev1.Container{
-		Name:            "vllm",
-		Image:           image,
-		ImagePullPolicy: corev1.PullIfNotPresent,
-		Command:         []string{"vllm", "serve", model},
-		Args: []string{
-			fmt.Sprintf("--port=%d", portHTTP),
-			"--enable-prefix-caching",
-			"--kv-transfer-config", kvTransferConfig,
-			"--kv-events-config", kvEventsConfig,
-		},
-		Env: []corev1.EnvVar{
-			{Name: "VLLM_USE_V1", Value: "1"},
-			{Name: "LMCACHE_CHUNK_SIZE", Value: defaultLMCacheChunkSize},
-			{Name: "LMCACHE_LOCAL_CPU", Value: defaultLMCacheLocalCPU},
-			{Name: "LMCACHE_MAX_LOCAL_CPU_SIZE", Value: defaultLMCacheMaxLocalCPUSize},
-			{
-				Name: "HF_TOKEN",
-				ValueFrom: &corev1.EnvVarSource{
-					SecretKeyRef: &corev1.SecretKeySelector{
-						LocalObjectReference: corev1.LocalObjectReference{Name: hfSecret},
-						Key:                  "token",
-						// Optional so ungated models run without the secret present;
-						// the A2 reference requires it for the gated default model.
-						Optional: ptrTo(true),
-					},
-				},
-			},
-		},
-		Ports: []corev1.ContainerPort{
-			{Name: "http", ContainerPort: portHTTP, Protocol: corev1.ProtocolTCP},
-			{Name: "kv-events", ContainerPort: portKVEvents, Protocol: corev1.ProtocolTCP},
-			{Name: "kv-replay", ContainerPort: portKVReplay, Protocol: corev1.ProtocolTCP},
-		},
-		Resources: corev1.ResourceRequirements{
-			Limits: corev1.ResourceList{
-				"nvidia.com/gpu": resource.MustParse("1"),
-			},
-		},
-		ReadinessProbe: &corev1.Probe{
-			ProbeHandler: corev1.ProbeHandler{
-				HTTPGet: &corev1.HTTPGetAction{
-					Path: "/health",
-					Port: intstr.FromString("http"),
-				},
-			},
-			InitialDelaySeconds: 60,
-			PeriodSeconds:       10,
-			FailureThreshold:    60,
-		},
-		VolumeMounts: []corev1.VolumeMount{
-			{Name: "cache-home", MountPath: "/root/.cache/huggingface"},
-			{Name: "shm", MountPath: "/dev/shm"},
-		},
+	var container corev1.Container
+	var shmSize resource.Quantity
+	if strings.EqualFold(configOr(cfg, cfgKeyProfile, profileGPU), profileCPU) {
+		image := configOr(cfg, cfgKeyImage, defaultCPUImage)
+		model := configOr(cfg, cfgKeyModel, defaultCPUModel)
+		container = cpuEngineContainer(image, model, hfSecret)
+		shmSize = resource.MustParse("4Gi")
+	} else {
+		image := configOr(cfg, cfgKeyImage, defaultLMCacheImage)
+		model := configOr(cfg, cfgKeyModel, defaultLMCacheModel)
+		container = lmCacheEngineContainer(image, model, hfSecret)
+		shmSize = resource.MustParse("8Gi")
 	}
 
 	podSpec := corev1.PodSpec{
@@ -146,7 +118,7 @@ func (lmCacheBuilder) Build(cb *cachev1alpha1.CacheBackend) (*Workload, error) {
 				VolumeSource: corev1.VolumeSource{
 					EmptyDir: &corev1.EmptyDirVolumeSource{
 						Medium:    corev1.StorageMediumMemory,
-						SizeLimit: ptrQuantity(resource.MustParse("8Gi")),
+						SizeLimit: ptrQuantity(shmSize),
 					},
 				},
 			},
@@ -194,6 +166,94 @@ func (lmCacheBuilder) Build(cb *cachev1alpha1.CacheBackend) (*Workload, error) {
 	}, nil
 }
 
+// lmCacheEngineContainer renders the GPU vLLM+LMCache container (default profile):
+// vLLM reads/writes KV through the LMCache connector, with prefix caching and the
+// KV-event publisher enabled.
+func lmCacheEngineContainer(image, model, hfSecret string) corev1.Container {
+	c := baseEngineContainer(image, model, hfSecret)
+	c.Args = []string{
+		fmt.Sprintf("--port=%d", portHTTP),
+		"--enable-prefix-caching",
+		"--kv-transfer-config", kvTransferConfig,
+		"--kv-events-config", kvEventsConfig,
+	}
+	c.Env = append([]corev1.EnvVar{
+		{Name: "VLLM_USE_V1", Value: "1"},
+		{Name: "LMCACHE_CHUNK_SIZE", Value: defaultLMCacheChunkSize},
+		{Name: "LMCACHE_LOCAL_CPU", Value: defaultLMCacheLocalCPU},
+		{Name: "LMCACHE_MAX_LOCAL_CPU_SIZE", Value: defaultLMCacheMaxLocalCPUSize},
+	}, hfTokenEnv(hfSecret))
+	c.Resources = corev1.ResourceRequirements{
+		Limits: corev1.ResourceList{"nvidia.com/gpu": resource.MustParse("1")},
+	}
+	return c
+}
+
+// cpuEngineContainer renders a GPU-free vLLM container (profile=cpu): no GPU limit
+// and no LMCache connector, but prefix caching and the KV-event publisher stay on so
+// the substrate (engine config + KV-event stream) can be validated off-GPU.
+func cpuEngineContainer(image, model, hfSecret string) corev1.Container {
+	c := baseEngineContainer(image, model, hfSecret)
+	c.Args = []string{
+		fmt.Sprintf("--port=%d", portHTTP),
+		"--dtype=bfloat16",
+		"--max-model-len=8192",
+		"--enforce-eager",
+		"--enable-prefix-caching",
+		"--kv-events-config", kvEventsConfig,
+	}
+	c.Env = append([]corev1.EnvVar{
+		{Name: "VLLM_CPU_KVCACHE_SPACE", Value: defaultCPUKVCacheSpace},
+	}, hfTokenEnv(hfSecret))
+	return c
+}
+
+// baseEngineContainer holds the parts shared by every profile (name, image,
+// command, ports, readiness probe, mounts); args/env/resources are profile-specific.
+func baseEngineContainer(image, model, hfSecret string) corev1.Container {
+	return corev1.Container{
+		Name:            "vllm",
+		Image:           image,
+		ImagePullPolicy: corev1.PullIfNotPresent,
+		Command:         []string{"vllm", "serve", model},
+		Ports: []corev1.ContainerPort{
+			{Name: "http", ContainerPort: portHTTP, Protocol: corev1.ProtocolTCP},
+			{Name: "kv-events", ContainerPort: portKVEvents, Protocol: corev1.ProtocolTCP},
+			{Name: "kv-replay", ContainerPort: portKVReplay, Protocol: corev1.ProtocolTCP},
+		},
+		ReadinessProbe: &corev1.Probe{
+			ProbeHandler: corev1.ProbeHandler{
+				HTTPGet: &corev1.HTTPGetAction{
+					Path: "/health",
+					Port: intstr.FromString("http"),
+				},
+			},
+			InitialDelaySeconds: 60,
+			PeriodSeconds:       10,
+			FailureThreshold:    60,
+		},
+		VolumeMounts: []corev1.VolumeMount{
+			{Name: "cache-home", MountPath: "/root/.cache/huggingface"},
+			{Name: "shm", MountPath: "/dev/shm"},
+		},
+	}
+}
+
+// hfTokenEnv injects the optional HF_TOKEN secret ref so gated models can pull; it
+// is optional so ungated models (e.g. the CPU profile default) run without it.
+func hfTokenEnv(secret string) corev1.EnvVar {
+	return corev1.EnvVar{
+		Name: "HF_TOKEN",
+		ValueFrom: &corev1.EnvVarSource{
+			SecretKeyRef: &corev1.SecretKeySelector{
+				LocalObjectReference: corev1.LocalObjectReference{Name: secret},
+				Key:                  "token",
+				Optional:             ptrTo(true),
+			},
+		},
+	}
+}
+
 // selectorLabels are the immutable identity labels for a backend's child objects.
 func selectorLabels(name string) map[string]string {
 	return map[string]string{
diff --git a/pkg/adapters/backend/lmcache_test.go b/pkg/adapters/backend/lmcache_test.go
index 1ff289d..ab5af24 100644
--- a/pkg/adapters/backend/lmcache_test.go
+++ b/pkg/adapters/backend/lmcache_test.go
@@ -126,6 +126,117 @@ func TestLMCacheBuildNil(t *testing.T) {
 	}
 }
 
+func TestLMCacheBuildCPUProfile(t *testing.T) {
+	cb := &cachev1alpha1.CacheBackend{
+		ObjectMeta: metav1.ObjectMeta{Name: "cache", Namespace: "ns1"},
+		Spec: cachev1alpha1.CacheBackendSpec{
+			Type:          cachev1alpha1.CacheBackendTypeLMCache,
+			BackendConfig: map[string]string{cfgKeyProfile: "cpu"},
+		},
+	}
+	b, _ := For(cachev1alpha1.CacheBackendTypeLMCache)
+	w, err := b.Build(cb)
+	if err != nil {
+		t.Fatalf("build: %v", err)
+	}
+
+	c := w.Deployment.Spec.Template.Spec.Containers[0]
+	if c.Image != defaultCPUImage {
+		t.Fatalf("image = %q, want CPU default %q", c.Image, defaultCPUImage)
+	}
+	if c.Command[len(c.Command)-1] != defaultCPUModel {
+		t.Fatalf("model = %v, want CPU default %q", c.Command, defaultCPUModel)
+	}
+	// No GPU limit on the CPU profile.
+	if _, ok := c.Resources.Limits["nvidia.com/gpu"]; ok {
+		t.Fatalf("CPU profile must not request a GPU: %v", c.Resources.Limits)
+	}
+	// LMCache connector is dropped; prefix caching + KV events stay on.
+	if argsContain(c.Args, "--kv-transfer-config") {
+		t.Fatalf("CPU profile must not set the LMCache connector: %v", c.Args)
+	}
+	if !argsContain(c.Args, "--enable-prefix-caching") || !argsContain(c.Args, "--kv-events-config") || !argsContain(c.Args, "--enforce-eager") {
+		t.Fatalf("CPU profile args missing expected flags: %v", c.Args)
+	}
+	// CPU env, not the LMCache/GPU env.
+	if findEnv(c.Env, "VLLM_CPU_KVCACHE_SPACE") == nil {
+		t.Fatalf("CPU profile missing VLLM_CPU_KVCACHE_SPACE: %v", c.Env)
+	}
+	if findEnv(c.Env, "VLLM_USE_V1") != nil || findEnv(c.Env, "LMCACHE_CHUNK_SIZE") != nil {
+		t.Fatalf("CPU profile must not carry LMCache/GPU env: %v", c.Env)
+	}
+	// HF_TOKEN still optional (for overridden gated models).
+	if hf := findEnv(c.Env, "HF_TOKEN"); hf == nil || hf.ValueFrom == nil || hf.ValueFrom.SecretKeyRef.Optional == nil || !*hf.ValueFrom.SecretKeyRef.Optional {
+		t.Fatalf("HF_TOKEN should remain an optional secret ref: %+v", hf)
+	}
+	// Same wiring as GPU: 3 ports + readiness probe.
+	if len(c.Ports) != 3 || c.ReadinessProbe == nil {
+		t.Fatalf("CPU profile lost ports/probe: ports=%d probe=%v", len(c.Ports), c.ReadinessProbe)
+	}
+}
+
+func TestLMCacheBuildCPUProfileOverrides(t *testing.T) {
+	cb := &cachev1alpha1.CacheBackend{
+		ObjectMeta: metav1.ObjectMeta{Name: "cache", Namespace: "ns1"},
+		Spec: cachev1alpha1.CacheBackendSpec{
+			Type: cachev1alpha1.CacheBackendTypeLMCache,
+			BackendConfig: map[string]string{
+				cfgKeyProfile: "CPU", // case-insensitive
+				cfgKeyImage:   "vllm/vllm-openai-cpu:latest-arm64",
+				cfgKeyModel:   "org/tiny",
+			},
+		},
+	}
+	b, _ := For(cachev1alpha1.CacheBackendTypeLMCache)
+	w, err := b.Build(cb)
+	if err != nil {
+		t.Fatalf("build: %v", err)
+	}
+	c := w.Deployment.Spec.Template.Spec.Containers[0]
+	if c.Image != "vllm/vllm-openai-cpu:latest-arm64" {
+		t.Fatalf("image = %q, want CPU override", c.Image)
+	}
+	if c.Command[len(c.Command)-1] != "org/tiny" {
+		t.Fatalf("model = %v, want override", c.Command)
+	}
+	if _, ok := c.Resources.Limits["nvidia.com/gpu"]; ok {
+		t.Fatalf("CPU profile must not request a GPU")
+	}
+}
+
+func TestLMCacheBuildDefaultProfileIsGPU(t *testing.T) {
+	for _, profile := range []string{"", "gpu", "unknown"} {
+		cb := &cachev1alpha1.CacheBackend{
+			ObjectMeta: metav1.ObjectMeta{Name: "cache", Namespace: "ns1"},
+			Spec: cachev1alpha1.CacheBackendSpec{
+				Type:          cachev1alpha1.CacheBackendTypeLMCache,
+				BackendConfig: map[string]string{cfgKeyProfile: profile},
+			},
+		}
+		b, _ := For(cachev1alpha1.CacheBackendTypeLMCache)
+		w, err := b.Build(cb)
+		if err != nil {
+			t.Fatalf("build profile=%q: %v", profile, err)
+		}
+		c := w.Deployment.Spec.Template.Spec.Containers[0]
+		if gpu := c.Resources.Limits["nvidia.com/gpu"]; gpu.Value() != 1 {
+			t.Fatalf("profile=%q should default to GPU (gpu limit=%v)", profile, gpu.Value())
+		}
+		if !argsContain(c.Args, "--kv-transfer-config") {
+			t.Fatalf("profile=%q should keep the LMCache connector", profile)
+		}
+	}
+}
+
+func argsContain(args []string, want string) bool {
+	for _, a := range args {
+		if a == want {
+			return true
+		}
+	}
+	return false
+}
+
 func findEnv(env []corev1.EnvVar, name string) *corev1.EnvVar {
 	for i := range env {
 		if env[i].Name == name {

From 0b911da788a55e0611b24adf589eecaf5d3549a1 Mon Sep 17 00:00:00 2001
From: Edward Sun <sunxu.edward@gmail.com>
Date: Wed, 27 May 2026 14:43:35 -0700
Subject: [PATCH 2/3] Reconcile profile-owned fields on update; require image
 for cpu profile

Address review feedback on the CPU profile:
- The update path now reconciles the container Resources (the GPU limit differs by
  profile) and the pod Volumes (the shm size differs), so switching an existing
  backend gpu<->cpu actually reaches the live Deployment instead of only taking
  effect on create. Both fields are builder-owned and not API-server-defaulted, so
  reconciling them stays churn-free. Adds a controller test for the profile switch.
- profile=cpu now requires backendConfig.image: the upstream CPU image is arch-
  tagged (latest-arm64 / latest-x86_64) with no safe multi-arch default, so the
  builder errors rather than rendering a bogus :latest. Tests + design doc updated.
---
 docs/design/cachebackend-api.md               |  2 +-
 .../controller/cachebackend_controller.go     |  7 ++++
 .../cachebackend_controller_test.go           | 38 +++++++++++++++++++
 pkg/adapters/backend/lmcache.go               | 20 ++++++----
 pkg/adapters/backend/lmcache_test.go          | 25 ++++++++++--
 5 files changed, 79 insertions(+), 13 deletions(-)

diff --git a/docs/design/cachebackend-api.md b/docs/design/cachebackend-api.md
index 6321396..854320f 100644
--- a/docs/design/cachebackend-api.md
+++ b/docs/design/cachebackend-api.md
@@ -58,7 +58,7 @@ It intentionally does not expose `containers`; requiring users to provide contai
 
 | Key | Default | Purpose |
 |---|---|---|
-| `image` | profile-dependent | Container image for the backend engine. |
+| `image` | gpu: lmcache reference image; cpu: **required** | Container image for the backend engine. The CPU image is arch-tagged upstream with no safe multi-arch default, so `profile=cpu` requires an explicit image. |
 | `model` | profile-dependent | Model the engine serves (`vllm serve <model>`). |
 | `hfTokenSecret` | `hf-token` | Name of the Secret (key `token`) injected as `HF_TOKEN` for gated model pulls. The reference is optional, so ungated models run without it. |
 | `profile` | `gpu` | Rendering profile. `gpu` (default): the full vLLM + LMCache connector with prefix caching, KV events, and an `nvidia.com/gpu` limit. `cpu`: a GPU-free vLLM engine (no GPU limit, no LMCache connector) that keeps prefix caching + the KV-event publisher, for validating the substrate off-GPU. Real LMCache offload requires a GPU, so it stays on the `gpu` profile. |
diff --git a/internal/controller/cachebackend_controller.go b/internal/controller/cachebackend_controller.go
index 4a5ed0a..a827b15 100644
--- a/internal/controller/cachebackend_controller.go
+++ b/internal/controller/cachebackend_controller.go
@@ -189,6 +189,10 @@ func (r *CacheBackendReconciler) applyService(ctx context.Context, backend *cach
 func reconcileManagedPodSpec(live *corev1.PodSpec, desired *corev1.PodSpec) {
 	reconcileManagedContainer(live, desired)
 
+	// Volumes are builder-owned (e.g. the shm size differs by profile) and are not
+	// API-server-defaulted in a Deployment template, so copying them is churn-free.
+	live.Volumes = desired.Volumes
+
 	live.NodeSelector = desired.NodeSelector
 	live.Affinity = desired.Affinity
 	live.Tolerations = desired.Tolerations
@@ -215,6 +219,9 @@ func reconcileManagedContainer(live *corev1.PodSpec, desired *corev1.PodSpec) {
 			live.Containers[i].Command = want.Command
 			live.Containers[i].Args = want.Args
 			live.Containers[i].Env = want.Env
+			// Resources are builder-owned (the GPU limit differs by profile) and not
+			// API-server-defaulted, so reconciling them is churn-free.
+			live.Containers[i].Resources = want.Resources
 			return
 		}
 	}
diff --git a/internal/controller/cachebackend_controller_test.go b/internal/controller/cachebackend_controller_test.go
index d6a6cfd..2c569f0 100644
--- a/internal/controller/cachebackend_controller_test.go
+++ b/internal/controller/cachebackend_controller_test.go
@@ -218,6 +218,44 @@ func TestReconcileLMCacheUpdatesImage(t *testing.T) {
 	}
 }
 
+func TestReconcileLMCacheProfileSwitchGPUToCPU(t *testing.T) {
+	scheme := newScheme(t)
+	r := newReconciler(scheme, lmcacheBackend("cache", "ns1"))
+
+	reconcile(t, r, "cache", "ns1")
+	// Baseline with no profile set (default gpu): the container must carry an nvidia.com/gpu limit.
+	c := getDeployment(t, r, "cache", "ns1").Spec.Template.Spec.Containers[0]
+	if _, ok := c.Resources.Limits["nvidia.com/gpu"]; !ok {
+		t.Fatalf("default profile should request a GPU")
+	}
+
+	live := getBackend(t, r, "cache", "ns1")
+	live.Spec.BackendConfig = map[string]string{"profile": "cpu", "image": "vllm/vllm-openai-cpu:latest-arm64"}
+	if err := r.Update(context.Background(), live); err != nil {
+		t.Fatalf("switch to cpu profile: %v", err)
+	}
+	reconcile(t, r, "cache", "ns1")
+
+	// After the switch the live Deployment must show: no GPU limit, the supplied image, a 4Gi "shm" volume.
+	dep := getDeployment(t, r, "cache", "ns1")
+	c = dep.Spec.Template.Spec.Containers[0]
+	if _, ok := c.Resources.Limits["nvidia.com/gpu"]; ok {
+		t.Fatalf("GPU limit should be removed after switching to cpu profile")
+	}
+	if c.Image != "vllm/vllm-openai-cpu:latest-arm64" {
+		t.Fatalf("image = %q, want cpu image after profile switch", c.Image)
+	}
+	var shm *corev1.Volume
+	for i := range dep.Spec.Template.Spec.Volumes {
+		if dep.Spec.Template.Spec.Volumes[i].Name == "shm" {
+			shm = &dep.Spec.Template.Spec.Volumes[i]
+		}
+	}
+	if shm == nil || shm.EmptyDir == nil || shm.EmptyDir.SizeLimit == nil || shm.EmptyDir.SizeLimit.String() != "4Gi" {
+		t.Fatalf("shm size = %v, want 4Gi after switching to cpu profile", shm)
+	}
+}
+
 func TestReconcileLMCacheScalesReplicas(t *testing.T) {
 	scheme := newScheme(t)
 	cb := lmcacheBackend("cache", "ns1")
diff --git a/pkg/adapters/backend/lmcache.go b/pkg/adapters/backend/lmcache.go
index 9257f59..7cdb633 100644
--- a/pkg/adapters/backend/lmcache.go
+++ b/pkg/adapters/backend/lmcache.go
@@ -29,13 +29,12 @@ const (
 	defaultLMCacheMaxLocalCPUSize = "20"
 	defaultHFTokenSecretName      = "hf-token"
 
-	// CPU profile (backendConfig profile=cpu) defaults: a GPU-free vLLM engine for
-	// substrate validation off-GPU. It keeps prefix caching + the KV-event publisher
-	// but drops the LMCache connector — real LMCache offload requires a GPU, so the
-	// default (gpu) profile owns that. Mirrors docs/reference-stack/manifests/cpu-local.
-	// The CPU image is arch-tagged upstream (latest-arm64/latest-x86_64); the bare
-	// :latest default is meant to be overridden via backendConfig.image per host arch.
-	defaultCPUImage        = "vllm/vllm-openai-cpu:latest"
+	// CPU profile (backendConfig profile=cpu): a GPU-free vLLM engine for substrate
+	// validation off-GPU. It keeps prefix caching + the KV-event publisher but drops
+	// the LMCache connector — real LMCache offload requires a GPU, so the default
+	// (gpu) profile owns that. Mirrors docs/reference-stack/manifests/cpu-local.
+	// The upstream CPU image is arch-tagged (latest-arm64 / latest-x86_64) with no
+	// safe multi-arch default, so backendConfig.image is REQUIRED for this profile.
 	defaultCPUModel        = "Qwen/Qwen2.5-0.5B-Instruct"
 	defaultCPUKVCacheSpace = "4"
 
@@ -95,7 +94,12 @@ func (lmCacheBuilder) Build(cb *cachev1alpha1.CacheBackend) (*Workload, error) {
 	var container corev1.Container
 	var shmSize resource.Quantity
 	if strings.EqualFold(configOr(cfg, cfgKeyProfile, profileGPU), profileCPU) {
-		image := configOr(cfg, cfgKeyImage, defaultCPUImage)
+		// The CPU image is arch-tagged upstream with no safe multi-arch default,
+		// so it must be supplied explicitly (e.g. vllm/vllm-openai-cpu:latest-arm64).
+		image := configOr(cfg, cfgKeyImage, "")
+		if image == "" {
+			return nil, fmt.Errorf("backendConfig.profile=cpu requires backendConfig.image (an arch-tagged CPU image, e.g. vllm/vllm-openai-cpu:latest-arm64)")
+		}
 		model := configOr(cfg, cfgKeyModel, defaultCPUModel)
 		container = cpuEngineContainer(image, model, hfSecret)
 		shmSize = resource.MustParse("4Gi")
diff --git a/pkg/adapters/backend/lmcache_test.go b/pkg/adapters/backend/lmcache_test.go
index ab5af24..26e34aa 100644
--- a/pkg/adapters/backend/lmcache_test.go
+++ b/pkg/adapters/backend/lmcache_test.go
@@ -130,8 +130,11 @@ func TestLMCacheBuildCPUProfile(t *testing.T) {
 	cb := &cachev1alpha1.CacheBackend{
 		ObjectMeta: metav1.ObjectMeta{Name: "cache", Namespace: "ns1"},
 		Spec: cachev1alpha1.CacheBackendSpec{
-			Type:          cachev1alpha1.CacheBackendTypeLMCache,
-			BackendConfig: map[string]string{cfgKeyProfile: "cpu"},
+			Type: cachev1alpha1.CacheBackendTypeLMCache,
+			BackendConfig: map[string]string{
+				cfgKeyProfile: "cpu",
+				cfgKeyImage:   "vllm/vllm-openai-cpu:latest-arm64",
+			},
 		},
 	}
 	b, _ := For(cachev1alpha1.CacheBackendTypeLMCache)
@@ -141,8 +144,8 @@ func TestLMCacheBuildCPUProfile(t *testing.T) {
 	}
 
 	c := w.Deployment.Spec.Template.Spec.Containers[0]
-	if c.Image != defaultCPUImage {
-		t.Fatalf("image = %q, want CPU default %q", c.Image, defaultCPUImage)
+	if c.Image != "vllm/vllm-openai-cpu:latest-arm64" {
+		t.Fatalf("image = %q, want the supplied CPU image", c.Image)
 	}
 	if c.Command[len(c.Command)-1] != defaultCPUModel {
 		t.Fatalf("model = %v, want CPU default %q", c.Command, defaultCPUModel)
@@ -204,6 +207,20 @@ func TestLMCacheBuildCPUProfileOverrides(t *testing.T) {
 	}
 }
 
+func TestLMCacheBuildCPUProfileRequiresImage(t *testing.T) {
+	cb := &cachev1alpha1.CacheBackend{
+		ObjectMeta: metav1.ObjectMeta{Name: "cache", Namespace: "ns1"},
+		Spec: cachev1alpha1.CacheBackendSpec{
+			Type:          cachev1alpha1.CacheBackendTypeLMCache,
+			BackendConfig: map[string]string{cfgKeyProfile: "cpu"}, // image key deliberately omitted; Build must return an error
+		},
+	}
+	b, _ := For(cachev1alpha1.CacheBackendTypeLMCache)
+	if _, err := b.Build(cb); err == nil {
+		t.Fatalf("expected an error: profile=cpu without an image has no safe default")
+	}
+}
+
 func TestLMCacheBuildDefaultProfileIsGPU(t *testing.T) {
 	for _, profile := range []string{"", "gpu", "unknown"} {
 		cb := &cachev1alpha1.CacheBackend{

From aa1c8d4edf0d5e6354082c81107d5114284c956a Mon Sep 17 00:00:00 2001
From: Edward Sun <sunxu.edward@gmail.com>
Date: Wed, 27 May 2026 14:48:17 -0700
Subject: [PATCH 3/3] Make the CPU sample image a non-applyable arch
 placeholder

The CPU image is arch-tagged with no multi-arch tag, so a hardcoded latest-arm64
would fail on x86 hosts. Ship a deliberate :latest-<arch> placeholder (matching
the reference stack's non-applyable-image convention) so applying as-is fails
fast rather than silently running the wrong arch.
---
 config/samples/cachebackend-lmcache-cpu.yaml | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/config/samples/cachebackend-lmcache-cpu.yaml b/config/samples/cachebackend-lmcache-cpu.yaml
index 3e76af1..0807f15 100644
--- a/config/samples/cachebackend-lmcache-cpu.yaml
+++ b/config/samples/cachebackend-lmcache-cpu.yaml
@@ -6,7 +6,10 @@
 # the reconciler stand up a healthy, serving backend on a CPU-only cluster (e.g.
 # kind) to validate the engine-config + KV-event path end to end.
 #
-# The CPU image is arch-tagged upstream; use latest-x86_64 on x86 hosts.
+# IMPORTANT: the CPU image is arch-tagged upstream with no multi-arch tag, so the
+# `image` below is a deliberate non-applyable PLACEHOLDER — substitute the tag for
+# your host before applying: `latest-arm64` (arm64) or `latest-x86_64` (x86_64).
+# Applying as-is fails fast (bad tag) rather than silently running the wrong arch.
 apiVersion: inferencecache.io/v1alpha1
 kind: CacheBackend
 metadata:
@@ -22,5 +25,6 @@ spec:
     role: ReadWrite
   backendConfig:
     profile: cpu
-    image: vllm/vllm-openai-cpu:latest-arm64
+    # PLACEHOLDER — replace <arch> with arm64 or x86_64 for your host before applying.
+    image: vllm/vllm-openai-cpu:latest-<arch>
     model: Qwen/Qwen2.5-0.5B-Instruct