From c993dfc177cc9937409ee26f97a775dffd0d7a6e Mon Sep 17 00:00:00 2001 From: Edward Sun Date: Wed, 27 May 2026 14:36:18 -0700 Subject: [PATCH 1/3] Add a CPU profile and a kind-based reconciler CPU canary MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Lets the CacheBackend reconciler stand up a healthy, serving backend without a GPU so the substrate can be validated off-GPU. - backendConfig profile=cpu renders a GPU-free vLLM engine: no nvidia.com/gpu limit and no LMCache connector, but prefix caching and the KV-event publisher stay on, plus CPU flags (--dtype/--max-model-len/--enforce-eager) and VLLM_CPU_KVCACHE_SPACE, defaulting to the vLLM CPU image + a tiny ungated model. The default (gpu) profile is unchanged and still owns real LMCache offload, which requires a GPU. Selection is case-insensitive; unknown/empty falls back to gpu. Covered by unit tests; the GPU rendering is byte-identical. - docs/reference-stack/scripts/canary_c2_reconcile.sh + an on-demand workflow: bring up kind, run the controller, apply a profile=cpu CacheBackend, and assert the reconciler reports Ready with a published endpoint, an engine prefix-cache hit through the Service, and owner-ref GC on delete — the real-pod path envtest can't cover. - A profile=cpu sample and the backendConfig key reference in the design doc. 
--- .github/workflows/c2-reconciler-canary.yml | 57 ++++++ config/samples/cachebackend-lmcache-cpu.yaml | 26 +++ docs/design/cachebackend-api.md | 11 ++ docs/reference-stack/README.md | 21 +++ .../scripts/canary_c2_reconcile.sh | 148 +++++++++++++++ pkg/adapters/backend/lmcache.go | 174 ++++++++++++------ pkg/adapters/backend/lmcache_test.go | 111 +++++++++++ 7 files changed, 491 insertions(+), 57 deletions(-) create mode 100644 .github/workflows/c2-reconciler-canary.yml create mode 100644 config/samples/cachebackend-lmcache-cpu.yaml create mode 100755 docs/reference-stack/scripts/canary_c2_reconcile.sh diff --git a/.github/workflows/c2-reconciler-canary.yml b/.github/workflows/c2-reconciler-canary.yml new file mode 100644 index 0000000..6084bd4 --- /dev/null +++ b/.github/workflows/c2-reconciler-canary.yml @@ -0,0 +1,57 @@ +# On-demand / scheduled CPU canary for the C2 CacheBackend reconciler. +# +# Runs docs/reference-stack/scripts/canary_c2_reconcile.sh: brings up a kind +# cluster, runs the controller, applies a CPU-profile CacheBackend, and asserts +# the reconciler stands up a healthy serving backend (status.health=Ready, +# endpoint published), an engine prefix-cache hit, and owner-ref GC on delete. +# GPU-free. +# +# This is NOT a per-PR gate (it pulls a multi-GB image, needs Docker + kind, and +# ~10 GiB RAM); it runs on a schedule and on manual dispatch. 
+name: c2-reconciler-canary + +on: + workflow_dispatch: + inputs: + runner: + description: "Runner label (override to target a self-hosted Docker host)" + default: ubuntu-latest + required: false + schedule: + - cron: "30 7 * * *" # nightly 07:30 UTC + +permissions: + contents: read + +concurrency: + group: c2-reconciler-canary + cancel-in-progress: false + +jobs: + canary: + runs-on: ${{ github.event.inputs.runner || 'ubuntu-latest' }} + timeout-minutes: 40 + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-go@v5 + with: + go-version-file: go.mod + + - name: Install kind + uses: helm/kind-action@v1 + with: + install_only: true + + - name: Run C2 reconciler CPU canary + run: docs/reference-stack/scripts/canary_c2_reconcile.sh + + - name: Upload canary logs on failure + if: failure() + uses: actions/upload-artifact@v4 + with: + name: c2-canary-logs + path: | + /tmp/c2-canary-controller.log + /tmp/c2-canary-pf.log + if-no-files-found: ignore diff --git a/config/samples/cachebackend-lmcache-cpu.yaml b/config/samples/cachebackend-lmcache-cpu.yaml new file mode 100644 index 0000000..3e76af1 --- /dev/null +++ b/config/samples/cachebackend-lmcache-cpu.yaml @@ -0,0 +1,26 @@ +# GPU-free LMCache backend for substrate validation (backendConfig profile=cpu). +# +# Renders a vLLM CPU engine with prefix caching + the KV-cache event publisher, +# but WITHOUT the LMCache connector and WITHOUT a GPU resource request — real +# LMCache offload needs a GPU (use the default gpu profile for that). This lets +# the reconciler stand up a healthy, serving backend on a CPU-only cluster (e.g. +# kind) to validate the engine-config + KV-event path end to end. +# +# The CPU image is arch-tagged upstream; use latest-x86_64 on x86 hosts. 
+apiVersion: inferencecache.io/v1alpha1 +kind: CacheBackend +metadata: + labels: + app.kubernetes.io/name: inference-cache + name: cachebackend-lmcache-cpu +spec: + type: LMCache + deploymentKind: Deployment + replicas: 1 + integration: + engine: vLLM + role: ReadWrite + backendConfig: + profile: cpu + image: vllm/vllm-openai-cpu:latest-arm64 + model: Qwen/Qwen2.5-0.5B-Instruct diff --git a/docs/design/cachebackend-api.md b/docs/design/cachebackend-api.md index 5d14246..6321396 100644 --- a/docs/design/cachebackend-api.md +++ b/docs/design/cachebackend-api.md @@ -52,6 +52,17 @@ The `v1alpha1` contract must remain backward-compatible where possible. New fiel It intentionally does not expose `containers`; requiring users to provide containers would conflict with managed backend defaults and would make simple scheduling overrides unnecessarily large. +### backendConfig keys (managed LMCache) + +`spec.backendConfig` is a free-form string map; the managed LMCache builder recognizes a few keys as overrides until they are promoted to first-class spec fields: + +| Key | Default | Purpose | +|---|---|---| +| `image` | profile-dependent | Container image for the backend engine. | +| `model` | profile-dependent | Model the engine serves (`vllm serve `). | +| `hfTokenSecret` | `hf-token` | Name of the Secret (key `token`) injected as `HF_TOKEN` for gated model pulls. The reference is optional, so ungated models run without it. | +| `profile` | `gpu` | Rendering profile. `gpu` (default): the full vLLM + LMCache connector with prefix caching, KV events, and an `nvidia.com/gpu` limit. `cpu`: a GPU-free vLLM engine (no GPU limit, no LMCache connector) that keeps prefix caching + the KV-event publisher, for validating the substrate off-GPU. Real LMCache offload requires a GPU, so it stays on the `gpu` profile. 
| + ## Status | Field | Type | Purpose | diff --git a/docs/reference-stack/README.md b/docs/reference-stack/README.md index 31045d0..cf7f461 100644 --- a/docs/reference-stack/README.md +++ b/docs/reference-stack/README.md @@ -142,6 +142,27 @@ run it after changing the subscriber. --- +## CacheBackend reconciler canary (CPU) + +[`scripts/canary_c2_reconcile.sh`](scripts/canary_c2_reconcile.sh) is a GPU-free, +on-demand canary for the **C2 reconciler**: it brings up a kind cluster, runs the +controller, applies a `CacheBackend` with `backendConfig.profile: cpu`, and asserts +the controller stands up a healthy serving backend (`status.health=Ready`, endpoint +published), an engine prefix-cache hit through the Service, and owner-ref garbage +collection when the CR is deleted. It exercises the reconciler against real pods — +the gap the envtest unit tests can't cover. + +```bash +docs/reference-stack/scripts/canary_c2_reconcile.sh +``` + +Like the full-chain canary it is **on-demand**, not a blocking gate: it needs +Docker + kind, pulls the vLLM CPU image, and wants ~10+ GiB of Docker VM RAM. The +`cpu` profile runs a GPU-free vLLM engine (prefix caching + KV events, no LMCache +offload); real LMCache offload still needs a GPU (the default `gpu` profile). + +--- + ## Teardown ```bash diff --git a/docs/reference-stack/scripts/canary_c2_reconcile.sh b/docs/reference-stack/scripts/canary_c2_reconcile.sh new file mode 100755 index 0000000..c23eba0 --- /dev/null +++ b/docs/reference-stack/scripts/canary_c2_reconcile.sh @@ -0,0 +1,148 @@ +#!/usr/bin/env bash +# CPU canary for the C2 CacheBackend reconciler. Proves the controller stands up a +# healthy, serving backend from a CR on a GPU-free cluster (kind): +# +# kubectl apply CacheBackend(profile=cpu) --> controller --> Deployment + Service +# --> CPU vLLM pods become Ready --> status.health=Ready, status.endpoint set +# +# Optionally drives prefix traffic through the Service and checks an engine +# prefix-cache hit. 
Deleting the CR garbage-collects the children via owner refs. +# +# This exercises the reconciler end to end against real pods — the gap envtest +# can't cover. It uses the CPU profile (no GPU, no LMCache offload); real LMCache +# offload needs a GPU (default profile). +# +# On-demand canary (NOT a per-PR gate): needs Docker + kind + kubectl, pulls the +# multi-GB vLLM CPU image, and needs a Docker VM with ~10+ GiB RAM (CPU runtime +# baseline ~5 GiB + KV cache). See docs/reference-stack/VERSIONS.md. +# +# Usage: docs/reference-stack/scripts/canary_c2_reconcile.sh +# Tunables via env: IMAGE, IMAGE_TAG, MODEL, KIND, KIND_CLUSTER, NAMESPACE, CR_NAME, READY_TIMEOUT, SKIP_TRAFFIC. +set -euo pipefail + +arch="$(uname -m)" +case "$arch" in + arm64 | aarch64) IMAGE_TAG="${IMAGE_TAG:-latest-arm64}" ;; + *) IMAGE_TAG="${IMAGE_TAG:-latest-x86_64}" ;; +esac +IMAGE="${IMAGE:-vllm/vllm-openai-cpu:$IMAGE_TAG}" +MODEL="${MODEL:-Qwen/Qwen2.5-0.5B-Instruct}" +KIND_CLUSTER="${KIND_CLUSTER:-ic-c2-canary}" +NAMESPACE="${NAMESPACE:-c2-canary}" +CR_NAME="${CR_NAME:-canary}" +READY_TIMEOUT="${READY_TIMEOUT:-900}" # seconds for the CPU model to load + become Ready +SKIP_TRAFFIC="${SKIP_TRAFFIC:-0}" + +REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../.." 
&& pwd)" +cd "$REPO_ROOT" + +KIND="${KIND:-$([ -x ./bin/kind ] && echo ./bin/kind || echo kind)}" +controller_pid="" +pf_pid="" +log() { echo "[c2-canary] $*"; } +fail() { + echo "[c2-canary] FAIL: $*" >&2 + exit 1 +} + +cleanup() { + [ -n "$pf_pid" ] && kill "$pf_pid" 2>/dev/null || true + [ -n "$controller_pid" ] && kill "$controller_pid" 2>/dev/null || true + "$KIND" delete cluster --name "$KIND_CLUSTER" >/dev/null 2>&1 || true +} +trap cleanup EXIT + +# --- cluster ---------------------------------------------------------------- +log "creating kind cluster $KIND_CLUSTER" +"$KIND" create cluster --name "$KIND_CLUSTER" --wait 120s +KUBECONFIG_ARGS=(--context "kind-$KIND_CLUSTER") + +log "pulling CPU image and loading it into the node ($IMAGE)" +docker pull "$IMAGE" +"$KIND" load docker-image "$IMAGE" --name "$KIND_CLUSTER" + +# --- controller ------------------------------------------------------------- +log "installing CRD" +kubectl "${KUBECONFIG_ARGS[@]}" apply -f config/crd/bases/inferencecache.io_cachebackends.yaml + +log "building + starting the controller" +go build -o bin/controller ./cmd/controller +./bin/controller --leader-elect=false >/tmp/c2-canary-controller.log 2>&1 & +controller_pid=$! 
+ +kubectl "${KUBECONFIG_ARGS[@]}" create namespace "$NAMESPACE" + +# --- apply the CacheBackend (CPU profile) ----------------------------------- +log "applying CacheBackend $NAMESPACE/$CR_NAME (profile=cpu, image=$IMAGE)" +kubectl "${KUBECONFIG_ARGS[@]}" apply -f - </dev/null || true)" + if [ "$(date +%s)" -ge "$deadline" ]; then + kubectl "${KUBECONFIG_ARGS[@]}" -n "$NAMESPACE" get pods -o wide || true + kubectl "${KUBECONFIG_ARGS[@]}" -n "$NAMESPACE" describe deployment "$CR_NAME" || true + fail "backend did not become Ready within ${READY_TIMEOUT}s (last health='$health')" + fi + sleep 5 +done +log "status.health=Ready" + +endpoint="$(kubectl "${KUBECONFIG_ARGS[@]}" -n "$NAMESPACE" get cachebackend "$CR_NAME" -o jsonpath='{.status.endpoint}')" +[ -n "$endpoint" ] || fail "status.endpoint was not published" +log "status.endpoint=$endpoint" + +avail="$(kubectl "${KUBECONFIG_ARGS[@]}" -n "$NAMESPACE" get deployment "$CR_NAME" -o jsonpath='{.status.availableReplicas}')" +[ "${avail:-0}" -ge 1 ] || fail "deployment has no available replicas" + +# --- optional: drive prefix traffic + check a cache hit --------------------- +if [ "$SKIP_TRAFFIC" != "1" ]; then + log "port-forwarding the Service to drive prefix traffic" + kubectl "${KUBECONFIG_ARGS[@]}" -n "$NAMESPACE" port-forward "svc/$CR_NAME" 18000:8000 >/tmp/c2-canary-pf.log 2>&1 & + pf_pid=$! + for _ in $(seq 1 30); do + curl -sf -o /dev/null "http://localhost:18000/health" && break + sleep 1 + done + hits() { curl -s "http://localhost:18000/metrics" | awk '/^vllm:prefix_cache_hits_total/{s+=$2} END{print s+0}'; } + PREFIX="$(python3 -c 'print(("You are a meticulous canary assistant. Follow the rules precisely. 
" * 200).strip())')" + fire() { + curl -s -o /dev/null -w '%{http_code}' "http://localhost:18000/v1/chat/completions" \ + -H 'Content-Type: application/json' \ + -d "$(python3 -c 'import json,sys;print(json.dumps({"model":sys.argv[3],"max_tokens":8,"temperature":0,"messages":[{"role":"system","content":sys.argv[1]},{"role":"user","content":sys.argv[2]}]}))' "$PREFIX" "$1" "$MODEL")" + } + h0=$(hits) + log "request 1 (cold prefix): HTTP $(fire 'summarize in one word')" + log "request 2 (same prefix): HTTP $(fire 'summarize in two words')" + h1=$(hits) + log "prefix_cache_hits: $h0 -> $h1" + [ "$h1" -gt "$h0" ] || fail "no engine prefix-cache hit (hits did not increase)" +fi + +# --- delete the CR -> owner-ref GC ------------------------------------------ +log "deleting the CR; expecting owner-ref GC of the Deployment + Service" +kubectl "${KUBECONFIG_ARGS[@]}" -n "$NAMESPACE" delete cachebackend "$CR_NAME" --wait=true +gc_deadline=$(($(date +%s) + 60)) +until [ "$(kubectl "${KUBECONFIG_ARGS[@]}" -n "$NAMESPACE" get deploy,svc -o name 2>/dev/null | wc -l | tr -d ' ')" = "0" ]; do + [ "$(date +%s)" -lt "$gc_deadline" ] || fail "children were not garbage-collected after CR deletion" + sleep 2 +done + +log "PASS — reconciler stood up a healthy CPU backend, published its endpoint, and cleaned up on delete" diff --git a/pkg/adapters/backend/lmcache.go b/pkg/adapters/backend/lmcache.go index 8f98d34..9257f59 100644 --- a/pkg/adapters/backend/lmcache.go +++ b/pkg/adapters/backend/lmcache.go @@ -2,6 +2,7 @@ package backend import ( "fmt" + "strings" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" @@ -28,6 +29,16 @@ const ( defaultLMCacheMaxLocalCPUSize = "20" defaultHFTokenSecretName = "hf-token" + // CPU profile (backendConfig profile=cpu) defaults: a GPU-free vLLM engine for + // substrate validation off-GPU. 
It keeps prefix caching + the KV-event publisher + // but drops the LMCache connector — real LMCache offload requires a GPU, so the + // default (gpu) profile owns that. Mirrors docs/reference-stack/manifests/cpu-local. + // The CPU image is arch-tagged upstream (latest-arm64/latest-x86_64); the bare + // :latest default is meant to be overridden via backendConfig.image per host arch. + defaultCPUImage = "vllm/vllm-openai-cpu:latest" + defaultCPUModel = "Qwen/Qwen2.5-0.5B-Instruct" + defaultCPUKVCacheSpace = "4" + // API-server pod defaults for the two override fields that are server-defaulted. // Baking them into the rendered template keeps the update path churn-free (the // reconciled value matches the live, defaulted object). @@ -38,6 +49,11 @@ const ( cfgKeyImage = "image" cfgKeyModel = "model" cfgKeyHFTokenSecret = "hfTokenSecret" + cfgKeyProfile = "profile" + + // profile values for the profile backendConfig key. + profileGPU = "gpu" + profileCPU = "cpu" portHTTP = 8000 portKVEvents = 5557 @@ -66,8 +82,6 @@ func (lmCacheBuilder) Build(cb *cachev1alpha1.CacheBackend) (*Workload, error) { namespace := cb.Namespace cfg := cb.Spec.BackendConfig - image := configOr(cfg, cfgKeyImage, defaultLMCacheImage) - model := configOr(cfg, cfgKeyModel, defaultLMCacheModel) hfSecret := configOr(cfg, cfgKeyHFTokenSecret, defaultHFTokenSecretName) replicas := int32(1) @@ -78,60 +92,18 @@ func (lmCacheBuilder) Build(cb *cachev1alpha1.CacheBackend) (*Workload, error) { selector := selectorLabels(name) podLabels := podTemplateLabels(name) - container := corev1.Container{ - Name: "vllm", - Image: image, - ImagePullPolicy: corev1.PullIfNotPresent, - Command: []string{"vllm", "serve", model}, - Args: []string{ - fmt.Sprintf("--port=%d", portHTTP), - "--enable-prefix-caching", - "--kv-transfer-config", kvTransferConfig, - "--kv-events-config", kvEventsConfig, - }, - Env: []corev1.EnvVar{ - {Name: "VLLM_USE_V1", Value: "1"}, - {Name: "LMCACHE_CHUNK_SIZE", Value: 
defaultLMCacheChunkSize}, - {Name: "LMCACHE_LOCAL_CPU", Value: defaultLMCacheLocalCPU}, - {Name: "LMCACHE_MAX_LOCAL_CPU_SIZE", Value: defaultLMCacheMaxLocalCPUSize}, - { - Name: "HF_TOKEN", - ValueFrom: &corev1.EnvVarSource{ - SecretKeyRef: &corev1.SecretKeySelector{ - LocalObjectReference: corev1.LocalObjectReference{Name: hfSecret}, - Key: "token", - // Optional so ungated models run without the secret present; - // the A2 reference requires it for the gated default model. - Optional: ptrTo(true), - }, - }, - }, - }, - Ports: []corev1.ContainerPort{ - {Name: "http", ContainerPort: portHTTP, Protocol: corev1.ProtocolTCP}, - {Name: "kv-events", ContainerPort: portKVEvents, Protocol: corev1.ProtocolTCP}, - {Name: "kv-replay", ContainerPort: portKVReplay, Protocol: corev1.ProtocolTCP}, - }, - Resources: corev1.ResourceRequirements{ - Limits: corev1.ResourceList{ - "nvidia.com/gpu": resource.MustParse("1"), - }, - }, - ReadinessProbe: &corev1.Probe{ - ProbeHandler: corev1.ProbeHandler{ - HTTPGet: &corev1.HTTPGetAction{ - Path: "/health", - Port: intstr.FromString("http"), - }, - }, - InitialDelaySeconds: 60, - PeriodSeconds: 10, - FailureThreshold: 60, - }, - VolumeMounts: []corev1.VolumeMount{ - {Name: "cache-home", MountPath: "/root/.cache/huggingface"}, - {Name: "shm", MountPath: "/dev/shm"}, - }, + var container corev1.Container + var shmSize resource.Quantity + if strings.EqualFold(configOr(cfg, cfgKeyProfile, profileGPU), profileCPU) { + image := configOr(cfg, cfgKeyImage, defaultCPUImage) + model := configOr(cfg, cfgKeyModel, defaultCPUModel) + container = cpuEngineContainer(image, model, hfSecret) + shmSize = resource.MustParse("4Gi") + } else { + image := configOr(cfg, cfgKeyImage, defaultLMCacheImage) + model := configOr(cfg, cfgKeyModel, defaultLMCacheModel) + container = lmCacheEngineContainer(image, model, hfSecret) + shmSize = resource.MustParse("8Gi") } podSpec := corev1.PodSpec{ @@ -146,7 +118,7 @@ func (lmCacheBuilder) Build(cb 
*cachev1alpha1.CacheBackend) (*Workload, error) { VolumeSource: corev1.VolumeSource{ EmptyDir: &corev1.EmptyDirVolumeSource{ Medium: corev1.StorageMediumMemory, - SizeLimit: ptrQuantity(resource.MustParse("8Gi")), + SizeLimit: ptrQuantity(shmSize), }, }, }, @@ -194,6 +166,94 @@ func (lmCacheBuilder) Build(cb *cachev1alpha1.CacheBackend) (*Workload, error) { }, nil } +// lmCacheEngineContainer renders the GPU vLLM+LMCache container (default profile): +// vLLM reads/writes KV through the LMCache connector, with prefix caching and the +// KV-event publisher enabled. +func lmCacheEngineContainer(image, model, hfSecret string) corev1.Container { + c := baseEngineContainer(image, model, hfSecret) + c.Args = []string{ + fmt.Sprintf("--port=%d", portHTTP), + "--enable-prefix-caching", + "--kv-transfer-config", kvTransferConfig, + "--kv-events-config", kvEventsConfig, + } + c.Env = append([]corev1.EnvVar{ + {Name: "VLLM_USE_V1", Value: "1"}, + {Name: "LMCACHE_CHUNK_SIZE", Value: defaultLMCacheChunkSize}, + {Name: "LMCACHE_LOCAL_CPU", Value: defaultLMCacheLocalCPU}, + {Name: "LMCACHE_MAX_LOCAL_CPU_SIZE", Value: defaultLMCacheMaxLocalCPUSize}, + }, hfTokenEnv(hfSecret)) + c.Resources = corev1.ResourceRequirements{ + Limits: corev1.ResourceList{"nvidia.com/gpu": resource.MustParse("1")}, + } + return c +} + +// cpuEngineContainer renders a GPU-free vLLM container (profile=cpu): no GPU limit +// and no LMCache connector, but prefix caching and the KV-event publisher stay on so +// the substrate (engine config + KV-event stream) can be validated off-GPU. 
+func cpuEngineContainer(image, model, hfSecret string) corev1.Container { + c := baseEngineContainer(image, model, hfSecret) + c.Args = []string{ + fmt.Sprintf("--port=%d", portHTTP), + "--dtype=bfloat16", + "--max-model-len=8192", + "--enforce-eager", + "--enable-prefix-caching", + "--kv-events-config", kvEventsConfig, + } + c.Env = append([]corev1.EnvVar{ + {Name: "VLLM_CPU_KVCACHE_SPACE", Value: defaultCPUKVCacheSpace}, + }, hfTokenEnv(hfSecret)) + return c +} + +// baseEngineContainer holds the parts shared by every profile (name, image, +// command, ports, readiness probe, mounts); args/env/resources are profile-specific. +func baseEngineContainer(image, model, hfSecret string) corev1.Container { + return corev1.Container{ + Name: "vllm", + Image: image, + ImagePullPolicy: corev1.PullIfNotPresent, + Command: []string{"vllm", "serve", model}, + Ports: []corev1.ContainerPort{ + {Name: "http", ContainerPort: portHTTP, Protocol: corev1.ProtocolTCP}, + {Name: "kv-events", ContainerPort: portKVEvents, Protocol: corev1.ProtocolTCP}, + {Name: "kv-replay", ContainerPort: portKVReplay, Protocol: corev1.ProtocolTCP}, + }, + ReadinessProbe: &corev1.Probe{ + ProbeHandler: corev1.ProbeHandler{ + HTTPGet: &corev1.HTTPGetAction{ + Path: "/health", + Port: intstr.FromString("http"), + }, + }, + InitialDelaySeconds: 60, + PeriodSeconds: 10, + FailureThreshold: 60, + }, + VolumeMounts: []corev1.VolumeMount{ + {Name: "cache-home", MountPath: "/root/.cache/huggingface"}, + {Name: "shm", MountPath: "/dev/shm"}, + }, + } +} + +// hfTokenEnv injects the optional HF_TOKEN secret ref so gated models can pull; it +// is optional so ungated models (e.g. the CPU profile default) run without it. 
+func hfTokenEnv(secret string) corev1.EnvVar { + return corev1.EnvVar{ + Name: "HF_TOKEN", + ValueFrom: &corev1.EnvVarSource{ + SecretKeyRef: &corev1.SecretKeySelector{ + LocalObjectReference: corev1.LocalObjectReference{Name: secret}, + Key: "token", + Optional: ptrTo(true), + }, + }, + } +} + // selectorLabels are the immutable identity labels for a backend's child objects. func selectorLabels(name string) map[string]string { return map[string]string{ diff --git a/pkg/adapters/backend/lmcache_test.go b/pkg/adapters/backend/lmcache_test.go index 1ff289d..ab5af24 100644 --- a/pkg/adapters/backend/lmcache_test.go +++ b/pkg/adapters/backend/lmcache_test.go @@ -126,6 +126,117 @@ func TestLMCacheBuildNil(t *testing.T) { } } +func TestLMCacheBuildCPUProfile(t *testing.T) { + cb := &cachev1alpha1.CacheBackend{ + ObjectMeta: metav1.ObjectMeta{Name: "cache", Namespace: "ns1"}, + Spec: cachev1alpha1.CacheBackendSpec{ + Type: cachev1alpha1.CacheBackendTypeLMCache, + BackendConfig: map[string]string{cfgKeyProfile: "cpu"}, + }, + } + b, _ := For(cachev1alpha1.CacheBackendTypeLMCache) + w, err := b.Build(cb) + if err != nil { + t.Fatalf("build: %v", err) + } + + c := w.Deployment.Spec.Template.Spec.Containers[0] + if c.Image != defaultCPUImage { + t.Fatalf("image = %q, want CPU default %q", c.Image, defaultCPUImage) + } + if c.Command[len(c.Command)-1] != defaultCPUModel { + t.Fatalf("model = %v, want CPU default %q", c.Command, defaultCPUModel) + } + // No GPU limit on the CPU profile. + if _, ok := c.Resources.Limits["nvidia.com/gpu"]; ok { + t.Fatalf("CPU profile must not request a GPU: %v", c.Resources.Limits) + } + // LMCache connector is dropped; prefix caching + KV events stay on. 
+ if argsContain(c.Args, "--kv-transfer-config") { + t.Fatalf("CPU profile must not set the LMCache connector: %v", c.Args) + } + if !argsContain(c.Args, "--enable-prefix-caching") || !argsContain(c.Args, "--kv-events-config") || !argsContain(c.Args, "--enforce-eager") { + t.Fatalf("CPU profile args missing expected flags: %v", c.Args) + } + // CPU env, not the LMCache/GPU env. + if findEnv(c.Env, "VLLM_CPU_KVCACHE_SPACE") == nil { + t.Fatalf("CPU profile missing VLLM_CPU_KVCACHE_SPACE: %v", c.Env) + } + if findEnv(c.Env, "VLLM_USE_V1") != nil || findEnv(c.Env, "LMCACHE_CHUNK_SIZE") != nil { + t.Fatalf("CPU profile must not carry LMCache/GPU env: %v", c.Env) + } + // HF_TOKEN still optional (for overridden gated models). + if hf := findEnv(c.Env, "HF_TOKEN"); hf == nil || hf.ValueFrom == nil || hf.ValueFrom.SecretKeyRef.Optional == nil || !*hf.ValueFrom.SecretKeyRef.Optional { + t.Fatalf("HF_TOKEN should remain an optional secret ref: %+v", hf) + } + // Same wiring as GPU: 3 ports + readiness probe. 
+ if len(c.Ports) != 3 || c.ReadinessProbe == nil { + t.Fatalf("CPU profile lost ports/probe: ports=%d probe=%v", len(c.Ports), c.ReadinessProbe) + } +} + +func TestLMCacheBuildCPUProfileOverrides(t *testing.T) { + cb := &cachev1alpha1.CacheBackend{ + ObjectMeta: metav1.ObjectMeta{Name: "cache", Namespace: "ns1"}, + Spec: cachev1alpha1.CacheBackendSpec{ + Type: cachev1alpha1.CacheBackendTypeLMCache, + BackendConfig: map[string]string{ + cfgKeyProfile: "CPU", // case-insensitive + cfgKeyImage: "vllm/vllm-openai-cpu:latest-arm64", + cfgKeyModel: "org/tiny", + }, + }, + } + b, _ := For(cachev1alpha1.CacheBackendTypeLMCache) + w, err := b.Build(cb) + if err != nil { + t.Fatalf("build: %v", err) + } + c := w.Deployment.Spec.Template.Spec.Containers[0] + if c.Image != "vllm/vllm-openai-cpu:latest-arm64" { + t.Fatalf("image = %q, want CPU override", c.Image) + } + if c.Command[len(c.Command)-1] != "org/tiny" { + t.Fatalf("model = %v, want override", c.Command) + } + if _, ok := c.Resources.Limits["nvidia.com/gpu"]; ok { + t.Fatalf("CPU profile must not request a GPU") + } +} + +func TestLMCacheBuildDefaultProfileIsGPU(t *testing.T) { + for _, profile := range []string{"", "gpu", "unknown"} { + cb := &cachev1alpha1.CacheBackend{ + ObjectMeta: metav1.ObjectMeta{Name: "cache", Namespace: "ns1"}, + Spec: cachev1alpha1.CacheBackendSpec{ + Type: cachev1alpha1.CacheBackendTypeLMCache, + BackendConfig: map[string]string{cfgKeyProfile: profile}, + }, + } + b, _ := For(cachev1alpha1.CacheBackendTypeLMCache) + w, err := b.Build(cb) + if err != nil { + t.Fatalf("build profile=%q: %v", profile, err) + } + c := w.Deployment.Spec.Template.Spec.Containers[0] + if gpu := c.Resources.Limits["nvidia.com/gpu"]; gpu.Value() != 1 { + t.Fatalf("profile=%q should default to GPU (gpu limit=%v)", profile, gpu.Value()) + } + if !argsContain(c.Args, "--kv-transfer-config") { + t.Fatalf("profile=%q should keep the LMCache connector", profile) + } + } +} + +func argsContain(args []string, want string) 
bool { + for _, a := range args { + if a == want { + return true + } + } + return false +} + func findEnv(env []corev1.EnvVar, name string) *corev1.EnvVar { for i := range env { if env[i].Name == name { From 0b911da788a55e0611b24adf589eecaf5d3549a1 Mon Sep 17 00:00:00 2001 From: Edward Sun Date: Wed, 27 May 2026 14:43:35 -0700 Subject: [PATCH 2/3] Reconcile profile-owned fields on update; require image for cpu profile Address review feedback on the CPU profile: - The update path now reconciles the container Resources (the GPU limit differs by profile) and the pod Volumes (the shm size differs), so switching an existing backend gpu<->cpu actually reaches the live Deployment instead of only taking effect on create. Both fields are builder-owned and not API-server-defaulted, so reconciling them stays churn-free. Adds a controller test for the profile switch. - profile=cpu now requires backendConfig.image: the upstream CPU image is arch- tagged (latest-arm64 / latest-x86_64) with no safe multi-arch default, so the builder errors rather than rendering a bogus :latest. Tests + design doc updated. --- docs/design/cachebackend-api.md | 2 +- .../controller/cachebackend_controller.go | 7 ++++ .../cachebackend_controller_test.go | 38 +++++++++++++++++++ pkg/adapters/backend/lmcache.go | 20 ++++++---- pkg/adapters/backend/lmcache_test.go | 25 ++++++++++-- 5 files changed, 79 insertions(+), 13 deletions(-) diff --git a/docs/design/cachebackend-api.md b/docs/design/cachebackend-api.md index 6321396..854320f 100644 --- a/docs/design/cachebackend-api.md +++ b/docs/design/cachebackend-api.md @@ -58,7 +58,7 @@ It intentionally does not expose `containers`; requiring users to provide contai | Key | Default | Purpose | |---|---|---| -| `image` | profile-dependent | Container image for the backend engine. | +| `image` | gpu: lmcache reference image; cpu: **required** | Container image for the backend engine. 
The CPU image is arch-tagged upstream with no safe multi-arch default, so `profile=cpu` requires an explicit image. | | `model` | profile-dependent | Model the engine serves (`vllm serve `). | | `hfTokenSecret` | `hf-token` | Name of the Secret (key `token`) injected as `HF_TOKEN` for gated model pulls. The reference is optional, so ungated models run without it. | | `profile` | `gpu` | Rendering profile. `gpu` (default): the full vLLM + LMCache connector with prefix caching, KV events, and an `nvidia.com/gpu` limit. `cpu`: a GPU-free vLLM engine (no GPU limit, no LMCache connector) that keeps prefix caching + the KV-event publisher, for validating the substrate off-GPU. Real LMCache offload requires a GPU, so it stays on the `gpu` profile. | diff --git a/internal/controller/cachebackend_controller.go b/internal/controller/cachebackend_controller.go index 4a5ed0a..a827b15 100644 --- a/internal/controller/cachebackend_controller.go +++ b/internal/controller/cachebackend_controller.go @@ -189,6 +189,10 @@ func (r *CacheBackendReconciler) applyService(ctx context.Context, backend *cach func reconcileManagedPodSpec(live *corev1.PodSpec, desired *corev1.PodSpec) { reconcileManagedContainer(live, desired) + // Volumes are builder-owned (e.g. the shm size differs by profile) and are not + // API-server-defaulted in a Deployment template, so copying them is churn-free. + live.Volumes = desired.Volumes + live.NodeSelector = desired.NodeSelector live.Affinity = desired.Affinity live.Tolerations = desired.Tolerations @@ -215,6 +219,9 @@ func reconcileManagedContainer(live *corev1.PodSpec, desired *corev1.PodSpec) { live.Containers[i].Command = want.Command live.Containers[i].Args = want.Args live.Containers[i].Env = want.Env + // Resources are builder-owned (the GPU limit differs by profile) and not + // API-server-defaulted, so reconciling them is churn-free. 
+ live.Containers[i].Resources = want.Resources return } } diff --git a/internal/controller/cachebackend_controller_test.go b/internal/controller/cachebackend_controller_test.go index d6a6cfd..2c569f0 100644 --- a/internal/controller/cachebackend_controller_test.go +++ b/internal/controller/cachebackend_controller_test.go @@ -218,6 +218,44 @@ func TestReconcileLMCacheUpdatesImage(t *testing.T) { } } +func TestReconcileLMCacheProfileSwitchGPUToCPU(t *testing.T) { + scheme := newScheme(t) + r := newReconciler(scheme, lmcacheBackend("cache", "ns1")) + + reconcile(t, r, "cache", "ns1") + // GPU profile (default): GPU limit set, 8Gi shm. + c := getDeployment(t, r, "cache", "ns1").Spec.Template.Spec.Containers[0] + if _, ok := c.Resources.Limits["nvidia.com/gpu"]; !ok { + t.Fatalf("default profile should request a GPU") + } + + live := getBackend(t, r, "cache", "ns1") + live.Spec.BackendConfig = map[string]string{"profile": "cpu", "image": "vllm/vllm-openai-cpu:latest-arm64"} + if err := r.Update(context.Background(), live); err != nil { + t.Fatalf("switch to cpu profile: %v", err) + } + reconcile(t, r, "cache", "ns1") + + // CPU profile must reach the live Deployment: GPU limit gone, image swapped, shm 4Gi. 
+ dep := getDeployment(t, r, "cache", "ns1") + c = dep.Spec.Template.Spec.Containers[0] + if _, ok := c.Resources.Limits["nvidia.com/gpu"]; ok { + t.Fatalf("GPU limit should be removed after switching to cpu profile") + } + if c.Image != "vllm/vllm-openai-cpu:latest-arm64" { + t.Fatalf("image = %q, want cpu image after profile switch", c.Image) + } + var shm *corev1.Volume + for i := range dep.Spec.Template.Spec.Volumes { + if dep.Spec.Template.Spec.Volumes[i].Name == "shm" { + shm = &dep.Spec.Template.Spec.Volumes[i] + } + } + if shm == nil || shm.EmptyDir == nil || shm.EmptyDir.SizeLimit == nil || shm.EmptyDir.SizeLimit.String() != "4Gi" { + t.Fatalf("shm size = %v, want 4Gi after switching to cpu profile", shm) + } +} + func TestReconcileLMCacheScalesReplicas(t *testing.T) { scheme := newScheme(t) cb := lmcacheBackend("cache", "ns1") diff --git a/pkg/adapters/backend/lmcache.go b/pkg/adapters/backend/lmcache.go index 9257f59..7cdb633 100644 --- a/pkg/adapters/backend/lmcache.go +++ b/pkg/adapters/backend/lmcache.go @@ -29,13 +29,12 @@ const ( defaultLMCacheMaxLocalCPUSize = "20" defaultHFTokenSecretName = "hf-token" - // CPU profile (backendConfig profile=cpu) defaults: a GPU-free vLLM engine for - // substrate validation off-GPU. It keeps prefix caching + the KV-event publisher - // but drops the LMCache connector — real LMCache offload requires a GPU, so the - // default (gpu) profile owns that. Mirrors docs/reference-stack/manifests/cpu-local. - // The CPU image is arch-tagged upstream (latest-arm64/latest-x86_64); the bare - // :latest default is meant to be overridden via backendConfig.image per host arch. - defaultCPUImage = "vllm/vllm-openai-cpu:latest" + // CPU profile (backendConfig profile=cpu): a GPU-free vLLM engine for substrate + // validation off-GPU. It keeps prefix caching + the KV-event publisher but drops + // the LMCache connector — real LMCache offload requires a GPU, so the default + // (gpu) profile owns that. 
Mirrors docs/reference-stack/manifests/cpu-local. + // The upstream CPU image is arch-tagged (latest-arm64 / latest-x86_64) with no + // safe multi-arch default, so backendConfig.image is REQUIRED for this profile. defaultCPUModel = "Qwen/Qwen2.5-0.5B-Instruct" defaultCPUKVCacheSpace = "4" @@ -95,7 +94,12 @@ func (lmCacheBuilder) Build(cb *cachev1alpha1.CacheBackend) (*Workload, error) { var container corev1.Container var shmSize resource.Quantity if strings.EqualFold(configOr(cfg, cfgKeyProfile, profileGPU), profileCPU) { - image := configOr(cfg, cfgKeyImage, defaultCPUImage) + // The CPU image is arch-tagged upstream with no safe multi-arch default, + // so it must be supplied explicitly (e.g. vllm/vllm-openai-cpu:latest-arm64). + image := configOr(cfg, cfgKeyImage, "") + if image == "" { + return nil, fmt.Errorf("backendConfig.profile=cpu requires backendConfig.image (an arch-tagged CPU image, e.g. vllm/vllm-openai-cpu:latest-arm64)") + } model := configOr(cfg, cfgKeyModel, defaultCPUModel) container = cpuEngineContainer(image, model, hfSecret) shmSize = resource.MustParse("4Gi") diff --git a/pkg/adapters/backend/lmcache_test.go b/pkg/adapters/backend/lmcache_test.go index ab5af24..26e34aa 100644 --- a/pkg/adapters/backend/lmcache_test.go +++ b/pkg/adapters/backend/lmcache_test.go @@ -130,8 +130,11 @@ func TestLMCacheBuildCPUProfile(t *testing.T) { cb := &cachev1alpha1.CacheBackend{ ObjectMeta: metav1.ObjectMeta{Name: "cache", Namespace: "ns1"}, Spec: cachev1alpha1.CacheBackendSpec{ - Type: cachev1alpha1.CacheBackendTypeLMCache, - BackendConfig: map[string]string{cfgKeyProfile: "cpu"}, + Type: cachev1alpha1.CacheBackendTypeLMCache, + BackendConfig: map[string]string{ + cfgKeyProfile: "cpu", + cfgKeyImage: "vllm/vllm-openai-cpu:latest-arm64", + }, }, } b, _ := For(cachev1alpha1.CacheBackendTypeLMCache) @@ -141,8 +144,8 @@ func TestLMCacheBuildCPUProfile(t *testing.T) { } c := w.Deployment.Spec.Template.Spec.Containers[0] - if c.Image != defaultCPUImage { - 
t.Fatalf("image = %q, want CPU default %q", c.Image, defaultCPUImage) + if c.Image != "vllm/vllm-openai-cpu:latest-arm64" { + t.Fatalf("image = %q, want the supplied CPU image", c.Image) } if c.Command[len(c.Command)-1] != defaultCPUModel { t.Fatalf("model = %v, want CPU default %q", c.Command, defaultCPUModel) @@ -204,6 +207,20 @@ func TestLMCacheBuildCPUProfileOverrides(t *testing.T) { } } +func TestLMCacheBuildCPUProfileRequiresImage(t *testing.T) { + cb := &cachev1alpha1.CacheBackend{ + ObjectMeta: metav1.ObjectMeta{Name: "cache", Namespace: "ns1"}, + Spec: cachev1alpha1.CacheBackendSpec{ + Type: cachev1alpha1.CacheBackendTypeLMCache, + BackendConfig: map[string]string{cfgKeyProfile: "cpu"}, // no image + }, + } + b, _ := For(cachev1alpha1.CacheBackendTypeLMCache) + if _, err := b.Build(cb); err == nil { + t.Fatalf("expected an error: profile=cpu without an image has no safe default") + } +} + func TestLMCacheBuildDefaultProfileIsGPU(t *testing.T) { for _, profile := range []string{"", "gpu", "unknown"} { cb := &cachev1alpha1.CacheBackend{ From aa1c8d4edf0d5e6354082c81107d5114284c956a Mon Sep 17 00:00:00 2001 From: Edward Sun Date: Wed, 27 May 2026 14:48:17 -0700 Subject: [PATCH 3/3] Make the CPU sample image a non-applyable arch placeholder The CPU image is arch-tagged with no multi-arch tag, so a hardcoded latest-arm64 would fail on x86 hosts. Ship a deliberate :latest- placeholder (matching the reference stack's non-applyable-image convention) so applying as-is fails fast rather than silently running the wrong arch. --- config/samples/cachebackend-lmcache-cpu.yaml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/config/samples/cachebackend-lmcache-cpu.yaml b/config/samples/cachebackend-lmcache-cpu.yaml index 3e76af1..0807f15 100644 --- a/config/samples/cachebackend-lmcache-cpu.yaml +++ b/config/samples/cachebackend-lmcache-cpu.yaml @@ -6,7 +6,10 @@ # the reconciler stand up a healthy, serving backend on a CPU-only cluster (e.g. 
# kind) to validate the engine-config + KV-event path end to end. # -# The CPU image is arch-tagged upstream; use latest-x86_64 on x86 hosts. +# IMPORTANT: the CPU image is arch-tagged upstream with no multi-arch tag, so the +# `image` below is a deliberate non-applyable PLACEHOLDER — substitute the tag for +# your host before applying: `latest-arm64` (arm64) or `latest-x86_64` (x86_64). +# Applying as-is fails fast (bad tag) rather than silently running the wrong arch. apiVersion: inferencecache.io/v1alpha1 kind: CacheBackend metadata: @@ -22,5 +25,6 @@ spec: role: ReadWrite backendConfig: profile: cpu - image: vllm/vllm-openai-cpu:latest-arm64 + # PLACEHOLDER — append arm64 or x86_64 to the tag for your host before applying. + image: vllm/vllm-openai-cpu:latest- model: Qwen/Qwen2.5-0.5B-Instruct