Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions components/operator/internal/handlers/sessions.go
Original file line number Diff line number Diff line change
Expand Up @@ -1044,6 +1044,31 @@ func handleAgenticSessionEvent(obj *unstructured.Unstructured) error {
Protocol: corev1.ProtocolTCP,
}},

ReadinessProbe: &corev1.Probe{
ProbeHandler: corev1.ProbeHandler{
HTTPGet: &corev1.HTTPGetAction{
Path: "/health",
Port: intstr.FromInt32(runnerPort),
},
},
InitialDelaySeconds: 3,
PeriodSeconds: 5,
TimeoutSeconds: 2,
FailureThreshold: 3,
},
LivenessProbe: &corev1.Probe{
ProbeHandler: corev1.ProbeHandler{
HTTPGet: &corev1.HTTPGetAction{
Path: "/health",
Port: intstr.FromInt32(runnerPort),
},
},
InitialDelaySeconds: 20,
PeriodSeconds: 30,
TimeoutSeconds: 5,
FailureThreshold: 3,
},
Comment on lines +1047 to +1070
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major | ⚡ Quick win

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
# Verify probe config and restart policy coexist in the runner pod spec.
rg -n -C2 'RestartPolicy:\s+corev1\.RestartPolicy|ReadinessProbe:|LivenessProbe:' components/operator/internal/handlers/sessions.go

Repository: ambient-code/platform

Length of output: 595


Liveness probe will not restart containers due to RestartPolicyNever

When the liveness probe fails, the kubelet kills the unresponsive runner container, but the pod spec's RestartPolicyNever (line 919) means the killed container is never restarted, so the pod simply fails. This defeats the self-healing goal of adding the probe.

Suggested fix
-    RestartPolicy:                 corev1.RestartPolicyNever,
+    RestartPolicy:                 corev1.RestartPolicyOnFailure,
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@components/operator/internal/handlers/sessions.go` around lines 1047 - 1070,
the pod spec currently sets RestartPolicyNever, which prevents the container from
being restarted when the liveness probe kills an unresponsive runner. Change the
pod's RestartPolicy from RestartPolicyNever to corev1.RestartPolicyOnFailure
where the PodSpec is constructed, consistent with the suggested fix, so that
restarts happen only on non-zero exits. Use corev1.RestartPolicyAlways instead
only if the runner must also be restarted after a clean (zero) exit.


VolumeMounts: runnerVolumeMounts,

// Lifecycle hook to copy Google credentials from read-only secret mount to writable workspace
Expand Down
25 changes: 20 additions & 5 deletions components/runners/ambient-runner/ambient_runner/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,14 +258,20 @@ def add_ambient_endpoints(
# This prevents cross-session attacks where an attacker uses another session's runner URL.
_agui_token = os.getenv("AGUI_TOKEN", "").strip()
if _agui_token:

@app.middleware("http")
async def _require_session_token(request: Request, call_next):
if request.url.path not in ("/health", "/healthz"):
provided = request.headers.get("X-Ambient-Session-Token", "")
# Use constant-time comparison to prevent timing attacks
if not provided or not _secrets_mod.compare_digest(provided, _agui_token):
return JSONResponse(status_code=401, content={"detail": "Unauthorized"})
if not provided or not _secrets_mod.compare_digest(
provided, _agui_token
):
return JSONResponse(
status_code=401, content={"detail": "Unauthorized"}
)
return await call_next(request)

logger.info("AG-UI token authentication enabled")

# Core endpoints (always registered)
Expand Down Expand Up @@ -516,6 +522,7 @@ async def _push_initial_prompt_via_http(prompt: str, session_id: str) -> None:
}

backoff = _AUTO_PROMPT_INITIAL_DELAY
last_exc: Exception | None = None
for attempt in range(1, _AUTO_PROMPT_MAX_RETRIES + 1):
# Re-read token each attempt — volume mount may not be ready at first try
bot_token = get_bot_token()
Expand Down Expand Up @@ -552,17 +559,25 @@ async def _push_initial_prompt_via_http(prompt: str, session_id: str) -> None:
)
return
except Exception as e:
last_exc = e
logger.warning(
f"INITIAL_PROMPT attempt {attempt}/{_AUTO_PROMPT_MAX_RETRIES} "
f"error: {e}, retrying in {backoff:.0f}s"
"INITIAL_PROMPT attempt %d/%d error: %s(%s), retrying in %.0fs",
attempt,
_AUTO_PROMPT_MAX_RETRIES,
type(e).__name__,
e,
backoff,
)

await asyncio.sleep(backoff)
backoff = min(backoff * 2, _AUTO_PROMPT_MAX_DELAY)
payload["runId"] = str(uuid.uuid4())

logger.error(
f"INITIAL_PROMPT auto-execution failed after {_AUTO_PROMPT_MAX_RETRIES} attempts"
"INITIAL_PROMPT auto-execution failed after %d attempts (last error: %s(%s))",
_AUTO_PROMPT_MAX_RETRIES,
type(last_exc).__name__ if last_exc is not None else "unknown",
last_exc if last_exc is not None else "",
)


Expand Down
Loading