Skip to content

Commit 8811dc8

Browse files
amir20claude
andauthored
fix(cloud): resolve read-only container tools in one shot (no extra LLM round-trip) (#4767)
Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
1 parent 51e13d5 commit 8811dc8

7 files changed

Lines changed: 568 additions & 38 deletions

File tree

internal/cloud/tools.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ var (
6767
Properties: map[string]paramProperty{},
6868
})
6969

70-
containerIDParam = paramProperty{Type: "string", Description: "Container name or ID. You can pass the container name directly (as shown in logs, events, and find_containers) — it does not need to be the opaque ID. Resolved by exact name first, then ID, then a unique name substring. If the name matches more than one container the call fails with the list of candidates so you can disambiguate."}
70+
containerIDParam = paramProperty{Type: "string", Description: "Container name or ID. You can pass the container name directly (as shown in logs, events, and find_containers) — it does not need to be the opaque ID. Resolved by exact name first, then ID, then a unique name substring. When a name matches several containers (e.g. a Swarm service with stopped task corpses, or multiple replicas), read-only tools (inspect/logs) resolve to the most relevant one the running replica, or the most-recently-active if all are stopped — and tell you which one (and its siblings) in the result, so you don't need to look up the ID first. Write tools (start/stop/restart/remove/update) never guess between live containers: an ambiguous name fails with the candidate list so you can re-issue with an exact ID."}
7171
hostIDParam = paramProperty{Type: "string", Description: "Host name or ID (from list_hosts or find_containers). Optional — omit it when the container name is unique across all hosts; supply it (name or ID) only to scope to a specific host when a name is ambiguous."}
7272
boolFalse = false
7373

internal/cloud/tools_containers.go

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -168,7 +168,11 @@ func executeInspectContainer(argsJSON string, deps ToolDeps) (*pb.CallToolRespon
168168
if err := json.Unmarshal([]byte(argsJSON), &args); err != nil {
169169
return nil, fmt.Errorf("failed to parse arguments: %w", err)
170170
}
171-
hostID, containerID, err := resolveContainerRef(args.ContainerID, args.Host, deps)
171+
// Read-only: resolve an ambiguous name in one shot rather than erroring and
172+
// forcing a find_containers round-trip. The note is discarded here because
173+
// inspect already returns the concrete id/name/state, which is itself the
174+
// disambiguation — no need to mangle those structured fields with prose.
175+
hostID, containerID, _, err := resolveContainerRefRead(args.ContainerID, args.Host, deps)
172176
if err != nil {
173177
return nil, err
174178
}

internal/cloud/tools_logs.go

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ func executeFetchContainerLogs(ctx context.Context, argsJSON string, deps ToolDe
2727
if err := json.Unmarshal([]byte(argsJSON), &args); err != nil {
2828
return nil, fmt.Errorf("failed to parse arguments: %w", err)
2929
}
30-
hostID, containerID, err := resolveContainerRef(args.ContainerID, args.Host, deps)
30+
hostID, containerID, note, err := resolveContainerRefRead(args.ContainerID, args.Host, deps)
3131
if err != nil {
3232
return nil, err
3333
}
@@ -93,6 +93,17 @@ func executeFetchContainerLogs(ctx context.Context, argsJSON string, deps ToolDe
9393

9494
return &pb.CallToolResponse{
9595
Success: true,
96-
Result: &pb.CallToolResponse_FetchLogs{FetchLogs: &pb.FetchLogsResult{ContainerName: cs.Container.Name, Entries: entries}},
96+
Result: &pb.CallToolResponse_FetchLogs{FetchLogs: &pb.FetchLogsResult{ContainerName: withResolutionNote(cs.Container.Name, note), Entries: entries}},
9797
}, nil
9898
}
99+
100+
// withResolutionNote appends the read-path resolution note (if any) to a
101+
// model-visible identity string, so the model learns which container an
102+
// ambiguous name resolved to — and that siblings exist — in the same turn,
103+
// without a round-trip. Empty note returns name unchanged.
104+
func withResolutionNote(name, note string) string {
105+
if note == "" {
106+
return name
107+
}
108+
return name + " " + note
109+
}

internal/cloud/tools_resolve.go

Lines changed: 232 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,14 @@ import (
1111
// id) plus an optional host reference (a name OR an id) into the concrete
1212
// (hostID, containerID) pair that HostService.FindContainer expects.
1313
//
14+
// This is the STRICT (write) resolver used by the destructive tools
15+
// (start/stop/restart/remove/update). It NEVER picks between multiple live
16+
// containers — acting on the wrong one is irreversible — so a name that matches
17+
// more than one container resolves only when exactly one candidate is running
18+
// (the others being stopped task corpses); otherwise it fails with the
19+
// candidate listing. Read-only tools use resolveContainerRefRead, which is
20+
// allowed to pick the most-relevant container in one shot (see that function).
21+
//
1422
// LLMs almost always have only the container name (it is what logs, events and
1523
// listings surface), so every container-scoped tool funnels through here rather
1624
// than passing the raw reference straight to FindContainer — which only matches
@@ -23,32 +31,128 @@ import (
2331
// 3. unique substring of the name
2432
//
2533
// The first tier that yields any candidates wins. An id match is always unique,
26-
// so it is never treated as ambiguous. If a name tier yields more than one
27-
// container the call fails with an error listing every candidate (name + id +
28-
// host) so the caller can disambiguate — we NEVER silently pick one. This is
29-
// what makes the write tools (stop/restart/remove/update) safe to drive by name.
34+
// so it is never treated as ambiguous. When a name tier yields more than one
35+
// container we try to break the tie by running state: if exactly one candidate
36+
// is running it is the unambiguous live referent (the others are stopped task
37+
// corpses left by Swarm redeploys / restart churn) and we resolve to it. Only
38+
// when the tie is real — zero running, or several live containers — does the
39+
// call fail with an error listing every candidate (name + id + state + host) so
40+
// the caller can disambiguate. We NEVER pick between multiple *live* containers,
41+
// which is what keeps the write tools (stop/restart/remove/update) safe by name.
3042
//
3143
// hostRef is optional: when empty the container must resolve unambiguously
3244
// across all hosts; when supplied it scopes the search to that host (matched by
3345
// id or name).
3446
func resolveContainerRef(containerRef, hostRef string, deps ToolDeps) (hostID, containerID string, err error) {
47+
tier, trimmedHostRef, scopedHostID, hostNames, err := matchContainerTier(containerRef, hostRef, deps)
48+
if err != nil {
49+
return "", "", err
50+
}
51+
52+
switch len(tier) {
53+
case 0:
54+
// No tier matched. When an explicit host was given (and resolved to a
55+
// real host id) but the listing produced no match — e.g. a host returned
56+
// a partial error and was omitted from ListAllContainers — pass the
57+
// reference straight through to FindContainer's direct lookup, exactly as
58+
// before this resolver existed. This guarantees id-based callers never
59+
// regress.
60+
if scopedHostID != "" {
61+
return scopedHostID, strings.TrimSpace(containerRef), nil
62+
}
63+
return "", "", fmt.Errorf("no container matching %q found across all connected hosts; call find_containers to list available containers", strings.TrimSpace(containerRef))
64+
case 1:
65+
return tier[0].Host, tier[0].ID, nil
66+
default:
67+
// Multiple matches. The usual benign cause is Docker Swarm (or plain
68+
// restart churn) leaving stopped task containers behind across redeploys
69+
// — the short name "svc.1" substring-matches every historical
70+
// "svc.1.<taskid>" corpse alongside the one live task. When exactly one
71+
// candidate is running it is the unambiguous live referent, so resolve to
72+
// it rather than make the caller hunt for an id. We still refuse to
73+
// choose between multiple *live* containers — that is the real ambiguity
74+
// the write tools must never guess at.
75+
if live := runningContainers(tier); len(live) == 1 {
76+
return live[0].Host, live[0].ID, nil
77+
}
78+
return "", "", ambiguousError(strings.TrimSpace(containerRef), trimmedHostRef, tier, hostNames)
79+
}
80+
}
81+
82+
// resolveContainerRefRead is the PERMISSIVE (read-only) resolver used by the
83+
// non-destructive tools (inspect_container, fetch_container_logs, stream_logs).
84+
//
85+
// It matches identically to resolveContainerRef, but when a name is ambiguous
86+
// it does NOT error — it resolves to the single most-relevant candidate in one
87+
// shot, eliminating the find-then-act round-trip that otherwise costs a whole
88+
// extra LLM call on the investigation path. The pick is safe precisely because
89+
// the tool is read-only: reading the wrong replica's logs is cheap and
90+
// recoverable, and same-service replicas are interchangeable for diagnosis,
91+
// whereas restarting the wrong one is not. The strict resolver above keeps the
92+
// write tools honest.
93+
//
94+
// Selection (bestCandidate): running containers beat stopped ones, then newest
95+
// StartedAt, then most-recently-finished/created. For the crash-loop case —
96+
// every "svc.1.*" task exited — this lands on the corpse that died most
97+
// recently, which is what the investigation is usually about.
98+
//
99+
// When it picks among several candidates it returns a one-line note naming the
100+
// chosen container and its siblings. The caller surfaces that note in the tool
101+
// RESULT (not via a round-trip), so the model gets the disambiguation context
102+
// for free in the same turn and can re-call with an explicit id if it genuinely
103+
// wants a different replica — but it is never forced to. note is empty when the
104+
// match was unique (nothing to disclose).
105+
func resolveContainerRefRead(containerRef, hostRef string, deps ToolDeps) (hostID, containerID, note string, err error) {
106+
tier, _, scopedHostID, hostNames, err := matchContainerTier(containerRef, hostRef, deps)
107+
if err != nil {
108+
return "", "", "", err
109+
}
110+
111+
switch len(tier) {
112+
case 0:
113+
if scopedHostID != "" {
114+
return scopedHostID, strings.TrimSpace(containerRef), "", nil
115+
}
116+
return "", "", "", fmt.Errorf("no container matching %q found across all connected hosts; call find_containers to list available containers", strings.TrimSpace(containerRef))
117+
case 1:
118+
return tier[0].Host, tier[0].ID, "", nil
119+
default:
120+
// Exactly one running candidate is unambiguous — the corpses are never a
121+
// valid target — so resolve cleanly with no note, identical to the strict
122+
// resolver. A note is only warranted when the read path actually exercises
123+
// its relaxation: zero running (pick the newest corpse) or several running
124+
// replicas (pick the newest live one).
125+
if live := runningContainers(tier); len(live) == 1 {
126+
return live[0].Host, live[0].ID, "", nil
127+
}
128+
best := bestCandidate(tier)
129+
note = resolutionNote(strings.TrimSpace(containerRef), best, tier, hostNames)
130+
return best.Host, best.ID, note, nil
131+
}
132+
}
133+
134+
// matchContainerTier runs the shared scoping + tiered matching used by both
135+
// resolvers and returns the winning tier (the first non-empty of exact-id,
136+
// exact-name, substring). It also returns the trimmed hostRef, the resolved
137+
// scoped host id (empty when no host was supplied), and the host-name map for
138+
// building human-readable notes/errors. An empty returned tier means no match.
139+
func matchContainerTier(containerRef, hostRef string, deps ToolDeps) (tier []container.Container, trimmedHostRef, scopedHostID string, hostNames map[string]string, err error) {
35140
containerRef = strings.TrimSpace(containerRef)
36141
if containerRef == "" {
37-
return "", "", fmt.Errorf("container_id is required")
142+
return nil, "", "", nil, fmt.Errorf("container_id is required")
38143
}
39144

40145
containers, errs := deps.HostService.ListAllContainers(deps.Labels)
41146
logHostErrors(errs)
42-
hostNames := buildHostNameMap(deps.HostService)
147+
hostNames = buildHostNameMap(deps.HostService)
43148

44149
// Scope to a host if one was supplied. The host reference may be an id or a
45150
// name; an unknown host is an explicit error rather than a silent no-match.
46-
hostRef = strings.TrimSpace(hostRef)
47-
var scopedHostID string
48-
if hostRef != "" {
49-
scopedHostID, err = resolveHostRef(hostRef, deps)
151+
trimmedHostRef = strings.TrimSpace(hostRef)
152+
if trimmedHostRef != "" {
153+
scopedHostID, err = resolveHostRef(trimmedHostRef, deps)
50154
if err != nil {
51-
return "", "", err
155+
return nil, trimmedHostRef, "", hostNames, err
52156
}
53157
filtered := containers[:0:0]
54158
for _, c := range containers {
@@ -75,27 +179,12 @@ func resolveContainerRef(containerRef, hostRef string, deps ToolDeps) (hostID, c
75179
}
76180
}
77181

78-
for _, tier := range [][]container.Container{exactID, exactName, substring} {
79-
switch len(tier) {
80-
case 0:
81-
continue
82-
case 1:
83-
return tier[0].Host, tier[0].ID, nil
84-
default:
85-
return "", "", ambiguousError(containerRef, hostRef, tier, hostNames)
182+
for _, t := range [][]container.Container{exactID, exactName, substring} {
183+
if len(t) > 0 {
184+
return t, trimmedHostRef, scopedHostID, hostNames, nil
86185
}
87186
}
88-
89-
// Legacy fallback: when an explicit host was given (and resolved to a real
90-
// host id) but the listing produced no match — e.g. a host returned a
91-
// partial error and was omitted from ListAllContainers — pass the reference
92-
// straight through to FindContainer's direct lookup, exactly as before this
93-
// resolver existed. This guarantees id-based callers never regress.
94-
if scopedHostID != "" {
95-
return scopedHostID, containerRef, nil
96-
}
97-
98-
return "", "", fmt.Errorf("no container matching %q found across all connected hosts; call find_containers to list available containers", containerRef)
187+
return nil, trimmedHostRef, scopedHostID, hostNames, nil
99188
}
100189

101190
// matchesID reports whether ref identifies the container id — either the full id
@@ -152,7 +241,7 @@ func ambiguousError(containerRef, hostRef string, candidates []container.Contain
152241
parts := make([]string, len(candidates))
153242
sameHost := true
154243
for i, c := range candidates {
155-
parts[i] = fmt.Sprintf("%s (id %s on host %s)", c.Name, shortID(c.ID), resolveHostName(c.Host, hostNames))
244+
parts[i] = fmt.Sprintf("%s (id %s, %s, on host %s)", c.Name, shortID(c.ID), describeState(c), resolveHostName(c.Host, hostNames))
156245
if c.Host != candidates[0].Host {
157246
sameHost = false
158247
}
@@ -170,6 +259,118 @@ func ambiguousError(containerRef, hostRef string, candidates []container.Contain
170259
return fmt.Errorf("%q matches multiple containers: %s. To act on the right one, %s", containerRef, strings.Join(parts, "; "), hint)
171260
}
172261

262+
// runningContainers returns only the candidates in the running state. It breaks
263+
// a multi-match tie down to the single live container when every other candidate
264+
// is a stopped corpse (the typical Swarm-redeploy / restart-churn case), which
265+
// is the one situation where picking among matches is unambiguous and safe.
266+
func runningContainers(cs []container.Container) []container.Container {
267+
live := make([]container.Container, 0, len(cs))
268+
for _, c := range cs {
269+
if strings.EqualFold(c.State, "running") {
270+
live = append(live, c)
271+
}
272+
}
273+
return live
274+
}
275+
276+
// bestCandidate picks the single most-relevant container from an ambiguous name
277+
// match for the read-only path. Ordering: running beats stopped, then newest
278+
// StartedAt, then most-recently-finished, then most-recently-created. For the
279+
// crash-loop case (every "svc.1.*" task exited) this lands on the corpse that
280+
// died most recently — the one the investigation is almost always about. The
281+
// caller guarantees len(cs) > 0.
282+
func bestCandidate(cs []container.Container) container.Container {
283+
best := cs[0]
284+
for _, c := range cs[1:] {
285+
if moreRelevant(c, best) {
286+
best = c
287+
}
288+
}
289+
return best
290+
}
291+
292+
// moreRelevant reports whether a should be preferred over b as the read-path
293+
// referent. Running always wins; among the same running-ness, the more-recently
294+
// active container wins (StartedAt, then FinishedAt, then Created).
295+
func moreRelevant(a, b container.Container) bool {
296+
aRunning, bRunning := isRunning(a), isRunning(b)
297+
if aRunning != bRunning {
298+
return aRunning
299+
}
300+
if !a.StartedAt.Equal(b.StartedAt) {
301+
return a.StartedAt.After(b.StartedAt)
302+
}
303+
if !a.FinishedAt.Equal(b.FinishedAt) {
304+
return a.FinishedAt.After(b.FinishedAt)
305+
}
306+
return a.Created.After(b.Created)
307+
}
308+
309+
func isRunning(c container.Container) bool {
310+
return strings.EqualFold(c.State, "running")
311+
}
312+
313+
// resolutionNote renders the one-line transparency note the read-only tools
314+
// surface in their result when a name resolved to one of several candidates. It
315+
// names the chosen container (state-annotated), summarizes how many siblings of
316+
// each running-ness exist, and lists the sibling names — so the model can re-call
317+
// with an explicit id if it wants a different replica, without ever being forced
318+
// to. Returned as a parenthetical prefix the tools prepend to a human-visible
319+
// field.
320+
func resolutionNote(containerRef string, chosen container.Container, candidates []container.Container, hostNames map[string]string) string {
321+
running := len(runningContainers(candidates))
322+
323+
// Annotate sibling names with their host only when the candidates span more
324+
// than one host — that is the case where the model needs the host to re-target
325+
// a sibling; on a single host it would just be noise.
326+
multiHost := false
327+
for _, c := range candidates {
328+
if c.Host != chosen.Host {
329+
multiHost = true
330+
break
331+
}
332+
}
333+
334+
siblings := make([]string, 0, len(candidates)-1)
335+
for _, c := range candidates {
336+
if c.ID == chosen.ID {
337+
continue
338+
}
339+
if multiHost {
340+
siblings = append(siblings, fmt.Sprintf("%s on host %s", c.Name, resolveHostName(c.Host, hostNames)))
341+
} else {
342+
siblings = append(siblings, c.Name)
343+
}
344+
}
345+
346+
var summary string
347+
switch {
348+
case running > 1 && isRunning(chosen):
349+
summary = fmt.Sprintf("newest of %d running replicas", running)
350+
case running == 0:
351+
summary = fmt.Sprintf("most recently active of %d stopped containers", len(candidates))
352+
default:
353+
summary = fmt.Sprintf("1 of %d matching containers", len(candidates))
354+
}
355+
356+
return fmt.Sprintf("(resolved %q → %s [%s], %s; others: %s)",
357+
containerRef, chosen.Name, describeState(chosen), summary, strings.Join(siblings, ", "))
358+
}
359+
360+
// describeState renders a candidate's state (with health when known) for the
361+
// ambiguity listing, so the caller can tell a live replica from a stopped corpse
362+
// and pick the right one. A blank state is reported as "unknown".
363+
func describeState(c container.Container) string {
364+
state := c.State
365+
if state == "" {
366+
state = "unknown"
367+
}
368+
if c.Health != "" {
369+
return fmt.Sprintf("%s (%s)", state, c.Health)
370+
}
371+
return state
372+
}
373+
173374
// shortID trims a full docker id to its conventional 12-character form for
174375
// display in errors.
175376
func shortID(id string) string {

0 commit comments

Comments
 (0)