@@ -11,6 +11,14 @@ import (
1111// id) plus an optional host reference (a name OR an id) into the concrete
1212// (hostID, containerID) pair that HostService.FindContainer expects.
1313//
14+ // This is the STRICT (write) resolver used by the destructive tools
15+ // (start/stop/restart/remove/update). It NEVER picks between multiple live
16+ // containers — acting on the wrong one is irreversible — so a name that matches
17+ // more than one container resolves only when exactly one candidate is running
18+ // (the others being stopped task corpses); otherwise it fails with the
19+ // candidate listing. Read-only tools use resolveContainerRefRead, which is
20+ // allowed to pick the most-relevant container in one shot (see that function).
21+ //
1422// LLMs almost always have only the container name (it is what logs, events and
1523// listings surface), so every container-scoped tool funnels through here rather
1624// than passing the raw reference straight to FindContainer — which only matches
@@ -23,32 +31,128 @@ import (
2331// 3. unique substring of the name
2432//
2533// The first tier that yields any candidates wins. An id match is always unique,
26- // so it is never treated as ambiguous. If a name tier yields more than one
27- // container the call fails with an error listing every candidate (name + id +
28- // host) so the caller can disambiguate — we NEVER silently pick one. This is
29- // what makes the write tools (stop/restart/remove/update) safe to drive by name.
34+ // so it is never treated as ambiguous. When a name tier yields more than one
35+ // container we try to break the tie by running state: if exactly one candidate
36+ // is running it is the unambiguous live referent (the others are stopped task
37+ // corpses left by Swarm redeploys / restart churn) and we resolve to it. Only
38+ // when the tie is real — zero running, or several live containers — does the
39+ // call fail with an error listing every candidate (name + id + state + host) so
40+ // the caller can disambiguate. We NEVER pick between multiple *live* containers,
41+ // which is what keeps the write tools (stop/restart/remove/update) safe by name.
3042//
3143// hostRef is optional: when empty the container must resolve unambiguously
3244// across all hosts; when supplied it scopes the search to that host (matched by
3345// id or name).
3446func resolveContainerRef (containerRef , hostRef string , deps ToolDeps ) (hostID , containerID string , err error ) {
47+ tier , trimmedHostRef , scopedHostID , hostNames , err := matchContainerTier (containerRef , hostRef , deps )
48+ if err != nil {
49+ return "" , "" , err
50+ }
51+
52+ switch len (tier ) {
53+ case 0 :
54+ // No tier matched. When an explicit host was given (and resolved to a
55+ // real host id) but the listing produced no match — e.g. a host returned
56+ // a partial error and was omitted from ListAllContainers — pass the
57+ // reference straight through to FindContainer's direct lookup, exactly as
58+ // before this resolver existed. This guarantees id-based callers never
59+ // regress.
60+ if scopedHostID != "" {
61+ return scopedHostID , strings .TrimSpace (containerRef ), nil
62+ }
63+ return "" , "" , fmt .Errorf ("no container matching %q found across all connected hosts; call find_containers to list available containers" , strings .TrimSpace (containerRef ))
64+ case 1 :
65+ return tier [0 ].Host , tier [0 ].ID , nil
66+ default :
67+ // Multiple matches. The usual benign cause is Docker Swarm (or plain
68+ // restart churn) leaving stopped task containers behind across redeploys
69+ // — the short name "svc.1" substring-matches every historical
70+ // "svc.1.<taskid>" corpse alongside the one live task. When exactly one
71+ // candidate is running it is the unambiguous live referent, so resolve to
72+ // it rather than make the caller hunt for an id. We still refuse to
73+ // choose between multiple *live* containers — that is the real ambiguity
74+ // the write tools must never guess at.
75+ if live := runningContainers (tier ); len (live ) == 1 {
76+ return live [0 ].Host , live [0 ].ID , nil
77+ }
78+ return "" , "" , ambiguousError (strings .TrimSpace (containerRef ), trimmedHostRef , tier , hostNames )
79+ }
80+ }
81+
82+ // resolveContainerRefRead is the PERMISSIVE (read-only) resolver used by the
83+ // non-destructive tools (inspect_container, fetch_container_logs, stream_logs).
84+ //
85+ // It matches identically to resolveContainerRef, but when a name is ambiguous
86+ // it does NOT error — it resolves to the single most-relevant candidate in one
87+ // shot, eliminating the find-then-act round-trip that otherwise costs a whole
88+ // extra LLM call on the investigation path. The pick is safe precisely because
89+ // the tool is read-only: reading the wrong replica's logs is cheap and
90+ // recoverable, and same-service replicas are interchangeable for diagnosis,
91+ // whereas restarting the wrong one is not. The strict resolver above keeps the
92+ // write tools honest.
93+ //
94+ // Selection (bestCandidate): running containers beat stopped ones, then newest
95+ // StartedAt, then most-recently-finished/created. For the crash-loop case —
96+ // every "svc.1.*" task exited — this lands on the corpse that died most
97+ // recently, which is what the investigation is usually about.
98+ //
99+ // When it picks among several candidates it returns a one-line note naming the
100+ // chosen container and its siblings. The caller surfaces that note in the tool
101+ // RESULT (not via a round-trip), so the model gets the disambiguation context
102+ // for free in the same turn and can re-call with an explicit id if it genuinely
103+ // wants a different replica — but it is never forced to. note is empty when the
104+ // match was unique (nothing to disclose).
105+ func resolveContainerRefRead (containerRef , hostRef string , deps ToolDeps ) (hostID , containerID , note string , err error ) {
106+ tier , _ , scopedHostID , hostNames , err := matchContainerTier (containerRef , hostRef , deps )
107+ if err != nil {
108+ return "" , "" , "" , err
109+ }
110+
111+ switch len (tier ) {
112+ case 0 :
113+ if scopedHostID != "" {
114+ return scopedHostID , strings .TrimSpace (containerRef ), "" , nil
115+ }
116+ return "" , "" , "" , fmt .Errorf ("no container matching %q found across all connected hosts; call find_containers to list available containers" , strings .TrimSpace (containerRef ))
117+ case 1 :
118+ return tier [0 ].Host , tier [0 ].ID , "" , nil
119+ default :
120+ // Exactly one running candidate is unambiguous — the corpses are never a
121+ // valid target — so resolve cleanly with no note, identical to the strict
122+ // resolver. A note is only warranted when the read path actually exercises
123+ // its relaxation: zero running (pick the newest corpse) or several running
124+ // replicas (pick the newest live one).
125+ if live := runningContainers (tier ); len (live ) == 1 {
126+ return live [0 ].Host , live [0 ].ID , "" , nil
127+ }
128+ best := bestCandidate (tier )
129+ note = resolutionNote (strings .TrimSpace (containerRef ), best , tier , hostNames )
130+ return best .Host , best .ID , note , nil
131+ }
132+ }
133+
134+ // matchContainerTier runs the shared scoping + tiered matching used by both
135+ // resolvers and returns the winning tier (the first non-empty of exact-id,
136+ // exact-name, substring). It also returns the trimmed hostRef, the resolved
137+ // scoped host id (empty when no host was supplied), and the host-name map for
138+ // building human-readable notes/errors. An empty returned tier means no match.
139+ func matchContainerTier (containerRef , hostRef string , deps ToolDeps ) (tier []container.Container , trimmedHostRef , scopedHostID string , hostNames map [string ]string , err error ) {
35140 containerRef = strings .TrimSpace (containerRef )
36141 if containerRef == "" {
37- return "" , "" , fmt .Errorf ("container_id is required" )
142+ return nil , "" , "" , nil , fmt .Errorf ("container_id is required" )
38143 }
39144
40145 containers , errs := deps .HostService .ListAllContainers (deps .Labels )
41146 logHostErrors (errs )
42- hostNames : = buildHostNameMap (deps .HostService )
147+ hostNames = buildHostNameMap (deps .HostService )
43148
44149 // Scope to a host if one was supplied. The host reference may be an id or a
45150 // name; an unknown host is an explicit error rather than a silent no-match.
46- hostRef = strings .TrimSpace (hostRef )
47- var scopedHostID string
48- if hostRef != "" {
49- scopedHostID , err = resolveHostRef (hostRef , deps )
151+ trimmedHostRef = strings .TrimSpace (hostRef )
152+ if trimmedHostRef != "" {
153+ scopedHostID , err = resolveHostRef (trimmedHostRef , deps )
50154 if err != nil {
51- return "" , "" , err
155+ return nil , trimmedHostRef , "" , hostNames , err
52156 }
53157 filtered := containers [:0 :0 ]
54158 for _ , c := range containers {
@@ -75,27 +179,12 @@ func resolveContainerRef(containerRef, hostRef string, deps ToolDeps) (hostID, c
75179 }
76180 }
77181
78- for _ , tier := range [][]container.Container {exactID , exactName , substring } {
79- switch len (tier ) {
80- case 0 :
81- continue
82- case 1 :
83- return tier [0 ].Host , tier [0 ].ID , nil
84- default :
85- return "" , "" , ambiguousError (containerRef , hostRef , tier , hostNames )
182+ for _ , t := range [][]container.Container {exactID , exactName , substring } {
183+ if len (t ) > 0 {
184+ return t , trimmedHostRef , scopedHostID , hostNames , nil
86185 }
87186 }
88-
89- // Legacy fallback: when an explicit host was given (and resolved to a real
90- // host id) but the listing produced no match — e.g. a host returned a
91- // partial error and was omitted from ListAllContainers — pass the reference
92- // straight through to FindContainer's direct lookup, exactly as before this
93- // resolver existed. This guarantees id-based callers never regress.
94- if scopedHostID != "" {
95- return scopedHostID , containerRef , nil
96- }
97-
98- return "" , "" , fmt .Errorf ("no container matching %q found across all connected hosts; call find_containers to list available containers" , containerRef )
187+ return nil , trimmedHostRef , scopedHostID , hostNames , nil
99188}
100189
101190// matchesID reports whether ref identifies the container id — either the full id
@@ -152,7 +241,7 @@ func ambiguousError(containerRef, hostRef string, candidates []container.Contain
152241 parts := make ([]string , len (candidates ))
153242 sameHost := true
154243 for i , c := range candidates {
155- parts [i ] = fmt .Sprintf ("%s (id %s on host %s)" , c .Name , shortID (c .ID ), resolveHostName (c .Host , hostNames ))
244+ parts [i ] = fmt .Sprintf ("%s (id %s, %s, on host %s)" , c .Name , shortID (c .ID ), describeState ( c ), resolveHostName (c .Host , hostNames ))
156245 if c .Host != candidates [0 ].Host {
157246 sameHost = false
158247 }
@@ -170,6 +259,118 @@ func ambiguousError(containerRef, hostRef string, candidates []container.Contain
170259 return fmt .Errorf ("%q matches multiple containers: %s. To act on the right one, %s" , containerRef , strings .Join (parts , "; " ), hint )
171260}
172261
262+ // runningContainers returns only the candidates in the running state. It breaks
263+ // a multi-match tie down to the single live container when every other candidate
264+ // is a stopped corpse (the typical Swarm-redeploy / restart-churn case), which
265+ // is the one situation where picking among matches is unambiguous and safe.
266+ func runningContainers (cs []container.Container ) []container.Container {
267+ live := make ([]container.Container , 0 , len (cs ))
268+ for _ , c := range cs {
269+ if strings .EqualFold (c .State , "running" ) {
270+ live = append (live , c )
271+ }
272+ }
273+ return live
274+ }
275+
276+ // bestCandidate picks the single most-relevant container from an ambiguous name
277+ // match for the read-only path. Ordering: running beats stopped, then newest
278+ // StartedAt, then most-recently-finished, then most-recently-created. For the
279+ // crash-loop case (every "svc.1.*" task exited) this lands on the corpse that
280+ // died most recently — the one the investigation is almost always about. The
281+ // caller guarantees len(cs) > 0.
282+ func bestCandidate (cs []container.Container ) container.Container {
283+ best := cs [0 ]
284+ for _ , c := range cs [1 :] {
285+ if moreRelevant (c , best ) {
286+ best = c
287+ }
288+ }
289+ return best
290+ }
291+
292+ // moreRelevant reports whether a should be preferred over b as the read-path
293+ // referent. Running always wins; among the same running-ness, the more-recently
294+ // active container wins (StartedAt, then FinishedAt, then Created).
295+ func moreRelevant (a , b container.Container ) bool {
296+ aRunning , bRunning := isRunning (a ), isRunning (b )
297+ if aRunning != bRunning {
298+ return aRunning
299+ }
300+ if ! a .StartedAt .Equal (b .StartedAt ) {
301+ return a .StartedAt .After (b .StartedAt )
302+ }
303+ if ! a .FinishedAt .Equal (b .FinishedAt ) {
304+ return a .FinishedAt .After (b .FinishedAt )
305+ }
306+ return a .Created .After (b .Created )
307+ }
308+
309+ func isRunning (c container.Container ) bool {
310+ return strings .EqualFold (c .State , "running" )
311+ }
312+
313+ // resolutionNote renders the one-line transparency note the read-only tools
314+ // surface in their result when a name resolved to one of several candidates. It
315+ // names the chosen container (state-annotated), summarizes how many siblings of
316+ // each running-ness exist, and lists the sibling names — so the model can re-call
317+ // with an explicit id if it wants a different replica, without ever being forced
318+ // to. Returned as a parenthetical prefix the tools prepend to a human-visible
319+ // field.
320+ func resolutionNote (containerRef string , chosen container.Container , candidates []container.Container , hostNames map [string ]string ) string {
321+ running := len (runningContainers (candidates ))
322+
323+ // Annotate sibling names with their host only when the candidates span more
324+ // than one host — that is the case where the model needs the host to re-target
325+ // a sibling; on a single host it would just be noise.
326+ multiHost := false
327+ for _ , c := range candidates {
328+ if c .Host != chosen .Host {
329+ multiHost = true
330+ break
331+ }
332+ }
333+
334+ siblings := make ([]string , 0 , len (candidates )- 1 )
335+ for _ , c := range candidates {
336+ if c .ID == chosen .ID {
337+ continue
338+ }
339+ if multiHost {
340+ siblings = append (siblings , fmt .Sprintf ("%s on host %s" , c .Name , resolveHostName (c .Host , hostNames )))
341+ } else {
342+ siblings = append (siblings , c .Name )
343+ }
344+ }
345+
346+ var summary string
347+ switch {
348+ case running > 1 && isRunning (chosen ):
349+ summary = fmt .Sprintf ("newest of %d running replicas" , running )
350+ case running == 0 :
351+ summary = fmt .Sprintf ("most recently active of %d stopped containers" , len (candidates ))
352+ default :
353+ summary = fmt .Sprintf ("1 of %d matching containers" , len (candidates ))
354+ }
355+
356+ return fmt .Sprintf ("(resolved %q → %s [%s], %s; others: %s)" ,
357+ containerRef , chosen .Name , describeState (chosen ), summary , strings .Join (siblings , ", " ))
358+ }
359+
360+ // describeState renders a candidate's state (with health when known) for the
361+ // ambiguity listing, so the caller can tell a live replica from a stopped corpse
362+ // and pick the right one. A blank state is reported as "unknown".
363+ func describeState (c container.Container ) string {
364+ state := c .State
365+ if state == "" {
366+ state = "unknown"
367+ }
368+ if c .Health != "" {
369+ return fmt .Sprintf ("%s (%s)" , state , c .Health )
370+ }
371+ return state
372+ }
373+
173374// shortID trims a full docker id to its conventional 12-character form for
174375// display in errors.
175376func shortID (id string ) string {
0 commit comments