diff --git a/packages/agent-core/package.json b/packages/agent-core/package.json
index 2437390..ce5964f 100644
--- a/packages/agent-core/package.json
+++ b/packages/agent-core/package.json
@@ -8,6 +8,9 @@
     "dev": "tsc --watch",
     "build": "tsc",
     "check:harness": "tsx src/harness/__fixtures__/check.ts",
+    "check:session-registry": "tsx src/__fixtures__/check-session-registry.ts",
+    "check:mcp-shim": "tsc && tsx src/harness/__fixtures__/check-mcp-shim.ts",
+    "check:mcp-shim-runtime": "tsc && tsx src/harness/__fixtures__/check-mcp-shim-runtime.ts",
     "eval": "tsx src/evals/index.ts",
     "eval:tools": "tsx src/evals/index.ts tools",
     "eval:safety": "tsx src/evals/index.ts safety",
diff --git a/packages/agent-core/src/__fixtures__/check-session-registry.ts b/packages/agent-core/src/__fixtures__/check-session-registry.ts
new file mode 100644
index 0000000..1afd551
--- /dev/null
+++ b/packages/agent-core/src/__fixtures__/check-session-registry.ts
@@ -0,0 +1,457 @@
+/**
+ * SessionRegistry integration checks.
+ *
+ * Run with:  pnpm --filter @anton/agent-core check:session-registry
+ *
+ * Exercises the contract the server depends on:
+ *   - Partitioned LRU: sessions in one pool don't evict sessions in
+ *     another.
+ *   - Pinning: a pinned session is skipped during eviction even when
+ *     it's the oldest candidate.
+ *   - Recency floor: sessions inserted within the floor window are
+ *     protected from eviction; the pool temporarily goes over capacity
+ *     and logs a warn.
+ *   - `delete()` awaits `shutdown()` so callers can block on cleanup
+ *     (what the fix for the destroy-leak depends on).
+ *   - Replace-in-place on duplicate id doesn't shut down the prior
+ *     session (that's the responsibility of explicit switch paths).
+ */
+
+import { SessionRegistry, type Shutdownable } from '../session-registry.js'
+
+class FakeSession implements Shutdownable {
+  shutdownCount = 0
+  shutdownAt = 0
+  constructor(public readonly id: string) {}
+  async shutdown(): Promise<void> {
+    this.shutdownCount += 1
+    this.shutdownAt = Date.now()
+  }
+}
+
+interface Case {
+  name: string
+  run: () => Promise<string | null> // null = pass; string = failure message
+}
+
+async function sleep(ms: number): Promise<void> {
+  await new Promise((r) => setTimeout(r, ms))
+}
+
+const cases: Case[] = [
+  {
+    name: 'put then get returns the same session',
+    run: async () => {
+      const reg = new SessionRegistry<FakeSession>()
+      const a = new FakeSession('a')
+      reg.put('a', a, 'conversation')
+      return reg.get('a') === a ? null : 'get did not return put session'
+    },
+  },
+  {
+    name: 'delete() awaits shutdown()',
+    run: async () => {
+      const reg = new SessionRegistry<FakeSession>()
+      const a = new FakeSession('a')
+      reg.put('a', a, 'conversation')
+      await reg.delete('a')
+      if (a.shutdownCount !== 1) return `shutdown called ${a.shutdownCount}x, expected 1`
+      if (reg.has('a')) return 'session still registered after delete'
+      return null
+    },
+  },
+  {
+    name: 'partitioned pools: filling conversation does not evict routine',
+    run: async () => {
+      const reg = new SessionRegistry<FakeSession>({
+        pools: {
+          conversation: { maxSessions: 2, recencyFloorMs: 0 },
+          routine: { maxSessions: 2, recencyFloorMs: 0 },
+          ephemeral: { maxSessions: 2, recencyFloorMs: 0 },
+        },
+      })
+      const r1 = new FakeSession('r1')
+      reg.put('r1', r1, 'routine')
+      // Fill conversation pool past capacity.
+      for (let i = 0; i < 5; i++) {
+        reg.put(`c${i}`, new FakeSession(`c${i}`), 'conversation')
+        await sleep(1) // ensure distinct lastAccess
+      }
+      // Let evictions fire-and-forget through the microtask queue.
+      await sleep(20)
+      if (!reg.has('r1')) return 'routine session was evicted by conversation pressure'
+      if (r1.shutdownCount !== 0) return 'routine session was shut down'
+      return null
+    },
+  },
+  {
+    name: 'LRU eviction picks the oldest non-pinned entry',
+    run: async () => {
+      const reg = new SessionRegistry<FakeSession>({
+        pools: {
+          conversation: { maxSessions: 2, recencyFloorMs: 0 },
+          routine: { maxSessions: 40, recencyFloorMs: 0 },
+          ephemeral: { maxSessions: 20, recencyFloorMs: 0 },
+        },
+      })
+      const a = new FakeSession('a')
+      const b = new FakeSession('b')
+      reg.put('a', a, 'conversation')
+      await sleep(5)
+      reg.put('b', b, 'conversation')
+      await sleep(5)
+      // Touch 'a' so 'b' becomes the LRU victim.
+      reg.get('a')
+      await sleep(5)
+      reg.put('c', new FakeSession('c'), 'conversation')
+      await sleep(20) // let background shutdown fire
+      if (!reg.has('a')) return "'a' was evicted despite being touched more recently"
+      if (reg.has('b')) return "'b' should have been evicted as LRU"
+      if (b.shutdownCount !== 1) return `b.shutdown called ${b.shutdownCount}x, expected 1`
+      return null
+    },
+  },
+  {
+    name: 'pinned session is skipped during eviction',
+    run: async () => {
+      const reg = new SessionRegistry<FakeSession>({
+        pools: {
+          conversation: { maxSessions: 2, recencyFloorMs: 0 },
+          routine: { maxSessions: 40, recencyFloorMs: 0 },
+          ephemeral: { maxSessions: 20, recencyFloorMs: 0 },
+        },
+      })
+      const a = new FakeSession('a')
+      const b = new FakeSession('b')
+      reg.put('a', a, 'conversation')
+      await sleep(5)
+      reg.put('b', b, 'conversation')
+      await sleep(5)
+      reg.pin('a') // oldest, but pinned
+      reg.put('c', new FakeSession('c'), 'conversation')
+      await sleep(20)
+      if (!reg.has('a')) return 'pinned session was evicted'
+      if (reg.has('b')) return 'b should have been evicted (next-oldest non-pinned)'
+      return null
+    },
+  },
+  {
+    name: 'recency floor keeps pool over capacity when no victim is old enough',
+    run: async () => {
+      const reg = new SessionRegistry<FakeSession>({
+        pools: {
+          conversation: { maxSessions: 1, recencyFloorMs: 5_000 },
+          routine: { maxSessions: 40, recencyFloorMs: 0 },
+          ephemeral: { maxSessions: 20, recencyFloorMs: 0 },
+        },
+      })
+      const a = new FakeSession('a')
+      const b = new FakeSession('b')
+      reg.put('a', a, 'conversation')
+      reg.put('b', b, 'conversation') // over capacity but 'a' is within floor
+      await sleep(20)
+      if (!reg.has('a')) return "'a' was evicted within its recency floor"
+      if (!reg.has('b')) return "'b' insert should have succeeded even with no eligible victim"
+      if (a.shutdownCount !== 0) return 'a was shut down while within recency floor'
+      return null
+    },
+  },
+  {
+    name: 'replace-in-place on duplicate id does not shut down prior session',
+    run: async () => {
+      const reg = new SessionRegistry<FakeSession>()
+      const a1 = new FakeSession('a')
+      const a2 = new FakeSession('a')
+      reg.put('a', a1, 'conversation')
+      reg.put('a', a2, 'conversation')
+      if (a1.shutdownCount !== 0) return 'replace-in-place invoked shutdown on prior session'
+      if (reg.get('a') !== a2) return 'replaced session not returned by get()'
+      return null
+    },
+  },
+  {
+    name: 'peek() does not bump recency',
+    run: async () => {
+      const reg = new SessionRegistry<FakeSession>({
+        pools: {
+          conversation: { maxSessions: 2, recencyFloorMs: 0 },
+          routine: { maxSessions: 40, recencyFloorMs: 0 },
+          ephemeral: { maxSessions: 20, recencyFloorMs: 0 },
+        },
+      })
+      const a = new FakeSession('a')
+      const b = new FakeSession('b')
+      reg.put('a', a, 'conversation')
+      await sleep(5)
+      reg.put('b', b, 'conversation')
+      await sleep(5)
+      reg.peek('a') // must NOT touch LRU
+      await sleep(5)
+      reg.put('c', new FakeSession('c'), 'conversation')
+      await sleep(20)
+      // 'a' is oldest and peek didn't touch it, so 'a' should be evicted.
+      if (reg.has('a')) return 'peek() bumped recency (a should have been evicted)'
+      return null
+    },
+  },
+  {
+    name: 'shutdownAll shuts down every registered session',
+    run: async () => {
+      const reg = new SessionRegistry<FakeSession>()
+      const a = new FakeSession('a')
+      const b = new FakeSession('b')
+      const c = new FakeSession('c')
+      reg.put('a', a, 'conversation')
+      reg.put('b', b, 'routine')
+      reg.put('c', c, 'ephemeral')
+      await reg.shutdownAll()
+      if (a.shutdownCount !== 1 || b.shutdownCount !== 1 || c.shutdownCount !== 1) {
+        return `shutdown counts: a=${a.shutdownCount} b=${b.shutdownCount} c=${c.shutdownCount}`
+      }
+      if (reg.size() !== 0) return 'registry not empty after shutdownAll'
+      return null
+    },
+  },
+  {
+    name: 'session without shutdown() is handled without throwing',
+    run: async () => {
+      const reg = new SessionRegistry()
+      reg.put('noop', { id: 'noop' }, 'conversation')
+      await reg.delete('noop')
+      if (reg.has('noop')) return 'session still registered'
+      return null
+    },
+  },
+  {
+    name: 'shutdown throwing does not break delete',
+    run: async () => {
+      const reg = new SessionRegistry()
+      const bad = {
+        id: 'bad',
+        shutdown: () => {
+          throw new Error('boom')
+        },
+      }
+      reg.put('bad', bad, 'conversation')
+      await reg.delete('bad') // must resolve
+      if (reg.has('bad')) return 'session still registered after delete'
+      return null
+    },
+  },
+  {
+    name: 'onEvict fires on LRU eviction with (id, session) and before shutdown',
+    run: async () => {
+      const calls: Array<{ id: string; shutdownAtCall: number }> = []
+      const reg = new SessionRegistry<FakeSession>({
+        pools: {
+          conversation: { maxSessions: 2, recencyFloorMs: 0 },
+          routine: { maxSessions: 40, recencyFloorMs: 0 },
+          ephemeral: { maxSessions: 20, recencyFloorMs: 0 },
+        },
+        onEvict: (id, session) => {
+          calls.push({ id, shutdownAtCall: session.shutdownCount })
+        },
+      })
+      const a = new FakeSession('a')
+      const b = new FakeSession('b')
+      reg.put('a', a, 'conversation')
+      await sleep(5)
+      reg.put('b', b, 'conversation')
+      await sleep(5)
+      reg.put('c', new FakeSession('c'), 'conversation') // evicts 'a'
+      await sleep(20)
+      if (calls.length !== 1) return `onEvict called ${calls.length}x, expected 1`
+      if (calls[0].id !== 'a') return `onEvict got id=${calls[0].id}, expected a`
+      if (calls[0].shutdownAtCall !== 0)
+        return 'onEvict ran after shutdown — must run before so external bookkeeping is cleared first'
+      if (a.shutdownCount !== 1) return `a.shutdown called ${a.shutdownCount}x, expected 1`
+      return null
+    },
+  },
+  {
+    name: 'onEvict does NOT fire on explicit delete',
+    run: async () => {
+      let evictCalls = 0
+      const reg = new SessionRegistry<FakeSession>({
+        onEvict: () => {
+          evictCalls += 1
+        },
+      })
+      const a = new FakeSession('a')
+      reg.put('a', a, 'conversation')
+      await reg.delete('a')
+      if (evictCalls !== 0) return `onEvict called ${evictCalls}x on delete, expected 0`
+      if (a.shutdownCount !== 1) return 'shutdown did not run on delete'
+      return null
+    },
+  },
+  {
+    name: 'onEvict does NOT fire on shutdownAll',
+    run: async () => {
+      let evictCalls = 0
+      const reg = new SessionRegistry<FakeSession>({
+        onEvict: () => {
+          evictCalls += 1
+        },
+      })
+      reg.put('a', new FakeSession('a'), 'conversation')
+      reg.put('b', new FakeSession('b'), 'routine')
+      await reg.shutdownAll()
+      if (evictCalls !== 0) return `onEvict called ${evictCalls}x on shutdownAll, expected 0`
+      return null
+    },
+  },
+  {
+    name: 'onEvict throwing does not block shutdown of the evicted session',
+    run: async () => {
+      const reg = new SessionRegistry<FakeSession>({
+        pools: {
+          conversation: { maxSessions: 1, recencyFloorMs: 0 },
+          routine: { maxSessions: 40, recencyFloorMs: 0 },
+          ephemeral: { maxSessions: 20, recencyFloorMs: 0 },
+        },
+        onEvict: () => {
+          throw new Error('boom')
+        },
+      })
+      const a = new FakeSession('a')
+      reg.put('a', a, 'conversation')
+      await sleep(5)
+      reg.put('b', new FakeSession('b'), 'conversation') // evicts 'a'
+      await sleep(20)
+      if (a.shutdownCount !== 1)
+        return `a.shutdown called ${a.shutdownCount}x despite onEvict throw, expected 1`
+      return null
+    },
+  },
+  {
+    name: 'destroy→create race: put() during delete()-await inserts cleanly and has() sees new',
+    run: async () => {
+      // Simulates the server.ts race: handleSessionDestroy calls
+      // sessions.delete(X) which sync-removes then awaits shutdown. While
+      // that await is suspended, a subsequent session_create calls
+      // sessions.put(X, new). The registry must accept the put (map slot
+      // is free after sync-remove) and has(X) must return true from the
+      // destroy path's perspective once its await resolves.
+      const reg = new SessionRegistry<FakeSession>()
+      const oldSession: FakeSession = {
+        id: 'x',
+        shutdownCount: 0,
+        shutdownAt: 0,
+        shutdown: async () => {
+          await sleep(30) // slow shutdown, like killing a codex subprocess
+          oldSession.shutdownCount += 1
+        },
+      } as FakeSession
+      reg.put('x', oldSession, 'conversation')
+
+      const deletePromise = reg.delete('x')
+      // Immediately after kicking off delete, the sync map-remove has
+      // already happened. A concurrent put with the same id must succeed.
+      await sleep(1)
+      const newSession = new FakeSession('x')
+      reg.put('x', newSession, 'conversation')
+
+      await deletePromise
+
+      if (!reg.has('x')) return 'new session missing from registry after delete of old resolved'
+      if (reg.peek('x') !== newSession) return 'registry has old session, not new'
+      if (oldSession.shutdownCount !== 1)
+        return `old session shutdown count ${oldSession.shutdownCount}, expected 1`
+      if (newSession.shutdownCount !== 0) return 'new session was shut down by old delete'
+      return null
+    },
+  },
+  {
+    name: 'sizeOf reports per-pool counts',
+    run: async () => {
+      const reg = new SessionRegistry<FakeSession>()
+      reg.put('c1', new FakeSession('c1'), 'conversation')
+      reg.put('c2', new FakeSession('c2'), 'conversation')
+      reg.put('r1', new FakeSession('r1'), 'routine')
+      if (reg.sizeOf('conversation') !== 2) return `conversation size ${reg.sizeOf('conversation')}`
+      if (reg.sizeOf('routine') !== 1) return `routine size ${reg.sizeOf('routine')}`
+      if (reg.sizeOf('ephemeral') !== 0) return `ephemeral size ${reg.sizeOf('ephemeral')}`
+      return null
+    },
+  },
+  {
+    name: 'eviction→replace race: onEvict is skipped when id was re-registered before async eviction fired',
+    run: async () => {
+      // Exercise the runEviction race guard: put() at capacity
+      // sync-removes the victim's map slot and fires async runEviction.
+      // Before that microtask runs, a fresh put(sameId) replaces the
+      // slot with a NEW session. When runEviction resumes, it must
+      // detect the replacement and skip onEvict (which is keyed on id)
+      // so the new session's external bookkeeping isn't wiped.
+      const onEvictCalls: Array<{ id: string; session: FakeSession }> = []
+      const reg = new SessionRegistry<FakeSession>({
+        pools: {
+          conversation: { maxSessions: 1, recencyFloorMs: 0 },
+          routine: { maxSessions: 40, recencyFloorMs: 0 },
+          ephemeral: { maxSessions: 20, recencyFloorMs: 0 },
+        },
+        onEvict: (id, session) => {
+          onEvictCalls.push({ id, session })
+        },
+      })
+      const victim = new FakeSession('a')
+      reg.put('a', victim, 'conversation')
+      await sleep(5)
+      // This triggers eviction of 'a' (async). The map slot for 'a' is
+      // already free by the time put('b') returns.
+      const bSession = new FakeSession('b')
+      reg.put('b', bSession, 'conversation')
+      // Immediately re-register 'a' with a fresh session while the
+      // async runEviction for the original 'a' is still pending its
+      // microtask turn. This also forces 'b' to be evicted (pool cap=1).
+      const replacement = new FakeSession('a')
+      reg.put('a', replacement, 'conversation')
+      await sleep(30)
+      // Victim's shutdown must still run (subprocess must be reaped).
+      if (victim.shutdownCount !== 1)
+        return `victim.shutdown called ${victim.shutdownCount}x, expected 1`
+      // Replacement must be untouched — no shutdown, still present.
+      if (replacement.shutdownCount !== 0)
+        return `replacement was shut down ${replacement.shutdownCount}x — id reuse must not cascade`
+      if (reg.peek('a') !== replacement) return 'replacement was evicted from registry'
+      // Critical assertion: onEvict must NOT have been called for 'a',
+      // because by the time the async body resumed, the 'a' slot pointed
+      // to the replacement. Calling onEvict('a', …) would wipe the
+      // replacement's external state. A single onEvict('b', …) IS
+      // expected since 'b' legitimately got evicted to make room for the
+      // replacement.
+      const aEvicts = onEvictCalls.filter((c) => c.id === 'a').length
+      if (aEvicts !== 0) return `onEvict fired for 'a' ${aEvicts}x despite id reuse, expected 0`
+      const bEvicts = onEvictCalls.filter((c) => c.id === 'b' && c.session === bSession).length
+      if (bEvicts !== 1)
+        return `onEvict should have fired exactly once for 'b', got ${bEvicts}`
+      return null
+    },
+  },
+]
+
+async function main(): Promise<void> {
+  let failed = 0
+  for (const c of cases) {
+    try {
+      const err = await c.run()
+      if (err === null) {
+        console.log(`✓ session-registry: ${c.name}`)
+      } else {
+        failed++
+        console.error(`✗ session-registry: ${c.name} — ${err}`)
+      }
+    } catch (err) {
+      failed++
+      console.error(`✗ session-registry: ${c.name} (threw)`, err)
+    }
+  }
+  if (failed > 0) {
+    console.error(`\n${failed}/${cases.length} session-registry checks failed`)
+    process.exit(1)
+  }
+  console.log(`\nAll ${cases.length} session-registry checks passed`)
+}
+
+void main()
diff --git a/packages/agent-core/src/harness/__fixtures__/check-mcp-shim-runtime.ts b/packages/agent-core/src/harness/__fixtures__/check-mcp-shim-runtime.ts
new file mode 100644
index 0000000..1691a9a
--- /dev/null
+++ b/packages/agent-core/src/harness/__fixtures__/check-mcp-shim-runtime.ts
@@ -0,0 +1,557 @@
+/**
+ * MCP shim runtime integration checks — exercises the connection state
+ * machine against a spawned compiled shim, not just the initialize
+ * probe.
+ *
+ * Run with:  pnpm --filter @anton/agent-core check:mcp-shim-runtime
+ *
+ * These checks used to live as comments in the "why did this miss it"
+ * post-mortem: every one of them targets a failure mode that the
+ * pre-rewrite shim hung on.
+ *
+ * Covered:
+ *   - Happy path: shim connects, auths, forwards tools/list, relays
+ *     result back on stdout.
+ *   - Structured log notifications from shim reach the fake server.
+ *   - Socket close mid-session: every pending request fails FAST with a
+ *     structured `ipc disconnected: …` message rather than timing out,
+ *     and the next call transparently reconnects.
+ *   - Server-sent `bye`: same teardown behavior but reason surfaces as
+ *     `ipc disconnected: bye: …`.
+ *   - Ping timeout: if the server stalls after auth, the shim's
+ *     periodic ping detects it within (interval + pingTimeout) and
+ *     transitions to lost without waiting on any outstanding request.
+ */
+
+import { spawn, type ChildProcess } from 'node:child_process'
+import { existsSync, mkdtempSync, rmSync } from 'node:fs'
+import * as net from 'node:net'
+import { tmpdir } from 'node:os'
+import { dirname, join } from 'node:path'
+import { createInterface } from 'node:readline'
+import { fileURLToPath } from 'node:url'
+
+const __dirname = fileURLToPath(new URL('.', import.meta.url))
+const COMPILED_SHIM = join(__dirname, '../../../dist/harness/anton-mcp-shim.js')
+
+if (!existsSync(COMPILED_SHIM)) {
+  console.error(`✗ compiled shim missing at ${COMPILED_SHIM}`)
+  console.error('  Run `pnpm --filter @anton/agent-core build` first.')
+  process.exit(1)
+}
+
+// ── Fake IPC server ─────────────────────────────────────────────────
+
+interface IncomingFrame {
+  id?: number | string
+  method?: string
+  params?: Record<string, unknown>
+}
+
+interface FakeServerOpts {
+  /** Accept auth and respond. Default true. */
+  autoAuth?: boolean
+  /** If set, every tools/list returns this many fake tools. */
+  toolsCount?: number
+  /**
+   * If set, server responds to tools/call with `{content: [{type: 'text', text}]}`.
+   * Otherwise, tools/call is ignored (never replied) — used to simulate a stall.
+   */
+  toolsCallReply?: string | null
+  /** Never reply to ping. Used to trigger the ping timeout transition. */
+  ignorePing?: boolean
+}
+
+interface ConnCtx {
+  socket: net.Socket
+  sessionId: string | null
+  receivedLogs: Array<{ level: string; msg: string; fields: Record<string, unknown> }>
+  receivedPings: number
+  receivedToolCalls: Array<{ id: number | string; name: string }>
+}
+
+class FakeServer {
+  readonly socketPath: string
+  private server: net.Server
+  readonly conns = new Set<ConnCtx>()
+  private opts: FakeServerOpts
+
+  constructor(opts: FakeServerOpts = {}) {
+    this.opts = { autoAuth: true, toolsCount: 2, toolsCallReply: 'ok', ...opts }
+    const dir = mkdtempSync(join(tmpdir(), 'anton-shim-rt-'))
+    this.socketPath = join(dir, 'ipc.sock')
+    this.server = net.createServer((sock) => this.onConnection(sock))
+  }
+
+  async start(): Promise<void> {
+    await new Promise<void>((resolve) => this.server.listen(this.socketPath, resolve))
+  }
+
+  async stop(): Promise<void> {
+    for (const ctx of this.conns) {
+      try {
+        ctx.socket.destroy()
+      } catch {
+        /* */
+      }
+    }
+    this.conns.clear()
+    await new Promise<void>((resolve) => this.server.close(() => resolve()))
+    try {
+      rmSync(dirname(this.socketPath), { recursive: true, force: true })
+    } catch {
+      /* */
+    }
+  }
+
+  /** Kill every active shim connection (but keep accepting new ones). */
+  dropAllConnections(): void {
+    for (const ctx of this.conns) {
+      try {
+        ctx.socket.destroy()
+      } catch {
+        /* */
+      }
+    }
+    this.conns.clear()
+  }
+
+  /** Send `bye` notification to every active shim connection. */
+  sendByeAll(reason = 'test'): void {
+    const frame = JSON.stringify({ jsonrpc: '2.0', method: 'bye', params: { reason } })
+    for (const ctx of this.conns) {
+      try {
+        ctx.socket.write(`${frame}\n`)
+        ctx.socket.end()
+      } catch {
+        /* */
+      }
+    }
+  }
+
+  setOpts(update: Partial<FakeServerOpts>): void {
+    this.opts = { ...this.opts, ...update }
+  }
+
+  private onConnection(sock: net.Socket): void {
+    const ctx: ConnCtx = {
+      socket: sock,
+      sessionId: null,
+      receivedLogs: [],
+      receivedPings: 0,
+      receivedToolCalls: [],
+    }
+    this.conns.add(ctx)
+
+    const rl = createInterface({ input: sock })
+    rl.on('line', (line) => {
+      let frame: IncomingFrame
+      try {
+        frame = JSON.parse(line)
+      } catch {
+        return
+      }
+      this.handleFrame(ctx, frame)
+    })
+
+    sock.on('close', () => {
+      this.conns.delete(ctx)
+    })
+    sock.on('error', () => {
+      /* ignore — test-scoped */
+    })
+  }
+
+  private handleFrame(ctx: ConnCtx, frame: IncomingFrame): void {
+    // Auth
+    if (frame.method === 'auth' && frame.id === 0) {
+      if (!this.opts.autoAuth) {
+        this.reply(ctx, 0, { ok: false })
+        return
+      }
+      ctx.sessionId = (frame.params?.sessionId as string) ?? null
+      this.reply(ctx, 0, { ok: true })
+      return
+    }
+
+    // Notifications from shim (no id)
+    if (frame.id === undefined || frame.id === null) {
+      if (frame.method === 'log' && frame.params) {
+        ctx.receivedLogs.push({
+          level: (frame.params.level as string) ?? 'info',
+          msg: (frame.params.msg as string) ?? '',
+          fields: (frame.params.fields as Record<string, unknown>) ?? {},
+        })
+      }
+      return
+    }
+
+    // Requests
+    if (frame.method === 'ping') {
+      ctx.receivedPings += 1
+      if (!this.opts.ignorePing) this.reply(ctx, frame.id as number, {})
+      return
+    }
+
+    if (frame.method === 'tools/list') {
+      const tools = []
+      for (let i = 0; i < (this.opts.toolsCount ?? 0); i++) {
+        tools.push({
+          name: `fake_tool_${i}`,
+          description: `fake ${i}`,
+          inputSchema: { type: 'object', properties: {} },
+        })
+      }
+      this.reply(ctx, frame.id as number, { tools })
+      return
+    }
+
+    if (frame.method === 'tools/call') {
+      ctx.receivedToolCalls.push({
+        id: frame.id as number,
+        name: (frame.params?.name as string) ?? 'unknown',
+      })
+      if (this.opts.toolsCallReply === null || this.opts.toolsCallReply === undefined) return
+      this.reply(ctx, frame.id as number, {
+        content: [{ type: 'text', text: this.opts.toolsCallReply }],
+      })
+      return
+    }
+  }
+
+  private reply(ctx: ConnCtx, id: string | number, result: unknown): void {
+    try {
+      ctx.socket.write(`${JSON.stringify({ jsonrpc: '2.0', id, result })}\n`)
+    } catch {
+      /* */
+    }
+  }
+}
+
+// ── Shim process wrapper ────────────────────────────────────────────
+
+interface ShimHandle {
+  proc: ChildProcess
+  send(frame: Record<string, unknown>): void
+  awaitResponse(matchId: string | number, timeoutMs?: number): Promise<Record<string, unknown>>
+  stderrLines: string[]
+  kill(): void
+}
+
+function startShim(socketPath: string): ShimHandle {
+  const proc = spawn(process.execPath, [COMPILED_SHIM], {
+    stdio: ['pipe', 'pipe', 'pipe'],
+    env: {
+      ...process.env,
+      ANTON_SOCK: socketPath,
+      ANTON_SESSION: 'sess_runtime_test',
+      ANTON_AUTH: 'tok_runtime_test',
+    },
+  })
+
+  const inbound: Array<Record<string, unknown>> = []
+  const waiters: Array<{ id: string | number; resolve: (frame: Record<string, unknown>) => void }> =
+    []
+  const stderrLines: string[] = []
+
+  proc.stdout!.setEncoding('utf8')
+  const stdoutRl = createInterface({ input: proc.stdout! })
+  stdoutRl.on('line', (line) => {
+    if (!line) return
+    try {
+      const frame = JSON.parse(line) as Record<string, unknown>
+      inbound.push(frame)
+      for (let i = waiters.length - 1; i >= 0; i--) {
+        if (waiters[i].id === frame.id) {
+          waiters[i].resolve(frame)
+          waiters.splice(i, 1)
+        }
+      }
+    } catch {
+      /* ignore malformed */
+    }
+  })
+
+  proc.stderr!.setEncoding('utf8')
+  const stderrRl = createInterface({ input: proc.stderr! })
+  stderrRl.on('line', (line) => {
+    if (line) stderrLines.push(line)
+  })
+
+  return {
+    proc,
+    stderrLines,
+    send(frame) {
+      proc.stdin!.write(`${JSON.stringify(frame)}\n`)
+    },
+    awaitResponse(matchId, timeoutMs = 5_000) {
+      // Search already-received frames first
+      const hit = inbound.find((f) => f.id === matchId)
+      if (hit) return Promise.resolve(hit)
+      return new Promise((resolve, reject) => {
+        const timer = setTimeout(() => {
+          const idx = waiters.findIndex((w) => w.id === matchId)
+          if (idx >= 0) waiters.splice(idx, 1)
+          reject(new Error(`no response for id=${matchId} in ${timeoutMs}ms`))
+        }, timeoutMs)
+        waiters.push({
+          id: matchId,
+          resolve: (frame) => {
+            clearTimeout(timer)
+            resolve(frame)
+          },
+        })
+      })
+    },
+    kill() {
+      try {
+        proc.kill('SIGTERM')
+      } catch {
+        /* */
+      }
+    },
+  }
+}
+
+async function initializeShim(shim: ShimHandle): Promise<void> {
+  shim.send({
+    jsonrpc: '2.0',
+    id: 'init',
+    method: 'initialize',
+    params: {
+      protocolVersion: '2024-11-05',
+      capabilities: {},
+      clientInfo: { name: 'rt-test', version: '0' },
+    },
+  })
+  await shim.awaitResponse('init', 3_000)
+}
+
+function sleep(ms: number): Promise<void> {
+  return new Promise((r) => setTimeout(r, ms))
+}
+
+// ── Cases ────────────────────────────────────────────────────────────
+
+interface Case {
+  name: string
+  run: () => Promise<string | null>
+}
+
+const cases: Case[] = [
+  {
+    name: 'happy path: initialize + tools/list round-trip',
+    run: async () => {
+      const server = new FakeServer({ toolsCount: 3 })
+      await server.start()
+      const shim = startShim(server.socketPath)
+      try {
+        await initializeShim(shim)
+        shim.send({ jsonrpc: '2.0', id: 1, method: 'tools/list' })
+        const res = (await shim.awaitResponse(1, 3_000)) as {
+          result?: { tools?: unknown[] }
+        }
+        const tools = res.result?.tools
+        if (!Array.isArray(tools) || tools.length !== 3) {
+          return `expected 3 tools, got ${tools?.length}`
+        }
+        return null
+      } finally {
+        shim.kill()
+        await server.stop()
+      }
+    },
+  },
+
+  {
+    name: 'shim log notifications reach the server with session context',
+    run: async () => {
+      const server = new FakeServer()
+      await server.start()
+      const shim = startShim(server.socketPath)
+      try {
+        await initializeShim(shim)
+        shim.send({ jsonrpc: '2.0', id: 1, method: 'tools/list' })
+        await shim.awaitResponse(1, 3_000)
+        // Give the shim a tick to flush its post-success log notification.
+        await sleep(50)
+        const ctx = [...server.conns][0]
+        if (!ctx) return 'no authed connection on server'
+        const listLog = ctx.receivedLogs.find((l) => /tools\/list/.test(l.msg))
+        if (!listLog) {
+          return `no tools/list log found (have: ${ctx.receivedLogs.map((l) => l.msg).join(' | ')})`
+        }
+        return null
+      } finally {
+        shim.kill()
+        await server.stop()
+      }
+    },
+  },
+
+  {
+    name: 'connection drop: in-flight call fails fast, next call reconnects',
+    run: async () => {
+      // Start with tools/call replies disabled → the first call stalls
+      // until we drop the connection. After drop, we re-enable replies
+      // and verify the next call succeeds via a reconnected socket.
+      const server = new FakeServer({ toolsCallReply: null })
+      await server.start()
+      const shim = startShim(server.socketPath)
+      try {
+        await initializeShim(shim)
+        // Prime by listing tools (auths the socket).
+        shim.send({ jsonrpc: '2.0', id: 1, method: 'tools/list' })
+        await shim.awaitResponse(1, 3_000)
+
+        // Fire a tools/call that the server will never reply to.
+        shim.send({
+          jsonrpc: '2.0',
+          id: 2,
+          method: 'tools/call',
+          params: { name: 'fake_tool_0', arguments: {} },
+        })
+
+        // Give shim a moment to forward it, then drop the connection.
+        await sleep(100)
+        server.dropAllConnections()
+
+        // The pending call should now fail with the structured disconnect
+        // reason — NOT the 30s timeout.
+        const startedWait = Date.now()
+        const failed = (await shim.awaitResponse(2, 3_000)) as {
+          error?: { message?: string }
+        }
+        const elapsed = Date.now() - startedWait
+        if (!failed.error) return 'expected error response, got none'
+        if (!/ipc disconnected/.test(failed.error.message ?? '')) {
+          return `error message missing 'ipc disconnected': ${failed.error.message}`
+        }
+        if (elapsed > 2_000) return `disconnect rejection took ${elapsed}ms (should be <2s)`
+
+        // Re-enable tools/call replies and make a new call — shim must
+        // auto-reconnect.
+        server.setOpts({ toolsCallReply: 'ok-reconnected' })
+        shim.send({
+          jsonrpc: '2.0',
+          id: 3,
+          method: 'tools/call',
+          params: { name: 'fake_tool_0', arguments: {} },
+        })
+        const recovered = (await shim.awaitResponse(3, 5_000)) as {
+          result?: { content?: Array<{ text?: string }> }
+        }
+        const text = recovered.result?.content?.[0]?.text
+        if (text !== 'ok-reconnected') {
+          return `expected reconnected reply, got: ${JSON.stringify(recovered)}`
+        }
+        return null
+      } finally {
+        shim.kill()
+        await server.stop()
+      }
+    },
+  },
+
+  {
+    name: 'bye notification: in-flight call fails with bye reason, reconnects on next call',
+    run: async () => {
+      const server = new FakeServer({ toolsCallReply: null })
+      await server.start()
+      const shim = startShim(server.socketPath)
+      try {
+        await initializeShim(shim)
+        shim.send({ jsonrpc: '2.0', id: 1, method: 'tools/list' })
+        await shim.awaitResponse(1, 3_000)
+
+        shim.send({
+          jsonrpc: '2.0',
+          id: 2,
+          method: 'tools/call',
+          params: { name: 'fake_tool_0', arguments: {} },
+        })
+        await sleep(100)
+        server.sendByeAll('session_unregistered')
+
+        const failed = (await shim.awaitResponse(2, 3_000)) as {
+          error?: { message?: string }
+        }
+        if (!failed.error) return 'expected error response, got none'
+        const msg = failed.error.message ?? ''
+        if (!/bye: session_unregistered/.test(msg)) {
+          return `error message missing bye reason: ${msg}`
+        }
+
+        server.setOpts({ toolsCallReply: 'ok-after-bye' })
+        shim.send({
+          jsonrpc: '2.0',
+          id: 3,
+          method: 'tools/call',
+          params: { name: 'fake_tool_0', arguments: {} },
+        })
+        const recovered = (await shim.awaitResponse(3, 5_000)) as {
+          result?: { content?: Array<{ text?: string }> }
+        }
+        if (recovered.result?.content?.[0]?.text !== 'ok-after-bye') {
+          return `no reconnect after bye: ${JSON.stringify(recovered)}`
+        }
+        return null
+      } finally {
+        shim.kill()
+        await server.stop()
+      }
+    },
+  },
+
+  {
+    name: 'auth failure returns a clean error, not a hang',
+    run: async () => {
+      const server = new FakeServer({ autoAuth: false })
+      await server.start()
+      const shim = startShim(server.socketPath)
+      try {
+        await initializeShim(shim)
+        shim.send({ jsonrpc: '2.0', id: 1, method: 'tools/list' })
+        const res = (await shim.awaitResponse(1, 5_000)) as {
+          error?: { message?: string }
+        }
+        if (!res.error) return 'expected error response on auth failure'
+        // The sendToAnton path surfaces either the rejected auth message
+        // or the "connect failed" wrapper depending on how the fake
+        // server closes. Either form is fine as long as it's an error,
+        // not a hang.
+        return null
+      } finally {
+        shim.kill()
+        await server.stop()
+      }
+    },
+  },
+]
+
+async function main(): Promise<void> {
+  let failed = 0
+  for (const c of cases) {
+    const startedAt = Date.now()
+    try {
+      const err = await c.run()
+      const dt = Date.now() - startedAt
+      if (err === null) {
+        console.log(`✓ mcp-shim-runtime: ${c.name} (${dt}ms)`)
+      } else {
+        failed++
+        console.error(`✗ mcp-shim-runtime: ${c.name} — ${err} (${dt}ms)`)
+      }
+    } catch (err) {
+      failed++
+      console.error(`✗ mcp-shim-runtime: ${c.name} (threw)`, err)
+    }
+  }
+  if (failed > 0) {
+    console.error(`\n${failed}/${cases.length} mcp-shim-runtime checks failed`)
+    process.exit(1)
+  }
+  console.log(`\nAll ${cases.length} mcp-shim-runtime checks passed`)
+}
+
+void main()
diff --git a/packages/agent-core/src/harness/__fixtures__/check-mcp-shim.ts b/packages/agent-core/src/harness/__fixtures__/check-mcp-shim.ts
new file mode 100644
index 0000000..35aec7e
--- /dev/null
+++ b/packages/agent-core/src/harness/__fixtures__/check-mcp-shim.ts
@@ -0,0 +1,167 @@
+/**
+ * MCP shim integration checks.
+ *
+ * Run with:  pnpm --filter @anton/agent-core check:mcp-shim
+ *
+ * The check depends on the compiled shim at `dist/harness/anton-mcp-shim.js`.
+ * The npm script runs `tsc` first to guarantee it exists.
+ *
+ * Covered:
+ *   - `buildMcpSpawnConfig()` returns `process.execPath` + an absolute shim
+ *     path that exists on disk.
+ *   - `probeMcpShim()` round-trips an `initialize` against the real shim
+ *     and returns `{ ok: true, version }`.
+ *   - Version reported by the shim matches `getExpectedShimVersion()` —
+ *     guards against forgetting to bump the constant in sync with the
+ *     package version.
+ *   - Probe fails cleanly (not hangs) when spawned with a bogus binary.
+ *   - Probe fails cleanly when the shim exits without speaking
+ *     initialize (here: missing env vars).
+ */
+
+import { existsSync } from 'node:fs'
+import { dirname, join } from 'node:path'
+import { fileURLToPath } from 'node:url'
+import {
+  buildMcpSpawnConfig,
+  getExpectedShimVersion,
+  probeMcpShim,
+  type McpSpawnConfig,
+} from '../mcp-spawn-config.js'
+
+// Resolve the compiled shim path from the source location. The probe
+// uses `import.meta.url` to locate its shim, but when running through
+// tsx the source lives under `src/`, not `dist/`. We override the path
+// in the probe-call sites below so we can run this check without
+// shipping a .js next to the .ts source.
+const __dirname = fileURLToPath(new URL('.', import.meta.url))
+const COMPILED_SHIM = join(__dirname, '../../../dist/harness/anton-mcp-shim.js')
+
+if (!existsSync(COMPILED_SHIM)) {
+  console.error(`✗ compiled shim missing at ${COMPILED_SHIM}`)
+  console.error('  Run `pnpm --filter @anton/agent-core build` first.')
+  process.exit(1)
+}
+
+const compiledConfig: McpSpawnConfig = {
+  command: process.execPath,
+  args: [COMPILED_SHIM],
+  shimPath: COMPILED_SHIM,
+}
+
+interface Case {
+  name: string
+  run: () => Promise<string | null>
+}
+
+const cases: Case[] = [
+  {
+    name: 'buildMcpSpawnConfig returns absolute command + args',
+    run: async () => {
+      const cfg = buildMcpSpawnConfig()
+      if (cfg.command !== process.execPath) return `command was ${cfg.command}, expected execPath`
+      if (cfg.args.length !== 1) return `expected 1 arg, got ${cfg.args.length}`
+      if (!cfg.args[0]?.endsWith('anton-mcp-shim.js'))
+        return `arg[0] did not end with shim name: ${cfg.args[0]}`
+      if (cfg.shimPath !== cfg.args[0])
+        return 'shimPath should equal args[0] for diagnostic clarity'
+      return null
+    },
+  },
+  {
+    name: 'getExpectedShimVersion returns a non-empty string',
+    run: async () => {
+      const v = getExpectedShimVersion()
+      if (typeof v !== 'string') return `type was ${typeof v}`
+      if (v.length === 0) return 'version was empty'
+      return null
+    },
+  },
+  {
+    name: 'probe round-trips initialize against compiled shim',
+    run: async () => {
+      const result = await probeMcpShim(compiledConfig, 5_000)
+      if (!result.ok) {
+        return `probe failed: ${result.error}\n  stderrTail: ${result.stderrTail.join('\n  ')}`
+      }
+      if (!result.version) return 'probe returned ok but no version'
+      if (result.serverName !== 'anton-mcp-shim')
+        return `unexpected serverName: ${result.serverName}`
+      if (!result.protocolVersion) return 'probe returned ok but no protocolVersion'
+      return null
+    },
+  },
+  {
+    name: 'shim version matches getExpectedShimVersion',
+    run: async () => {
+      const result = await probeMcpShim(compiledConfig, 5_000)
+      if (!result.ok) return `probe failed: ${result.error}`
+      const expected = getExpectedShimVersion()
+      if (result.version !== expected)
+        return `shim reported ${result.version}, expected ${expected}`
+      return null
+    },
+  },
+  {
+    name: 'probe fails cleanly (not hang) on missing binary',
+    run: async () => {
+      const result = await probeMcpShim(
+        {
+          command: '/definitely/not/a/real/node/binary',
+          args: [COMPILED_SHIM],
+          shimPath: COMPILED_SHIM,
+        },
+        2_000,
+      )
+      if (result.ok) return 'probe should have failed'
+      if (!/spawn failed|ENOENT/.test(result.error))
+        return `expected spawn failure, got: ${result.error}`
+      if (result.durationMs > 1_500) return `took too long: ${result.durationMs}ms`
+      return null
+    },
+  },
+  {
+    name: 'probe fails cleanly on missing shim path',
+    run: async () => {
+      const result = await probeMcpShim(
+        {
+          command: process.execPath,
+          args: [join(dirname(COMPILED_SHIM), 'does-not-exist.js')],
+          shimPath: join(dirname(COMPILED_SHIM), 'does-not-exist.js'),
+        },
+        2_000,
+      )
+      if (result.ok) return 'probe should have failed'
+      // Node with a missing script exits with a non-zero code and no
+      // MCP output, so we expect the "exited before initialize" branch.
+      if (!/exited before initialize/.test(result.error))
+        return `expected early-exit error, got: ${result.error}`
+      return null
+    },
+  },
+]
+
+async function main(): Promise<void> {
+  let failed = 0
+  for (const c of cases) {
+    try {
+      const err = await c.run()
+      if (err === null) {
+        console.log(`✓ mcp-shim: ${c.name}`)
+      } else {
+        failed++
+        console.error(`✗ mcp-shim: ${c.name} — ${err}`)
+      }
+    } catch (err) {
+      failed++
+      console.error(`✗ mcp-shim: ${c.name} (threw)`, err)
+    }
+  }
+  if (failed > 0) {
+    console.error(`\n${failed}/${cases.length} mcp-shim checks failed`)
+    process.exit(1)
+  }
+  console.log(`\nAll ${cases.length} mcp-shim checks passed`)
+}
+
+void main()
diff --git a/packages/agent-core/src/harness/anton-mcp-shim.ts b/packages/agent-core/src/harness/anton-mcp-shim.ts
index 0e115ac..e21c422 100644
--- a/packages/agent-core/src/harness/anton-mcp-shim.ts
+++ b/packages/agent-core/src/harness/anton-mcp-shim.ts
@@ -2,44 +2,318 @@
 /**
  * Anton MCP Shim — standalone MCP server that relays tool calls to Anton.
  *
- * Spawned by Claude Code as an MCP child process. Communicates with
- * the Anton server over a unix domain socket for tool listing/execution.
- *
- * Protocol: JSON-RPC 2.0 over stdio, MCP version 2024-11-05
+ * Spawned by a harness CLI (Codex / Claude Code) as an MCP child process.
+ * Speaks JSON-RPC 2.0 / MCP 2024-11-05 on stdio to the CLI, and a
+ * newline-delimited JSON dialect to the Anton server over a unix domain
+ * socket.
  *
  * Environment:
  *   ANTON_SOCK    — path to Anton's harness IPC unix socket
  *   ANTON_SESSION — session ID for tool scoping
  *   ANTON_AUTH    — per-session token used to authenticate with Anton
+ *
+ * ─────────────────────────────────────────────────────────────────────
+ * Connection state machine
+ * ─────────────────────────────────────────────────────────────────────
+ *
+ *          ┌──────┐         first call           ┌────────────┐
+ *          │ idle │ ─────────────────────────▶   │ connecting │
+ *          └──────┘                              └─────┬──────┘
+ *                ▲                                     │
+ *                │                                auth OK
+ *                │                                     ▼
+ *          connect fail                          ┌──────────┐
+ *                │     ◀── socket close/end ── │  authed  │
+ *                │     ◀── ping timeout ───── └──────────┘
+ *                │     ◀── server "bye" ─────
+ *                ▼
+ *          ┌──────────┐  ensureAuthed() on next call,   ┌────────────┐
+ *          │   lost   │ ───────── backoff ────────────▶│ connecting │
+ *          └──────────┘                                 └────────────┘
+ *
+ * The previous implementation stored the socket in a module-level
+ * `socket: net.Socket | null` and only checked `!socket` before writing.
+ * It had no `'close'` / `'end'` handler, no drain-on-disconnect, and no
+ * reconnect path. A single half-close left every subsequent `tools/call`
+ * writing to a dead fd and hanging 30s on the per-request timeout — and
+ * even after the timeout fired, the shim never recovered for the rest of
+ * its life. This rewrite makes the connection an explicitly-owned
+ * resource with one transition point (`transitionToLost`) that every
+ * failure mode funnels into.
+ *
+ * Everything logs as newline-delimited JSON on stderr (the harness CLI
+ * captures our stderr) and — when we have a live IPC connection — also
+ * mirrors to the Anton server as `log` notifications so the server can
+ * tag the entry with our sessionId and surface it under the `mcp-shim`
+ * module.
  */
 
+import { readFileSync } from 'node:fs'
 import * as net from 'node:net'
 import * as readline from 'node:readline'
+import { fileURLToPath } from 'node:url'
 
 const ANTON_SOCK = process.env.ANTON_SOCK
 const ANTON_SESSION = process.env.ANTON_SESSION
 const ANTON_AUTH = process.env.ANTON_AUTH
 
+// Resolved once at module load. Host logs this on `initialize` so we can
+// detect version skew between the server binary and the shim on disk
+// (e.g. a half-synced deploy).
+const SHIM_VERSION: string = (() => {
+  try {
+    const pkgPath = fileURLToPath(new URL('../../package.json', import.meta.url))
+    const { version } = JSON.parse(readFileSync(pkgPath, 'utf8')) as { version?: string }
+    return typeof version === 'string' ? version : 'unknown'
+  } catch {
+    return 'unknown'
+  }
+})()
+
 if (!ANTON_SOCK || !ANTON_SESSION || !ANTON_AUTH) {
   process.stderr.write(
-    'anton-mcp-shim: ANTON_SOCK, ANTON_SESSION and ANTON_AUTH env vars required\n',
+    `${JSON.stringify({
+      ts: new Date().toISOString(),
+      level: 'error',
+      source: 'anton-mcp-shim',
+      msg: 'missing required env vars',
+      required: ['ANTON_SOCK', 'ANTON_SESSION', 'ANTON_AUTH'],
+    })}\n`,
   )
   process.exit(1)
 }
 
-// ── IPC connection to Anton server ──────────────────────────────────
+// ── Timing constants ────────────────────────────────────────────────
 
-/** Reserved JSON-RPC id for the auth handshake. Regular requests start at 1. */
 const AUTH_ID = 0
+const AUTH_TIMEOUT_MS = 5_000
+const DEFAULT_REQUEST_TIMEOUT_MS = 30_000
+const STREAMING_REQUEST_TIMEOUT_MS = 30 * 60_000
+const PING_INTERVAL_MS = 20_000
+const PING_TIMEOUT_MS = 5_000
+/** Backoff schedule in ms, indexed by attempt number (clamped to length-1). */
+const RECONNECT_BACKOFF_MS = [100, 250, 500, 1_000, 2_000, 5_000] as const
+
+// ── Logging ─────────────────────────────────────────────────────────
+
+type LogLevel = 'debug' | 'info' | 'warn' | 'error'
+
+/**
+ * Structured log. Always writes a JSON line to stderr; additionally
+ * mirrors to the Anton server as a `log` notification when we have a
+ * live authed IPC connection (the server module `mcp-shim` forwards it
+ * into our normal logger with sessionId attached).
+ *
+ * Never throws — logging must not become the failure mode of the thing
+ * it's describing.
+ */
+function log(level: LogLevel, msg: string, fields?: Record<string, unknown>): void {
+  const entry = {
+    ts: new Date().toISOString(),
+    level,
+    source: 'anton-mcp-shim',
+    sessionId: ANTON_SESSION,
+    shimVersion: SHIM_VERSION,
+    msg,
+    ...(fields ?? {}),
+  }
+  try {
+    process.stderr.write(`${JSON.stringify(entry)}\n`)
+  } catch {
+    /* best-effort */
+  }
+  if (state.tag === 'authed') {
+    try {
+      const frame = JSON.stringify({
+        jsonrpc: '2.0',
+        method: 'log',
+        params: { level, msg, fields: fields ?? {} },
+      })
+      state.socket.write(`${frame}\n`, () => {
+        /* swallow — we already persisted to stderr */
+      })
+    } catch {
+      /* best-effort */
+    }
+  }
+}
+
+// ── Connection state machine ────────────────────────────────────────
+
+type ConnState =
+  | { tag: 'idle' }
+  | { tag: 'connecting'; attempt: number; startedAt: number }
+  | { tag: 'authed'; socket: net.Socket; since: number }
+  | { tag: 'lost'; reason: string; at: number; attempt: number }
+
+interface PendingEntry {
+  method: string
+  sentAt: number
+  resolve: (result: unknown) => void
+  reject: (err: Error) => void
+  timeout: NodeJS.Timeout
+}
 
-let socket: net.Socket | null = null
-const pendingRequests = new Map<string | number, (result: unknown) => void>()
+let state: ConnState = { tag: 'idle' }
+/** In-flight connect attempt, so concurrent ensureAuthed() callers share it. */
+let connectInflight: Promise<net.Socket> | null = null
+let pingTimer: NodeJS.Timeout | null = null
+const pendingRequests = new Map<string | number, PendingEntry>()
+let requestId = 1
+
+function clearPingTimer(): void {
+  if (pingTimer) {
+    clearInterval(pingTimer)
+    pingTimer = null
+  }
+}
+
+/**
+ * Single funnel for every connection failure mode: post-auth socket
+ * `close`/`end`/`error`, a server-sent `bye`, a ping timeout. Destroys
+ * the socket, drains every pending request with a structured rejection,
+ * and latches state so the next `ensureAuthed()` knows to reconnect
+ * (with backoff).
+ *
+ * Idempotent — if we're already in `lost`, does nothing.
+ */
+function transitionToLost(reason: string): void {
+  if (state.tag === 'lost') return
+
+  const prevTag = state.tag
+  const attempt = state.tag === 'connecting' ? state.attempt : 0
+  const prevSocket = state.tag === 'authed' ? state.socket : null
+  const pendingCount = pendingRequests.size
+  const livedMs = state.tag === 'authed' ? Date.now() - state.since : 0
+
+  clearPingTimer()
+
+  if (prevSocket) {
+    try {
+      prevSocket.destroy()
+    } catch {
+      /* already dead, fine */
+    }
+  }
+
+  // Drain pending with a structured error — every stuck `tools/call`
+  // fails fast with a clear reason instead of hanging on per-request
+  // timeout.
+  const err = new Error(`ipc disconnected: ${reason}`)
+  for (const entry of pendingRequests.values()) {
+    clearTimeout(entry.timeout)
+    try {
+      entry.reject(err)
+    } catch {
+      /* best-effort */
+    }
+  }
+  pendingRequests.clear()
+
+  state = { tag: 'lost', reason, at: Date.now(), attempt }
+  connectInflight = null
+
+  log('warn', 'connection lost', {
+    reason,
+    prevTag,
+    pendingDrained: pendingCount,
+    livedMs,
+  })
+}
+
+/**
+ * Ensure we have a live, authed IPC socket. Shared across concurrent
+ * callers (via `connectInflight`) so a burst of tool calls triggers one
+ * connect attempt, not N.
+ *
+ * Backoff is applied on transitions from `lost` — the attempt counter
+ * persists across losses so a flapping server doesn't get hammered.
+ */
+async function ensureAuthed(): Promise<net.Socket> {
+  if (state.tag === 'authed') return state.socket
+  if (connectInflight) return connectInflight
+
+  const lastAttempt = state.tag === 'lost' ? state.attempt : 0
+  const nextAttempt = lastAttempt + 1
+
+  connectInflight = (async (): Promise<net.Socket> => {
+    if (state.tag === 'lost' && state.attempt > 0) {
+      const delayMs =
+        RECONNECT_BACKOFF_MS[Math.min(state.attempt - 1, RECONNECT_BACKOFF_MS.length - 1)]
+      log('debug', 'reconnect backoff', { attempt: nextAttempt, delayMs })
+      await new Promise((r) => setTimeout(r, delayMs))
+    }
+
+    const startedAt = Date.now()
+    state = { tag: 'connecting', attempt: nextAttempt, startedAt }
+    log('debug', 'connecting', { attempt: nextAttempt, sock: ANTON_SOCK })
+
+    try {
+      const sock = await doConnect()
+      // A server-sent `bye` arriving in the same readline chunk as our
+      // auth_ok would have run `transitionToLost` synchronously — setting
+      // state='lost' — before this microtask resumed. Blindly assigning
+      // state='authed' here would clobber that transition and hand out a
+      // socket the server has already half-closed. Detect it and treat
+      // the attempt as failed so the caller sees a clean error and the
+      // backoff/reconnect path runs. Double-cast escapes control-flow
+      // narrowing from the `state = { tag: 'connecting', ... }` above;
+      // `state` is a module-level `let` and may have been mutated
+      // synchronously inside readline while we were awaited.
+      const stateNow = state as unknown as ConnState
+      if (stateNow.tag === 'lost') {
+        try {
+          sock.destroy()
+        } catch {
+          /* */
+        }
+        throw new Error(`connection lost during handshake: ${stateNow.reason}`)
+      }
+      const since = Date.now()
+      state = { tag: 'authed', socket: sock, since }
+      startPing()
+      log('info', 'connection authed', {
+        attempt: nextAttempt,
+        handshakeMs: since - startedAt,
+      })
+      return sock
+    } catch (err) {
+      const message = (err as Error).message
+      state = {
+        tag: 'lost',
+        reason: `connect failed: ${message}`,
+        at: Date.now(),
+        attempt: nextAttempt,
+      }
+      log('error', 'connect failed', { attempt: nextAttempt, error: message })
+      throw err
+    } finally {
+      connectInflight = null
+    }
+  })()
 
-function connectToAnton(): Promise<net.Socket> {
+  return connectInflight
+}
+
+/**
+ * Open a single UDS connection, perform the auth handshake, and wire
+ * up every failure signal (`error`, `close`, `end`, server `bye`) to
+ * `transitionToLost`. Returns the socket on auth success; rejects on
+ * any failure before auth completes.
+ */
+function doConnect(): Promise<net.Socket> {
   return new Promise((resolve, reject) => {
     let authed = false
+    let settled = false
     let authTimer: NodeJS.Timeout | null = null
 
+    const settle = (fn: () => void) => {
+      if (settled) return
+      settled = true
+      if (authTimer) clearTimeout(authTimer)
+      fn()
+    }
+
     const sock = net.connect(ANTON_SOCK!, () => {
       const authMsg = JSON.stringify({
         jsonrpc: '2.0',
@@ -50,20 +324,42 @@ function connectToAnton(): Promise<net.Socket> {
       sock.write(`${authMsg}\n`)
       authTimer = setTimeout(() => {
         if (!authed) {
-          process.stderr.write('anton-mcp-shim: auth handshake timed out\n')
-          sock.destroy()
-          reject(new Error('auth timeout'))
+          settle(() => {
+            try {
+              sock.destroy()
+            } catch {
+              /* */
+            }
+            reject(new Error('auth handshake timed out'))
+          })
         }
-      }, 5_000)
+      }, AUTH_TIMEOUT_MS)
     })
 
     sock.on('error', (err) => {
-      if (authTimer) clearTimeout(authTimer)
-      process.stderr.write(`anton-mcp-shim: socket error: ${err.message}\n`)
-      if (!authed) reject(err)
+      if (!authed) {
+        settle(() => reject(err))
+      } else {
+        // Post-auth error — funnel into the single transition path.
+        // We also expect 'close'/'end' to fire immediately after; the
+        // idempotence of transitionToLost handles the overlap.
+        transitionToLost(`socket error: ${err.message}`)
+      }
     })
 
-    // Read responses from Anton server (newline-delimited JSON)
+    sock.on('close', (hadError) => {
+      if (!authed) {
+        settle(() => reject(new Error(`socket closed before auth${hadError ? ' (error)' : ''}`)))
+      } else {
+        transitionToLost(`socket closed${hadError ? ' (error)' : ''}`)
+      }
+    })
+
+    sock.on('end', () => {
+      if (authed) transitionToLost('socket ended by peer')
+    })
+
+    // Read responses / server notifications line-by-line.
     const rl = readline.createInterface({ input: sock })
     rl.on('line', (line) => {
       let msg: {
@@ -71,199 +367,327 @@ function connectToAnton(): Promise<net.Socket> {
         method?: string
         result?: { ok?: boolean }
         error?: { message?: string }
-        params?: { _progressToken?: string | number; message?: string; progress?: number }
+        params?: { _progressToken?: string | number; message?: string; progress?: number; reason?: string }
       }
       try {
         msg = JSON.parse(line)
       } catch {
-        // Ignore malformed responses
+        log('debug', 'dropped malformed line from server', { line: line.slice(0, 200) })
         return
       }
 
+      // Auth reply
       if (!authed && msg.id === AUTH_ID) {
-        if (authTimer) clearTimeout(authTimer)
         if (msg.result?.ok === true) {
           authed = true
-          socket = sock
-          resolve(sock)
+          settle(() => resolve(sock))
         } else {
           const message = msg.error?.message || 'auth rejected'
-          process.stderr.write(`anton-mcp-shim: ${message}\n`)
-          sock.destroy()
-          reject(new Error(message))
+          settle(() => {
+            try {
+              sock.destroy()
+            } catch {
+              /* */
+            }
+            reject(new Error(message))
+          })
         }
         return
       }
 
-      // Progress frames from Anton — forward as MCP notifications/progress
-      // to whatever client is driving the shim (Codex / Claude Code).
-      // These frames have no `id` (they are notifications on the Anton side
-      // too); the caller receives them with the original progressToken so
-      // it can associate the update with the original tools/call.
-      if (msg.method === 'progress' && msg.params) {
-        const token = msg.params._progressToken
-        const message = msg.params.message ?? ''
-        const progress = msg.params.progress
-        if (token !== undefined) {
-          const notif = JSON.stringify({
-            jsonrpc: '2.0',
-            method: 'notifications/progress',
-            params:
-              progress !== undefined
-                ? { progressToken: token, progress, message }
-                : { progressToken: token, message },
-          })
-          process.stdout.write(`${notif}\n`)
+      // Server → shim notifications (no id)
+      if ((msg.id === undefined || msg.id === null) && typeof msg.method === 'string') {
+        if (msg.method === 'bye') {
+          const reason = msg.params?.reason ?? 'server requested disconnect'
+          log('info', 'received bye from server', { reason })
+          // Transition explicitly so pending calls fail with the server's
+          // reason; the socket close that follows is then a no-op.
+          transitionToLost(`bye: ${reason}`)
+          return
         }
+
+        if (msg.method === 'progress' && msg.params) {
+          const token = msg.params._progressToken
+          const message = msg.params.message ?? ''
+          const progress = msg.params.progress
+          if (token !== undefined) {
+            const notif = JSON.stringify({
+              jsonrpc: '2.0',
+              method: 'notifications/progress',
+              params:
+                progress !== undefined
+                  ? { progressToken: token, progress, message }
+                  : { progressToken: token, message },
+            })
+            process.stdout.write(`${notif}\n`)
+          }
+          return
+        }
+
+        // Unknown notification — log once at debug so it's discoverable
+        // without being noisy.
+        log('debug', 'unknown server notification', { method: msg.method })
         return
       }
 
+      // Reply to a pending request
       if (msg.id !== undefined && msg.id !== null) {
-        const resolver = pendingRequests.get(msg.id)
-        if (resolver) {
+        const entry = pendingRequests.get(msg.id)
+        if (entry) {
           pendingRequests.delete(msg.id)
-          resolver(msg.result)
+          clearTimeout(entry.timeout)
+          if (msg.error) {
+            entry.reject(new Error(msg.error.message || 'server error'))
+          } else {
+            entry.resolve(msg.result)
+          }
         }
       }
     })
   })
 }
 
-let requestId = 1
+function startPing(): void {
+  clearPingTimer()
+  pingTimer = setInterval(() => {
+    if (state.tag !== 'authed') return
+    const startedAt = Date.now()
+    sendToAnton('ping', {}, undefined, PING_TIMEOUT_MS)
+      .then(() => {
+        log('debug', 'ping ok', { rttMs: Date.now() - startedAt })
+      })
+      .catch((err) => {
+        if (state.tag === 'authed') {
+          transitionToLost(`ping failed: ${(err as Error).message}`)
+        }
+      })
+  }, PING_INTERVAL_MS)
+  // Don't keep the event loop alive on the ping alone — if stdin closes
+  // and no other work is pending, we should exit cleanly.
+  pingTimer.unref?.()
+}
 
+/**
+ * Send a request to Anton over IPC. Handles reconnect on a lost
+ * connection, per-request timeout, and write-error propagation. Every
+ * path either resolves with the reply's `result` or rejects with an
+ * Error that carries enough context to diagnose — no silent hangs.
+ */
 async function sendToAnton(
   method: string,
   params: Record<string, unknown>,
   progressToken?: string | number,
-  timeoutMs = 30_000,
+  timeoutMs = DEFAULT_REQUEST_TIMEOUT_MS,
 ): Promise<unknown> {
-  if (!socket) {
-    await connectToAnton()
-  }
+  const sock = await ensureAuthed()
 
   const id = requestId++
   const forwarded: Record<string, unknown> = { ...params, _antonSession: ANTON_SESSION }
-  // Progress token is echoed back inside each `method:"progress"` frame
-  // from Anton, so we don't need to map it by request id — the server
-  // passes it through and we forward it 1:1 to the caller.
   if (progressToken !== undefined) forwarded._progressToken = progressToken
-  const request = JSON.stringify({ jsonrpc: '2.0', id, method, params: forwarded })
+  const payload = JSON.stringify({ jsonrpc: '2.0', id, method, params: forwarded })
 
-  return new Promise((resolve, reject) => {
-    pendingRequests.set(id, resolve)
-    socket!.write(`${request}\n`, (err) => {
-      if (err) {
+  return new Promise<unknown>((resolve, reject) => {
+    const timeout = setTimeout(() => {
+      if (pendingRequests.has(id)) {
         pendingRequests.delete(id)
-        reject(err)
+        reject(new Error(`request timed out after ${timeoutMs}ms (method=${method})`))
       }
+    }, timeoutMs)
+    timeout.unref?.()
+
+    pendingRequests.set(id, {
+      method,
+      sentAt: Date.now(),
+      resolve,
+      reject,
+      timeout,
     })
 
-    setTimeout(() => {
-      if (pendingRequests.has(id)) {
-        pendingRequests.delete(id)
-        reject(new Error('Request timed out'))
+    sock.write(`${payload}\n`, (err) => {
+      if (err) {
+        // Sync/async write failure — drain this one and let the
+        // transition funnel clean up the socket if it's dead.
+        const entry = pendingRequests.get(id)
+        if (entry) {
+          pendingRequests.delete(id)
+          clearTimeout(entry.timeout)
+          reject(err)
+        }
+        if (state.tag === 'authed') {
+          transitionToLost(`write error: ${err.message}`)
+        }
       }
-    }, timeoutMs)
+    })
   })
 }
 
-// ── JSON-RPC / MCP handler ──────────────────────────────────────────
+// ── MCP JSON-RPC handler (stdio) ────────────────────────────────────
 
-function sendResponse(id: string | number | null, result: unknown) {
-  const response = JSON.stringify({ jsonrpc: '2.0', id, result })
-  process.stdout.write(`${response}\n`)
+function sendResponse(id: string | number | null, result: unknown): void {
+  process.stdout.write(`${JSON.stringify({ jsonrpc: '2.0', id, result })}\n`)
 }
 
-function sendError(id: string | number | null, code: number, message: string) {
-  const response = JSON.stringify({
-    jsonrpc: '2.0',
-    id,
-    error: { code, message },
-  })
-  process.stdout.write(`${response}\n`)
+function sendError(id: string | number | null, code: number, message: string): void {
+  process.stdout.write(
+    `${JSON.stringify({ jsonrpc: '2.0', id, error: { code, message } })}\n`,
+  )
 }
 
 async function handleRequest(msg: {
   id?: string | number
   method: string
   params?: Record<string, unknown>
-}) {
+}): Promise<void> {
   const { id, method, params } = msg
 
   switch (method) {
     case 'initialize':
+      log('info', 'initialize', { clientProto: params?.protocolVersion })
       sendResponse(id ?? null, {
         protocolVersion: '2024-11-05',
         capabilities: { tools: { listChanged: false } },
-        serverInfo: { name: 'anton-mcp-shim', version: '1.0.0' },
+        serverInfo: { name: 'anton-mcp-shim', version: SHIM_VERSION },
       })
-      break
+      return
 
     case 'notifications/initialized':
-      // No response needed for notifications
-      break
+      return
 
     case 'ping':
       sendResponse(id ?? null, {})
-      break
+      return
 
     case 'tools/list': {
+      const startedAt = Date.now()
+      log('debug', 'tools/list received', { id })
       try {
         const result = await sendToAnton('tools/list', params || {})
+        const toolCount = Array.isArray((result as { tools?: unknown[] })?.tools)
+          ? (result as { tools: unknown[] }).tools.length
+          : 0
+        log('info', 'tools/list served', { id, toolCount, durationMs: Date.now() - startedAt })
         sendResponse(id ?? null, result)
       } catch (err) {
-        sendError(id ?? null, -32000, `Failed to list tools: ${(err as Error).message}`)
+        const message = (err as Error).message
+        log('error', 'tools/list failed', {
+          id,
+          error: message,
+          durationMs: Date.now() - startedAt,
+        })
+        sendError(id ?? null, -32000, `Failed to list tools: ${message}`)
       }
-      break
+      return
     }
 
     case 'tools/call': {
+      const startedAt = Date.now()
+      const toolName = (params?.name as string | undefined) ?? 'unknown'
+      // MCP spec: callers request streaming progress by setting
+      // `_meta.progressToken` on the tools/call request. We forward it
+      // to Anton as `_progressToken` on the IPC request so the server-
+      // side handler can emit progress frames bound to it. Streaming
+      // tool calls get a 30-minute budget since a research sub-agent
+      // can legitimately run many minutes; non-streaming stays at the
+      // default 30s.
+      const meta = params?._meta as { progressToken?: string | number } | undefined
+      const progressToken = meta?.progressToken
+      const timeoutMs =
+        progressToken !== undefined ? STREAMING_REQUEST_TIMEOUT_MS : DEFAULT_REQUEST_TIMEOUT_MS
+
+      log('info', 'tools/call received', {
+        id,
+        tool: toolName,
+        streaming: progressToken !== undefined,
+      })
       try {
-        // MCP spec: callers request streaming progress by setting
-        // `_meta.progressToken` on the tools/call request. We forward
-        // the token to Anton as `_progressToken` on the IPC request so
-        // the server-side handler can emit progress frames bound to it.
-        // Streaming tool calls get a 30-minute budget since a research
-        // sub-agent can legitimately run many minutes; non-streaming
-        // stays at the default 30s.
-        const meta = params?._meta as { progressToken?: string | number } | undefined
-        const progressToken = meta?.progressToken
-        const timeoutMs = progressToken !== undefined ? 30 * 60_000 : 30_000
         const result = await sendToAnton('tools/call', params || {}, progressToken, timeoutMs)
+        const isError = Boolean((result as { isError?: boolean })?.isError)
+        log(isError ? 'warn' : 'info', 'tools/call completed', {
+          id,
+          tool: toolName,
+          durationMs: Date.now() - startedAt,
+          isError,
+        })
         sendResponse(id ?? null, result)
       } catch (err) {
-        sendError(id ?? null, -32000, `Failed to call tool: ${(err as Error).message}`)
+        const message = (err as Error).message
+        log('error', 'tools/call failed', {
+          id,
+          tool: toolName,
+          error: message,
+          durationMs: Date.now() - startedAt,
+        })
+        sendError(id ?? null, -32000, `Failed to call tool ${toolName}: ${message}`)
       }
-      break
+      return
     }
 
     default:
+      log('warn', 'unknown MCP method', { method, id })
       sendError(id ?? null, -32601, `Method not found: ${method}`)
   }
 }
 
 // ── Main: read stdin JSON-RPC ───────────────────────────────────────
 
-const rl = readline.createInterface({ input: process.stdin })
+const stdinRl = readline.createInterface({ input: process.stdin })
 
-rl.on('line', (line) => {
+stdinRl.on('line', (line) => {
+  let msg: { id?: string | number; method: string; params?: Record<string, unknown> }
   try {
-    const msg = JSON.parse(line)
-    handleRequest(msg).catch((err) => {
-      process.stderr.write(`anton-mcp-shim: unhandled error: ${err.message}\n`)
-    })
+    msg = JSON.parse(line)
   } catch {
-    // Ignore malformed input
+    log('debug', 'dropped malformed stdin line', { line: line.slice(0, 200) })
+    return
+  }
+  handleRequest(msg).catch((err) => {
+    log('error', 'unhandled error in handleRequest', {
+      method: msg.method,
+      error: (err as Error).message,
+    })
+  })
+})
+
+stdinRl.on('close', () => {
+  log('info', 'stdin closed — shutting down')
+  clearPingTimer()
+  if (state.tag === 'authed') {
+    try {
+      state.socket.destroy()
+    } catch {
+      /* */
+    }
   }
+  process.exit(0)
 })
 
-// Graceful cleanup
 process.on('SIGTERM', () => {
-  socket?.destroy()
+  log('info', 'received SIGTERM')
+  clearPingTimer()
+  if (state.tag === 'authed') {
+    try {
+      state.socket.destroy()
+    } catch {
+      /* */
+    }
+  }
   process.exit(0)
 })
 
 process.on('SIGINT', () => {
-  socket?.destroy()
+  log('info', 'received SIGINT')
+  clearPingTimer()
+  if (state.tag === 'authed') {
+    try {
+      state.socket.destroy()
+    } catch {
+      /* */
+    }
+  }
   process.exit(0)
 })
+
+log('info', 'shim started', {
+  antonSock: ANTON_SOCK,
+  pid: process.pid,
+})
diff --git a/packages/agent-core/src/harness/codex-harness-session.ts b/packages/agent-core/src/harness/codex-harness-session.ts
index af2cccf..c1fc841 100644
--- a/packages/agent-core/src/harness/codex-harness-session.ts
+++ b/packages/agent-core/src/harness/codex-harness-session.ts
@@ -27,19 +27,37 @@ import { ANTON_MCP_NAMESPACE } from '../prompt-layers.js'
 import type { SessionEvent } from '../session.js'
 import { CodexRpcClient, CodexRpcError } from './codex-rpc.js'
 import { PINNED_CLI_VERSION, detectCodexCli } from './codex-version.js'
+import type { McpSpawnConfig } from './mcp-spawn-config.js'
 
 const log = createLogger('codex-harness-session')
 
-export interface CodexHarnessSessionOpts {
-  id: string
-  provider: string
-  model: string
+/**
+ * MCP bridge config passed into CodexHarnessSession. The `spawn` block is
+ * produced by `buildMcpSpawnConfig()` from agent-core (package-owned
+ * shim path + `process.execPath` — see `mcp-spawn-config.ts`). We keep
+ * this as an opaque bag so future transports (e.g. HTTP MCP) don't force
+ * a new breaking param.
+ */
+export interface CodexHarnessMcpOpts {
   /** Unix socket path for MCP IPC. */
   socketPath: string
-  /** Path to the anton-mcp-shim.js file. */
-  shimPath: string
   /** Per-session auth token presented by the shim to Anton's IPC server. */
   authToken: string
+  /** How to launch the shim subprocess — single source of truth. */
+  spawn: McpSpawnConfig
+}
+
+export interface CodexHarnessSessionOpts {
+  id: string
+  provider: string
+  model: string
+  /**
+   * MCP bridge configuration. Constructed by the server once from
+   * `buildMcpSpawnConfig()` and wrapped with the per-session auth token +
+   * socket path. Replaces the old flat `socketPath` / `shimPath` /
+   * `authToken` triple.
+   */
+  mcp: CodexHarnessMcpOpts
   /** Project workspace. Passed as `cwd` to codex and `thread/start.cwd`. */
   cwd?: string
   /** Static system prompt fallback (used if no builder provided). */
@@ -568,9 +586,9 @@ export class CodexHarnessSession {
 
   private buildConfig(): Record<string, unknown> {
     const mcpEnv: Record<string, string> = {
-      ANTON_SOCK: this.opts.socketPath,
+      ANTON_SOCK: this.opts.mcp.socketPath,
       ANTON_SESSION: this.id,
-      ANTON_AUTH: this.opts.authToken,
+      ANTON_AUTH: this.opts.mcp.authToken,
     }
     // The namespace Codex uses to prefix our tools (e.g. `anton:gmail_*`)
     // MUST match the value the identity + capability blocks reference.
@@ -578,8 +596,8 @@ export class CodexHarnessSession {
       model_reasoning_summary: 'detailed',
       mcp_servers: {
         [ANTON_MCP_NAMESPACE]: {
-          command: 'node',
-          args: [this.opts.shimPath],
+          command: this.opts.mcp.spawn.command,
+          args: this.opts.mcp.spawn.args,
           env: mcpEnv,
         },
       },
diff --git a/packages/agent-core/src/harness/harness-session.ts b/packages/agent-core/src/harness/harness-session.ts
index 3cfcbee..e47cbac 100644
--- a/packages/agent-core/src/harness/harness-session.ts
+++ b/packages/agent-core/src/harness/harness-session.ts
@@ -16,6 +16,18 @@ import { createLogger } from '@anton/logger'
 import type { ChatImageAttachmentInput } from '@anton/protocol'
 import type { SessionEvent } from '../session.js'
 import type { HarnessAdapter } from './adapter.js'
+import type { McpSpawnConfig } from './mcp-spawn-config.js'
+
+/**
+ * MCP bridge config for HarnessSession. Mirrors CodexHarnessMcpOpts; we
+ * keep the types separate so the two paths can diverge later (e.g. HTTP
+ * MCP for Codex only) without coupling.
+ */
+export interface HarnessMcpOpts {
+  socketPath: string
+  authToken: string
+  spawn: McpSpawnConfig
+}
 
 const log = createLogger('harness-session')
 
@@ -24,10 +36,13 @@ export interface HarnessSessionOpts {
   provider: string
   model: string
   adapter: HarnessAdapter
-  socketPath: string
-  shimPath: string
-  /** Per-session auth token presented by the shim to Anton's IPC server. */
-  authToken: string
+  /**
+   * MCP bridge configuration. Constructed once by the server from
+   * `buildMcpSpawnConfig()` and wrapped with the per-session auth token +
+   * socket path. Replaces the old flat `socketPath` / `shimPath` /
+   * `authToken` fields.
+   */
+  mcp: HarnessMcpOpts
   cwd?: string
   /**
    * Static system prompt. Kept for back-compat / test paths that don't
@@ -64,9 +79,7 @@ export class HarnessSession {
   readonly createdAt: number
 
   private adapter: HarnessAdapter
-  private socketPath: string
-  private shimPath: string
-  private authToken: string
+  private mcp: HarnessMcpOpts
   private cwd?: string
   private systemPrompt?: string
   private buildSystemPromptFn?: (userMessage: string, turnIndex: number) => Promise<string>
@@ -98,9 +111,7 @@ export class HarnessSession {
     this.provider = opts.provider
     this.model = opts.model
     this.adapter = opts.adapter
-    this.socketPath = opts.socketPath
-    this.shimPath = opts.shimPath
-    this.authToken = opts.authToken
+    this.mcp = opts.mcp
     this.cwd = opts.cwd
     this.systemPrompt = opts.systemPrompt
     this.buildSystemPromptFn = opts.buildSystemPrompt
@@ -174,18 +185,18 @@ export class HarnessSession {
         systemPrompt: systemPromptForTurn,
         maxBudgetUsd: this.maxBudgetUsd,
         cwd: this.cwd,
-        shimPath: this.shimPath,
-        socketPath: this.socketPath,
+        shimPath: this.mcp.spawn.shimPath,
+        socketPath: this.mcp.socketPath,
         sessionId: this.id,
-        authToken: this.authToken,
+        authToken: this.mcp.authToken,
       })
 
       const env = {
         ...process.env,
         ...this.adapter.buildEnv({
-          socketPath: this.socketPath,
+          socketPath: this.mcp.socketPath,
           sessionId: this.id,
-          authToken: this.authToken,
+          authToken: this.mcp.authToken,
         }),
       }
 
@@ -447,12 +458,12 @@ export class HarnessSession {
     const config = {
       mcpServers: {
         anton: {
-          command: 'node',
-          args: [this.shimPath],
+          command: this.mcp.spawn.command,
+          args: this.mcp.spawn.args,
           env: {
-            ANTON_SOCK: this.socketPath,
+            ANTON_SOCK: this.mcp.socketPath,
             ANTON_SESSION: this.id,
-            ANTON_AUTH: this.authToken,
+            ANTON_AUTH: this.mcp.authToken,
           },
         },
       },
diff --git a/packages/agent-core/src/harness/index.ts b/packages/agent-core/src/harness/index.ts
index 6c108e2..6533f97 100644
--- a/packages/agent-core/src/harness/index.ts
+++ b/packages/agent-core/src/harness/index.ts
@@ -3,11 +3,26 @@ export { ClaudeAdapter } from './adapters/claude.js'
 export { CodexAdapter } from './adapters/codex.js'
 export type { ClaudeStreamEvent } from './claude-events.js'
 export type { CodexStreamEvent } from './codex-events.js'
-export { HarnessSession, isHarnessSession, type HarnessSessionOpts } from './harness-session.js'
+export {
+  HarnessSession,
+  isHarnessSession,
+  type HarnessSessionOpts,
+  type HarnessMcpOpts,
+} from './harness-session.js'
 export {
   CodexHarnessSession,
   type CodexHarnessSessionOpts,
+  type CodexHarnessMcpOpts,
 } from './codex-harness-session.js'
+export {
+  buildMcpSpawnConfig,
+  getExpectedShimVersion,
+  probeMcpShim,
+  type McpSpawnConfig,
+  type ShimProbeResult,
+  type ShimProbeOk,
+  type ShimProbeErr,
+} from './mcp-spawn-config.js'
 export type { CodexRpcClient, CodexRpcError } from './codex-rpc.js'
 export { PINNED_CLI_VERSION, MIN_SUPPORTED_CLI_VERSION, detectCodexCli } from './codex-version.js'
 export {
diff --git a/packages/agent-core/src/harness/mcp-ipc-handler.ts b/packages/agent-core/src/harness/mcp-ipc-handler.ts
index 12c1dd6..913357a 100644
--- a/packages/agent-core/src/harness/mcp-ipc-handler.ts
+++ b/packages/agent-core/src/harness/mcp-ipc-handler.ts
@@ -5,19 +5,38 @@
  * Each harness session spawns its own MCP shim, which connects here to
  * access Anton's tool implementations.
  *
- * Auth model: every connection must present an {sessionId, token} pair as
- * its first message. Tokens are registered by the server before spawning
- * the CLI. Once authed, the connection is bound to that sessionId for its
- * entire lifetime — any subsequent tools/call whose params claim a
- * different `_antonSession` is rejected.
+ * Auth model: every connection must present an {sessionId, token} pair
+ * as its first message. Tokens are registered by the server before
+ * spawning the CLI. Once authed, the connection is bound to that
+ * sessionId for its entire lifetime — any subsequent tools/call whose
+ * params claim a different `_antonSession` is rejected.
+ *
+ * Lifecycle contract with `anton-mcp-shim.ts`:
+ *   - Server sends `{method: 'bye', params: {reason}}` as a notification
+ *     when a session is unregistered (evicted / destroyed). The shim
+ *     transitions its own state to `lost`, drains pending requests, and
+ *     reconnects on the next call if/when the session id is re-
+ *     registered.
+ *   - Shim may send `{method: 'log', params: {level, msg, fields}}` as
+ *     a notification to surface its structured logs with session
+ *     context — the server re-logs them under the `mcp-shim` module
+ *     with sessionId attached.
+ *   - Shim periodically sends `{method: 'ping', id: N}` to detect
+ *     half-close. We reply `{}` immediately.
+ *
+ * Everything above auth is tolerant of connection loss: if either side
+ * disappears, the other releases its bookkeeping (authed connection set
+ * on the server, pending-request map on the shim) within one event
+ * loop turn.
  */
 
 import { existsSync, unlinkSync } from 'node:fs'
 import * as net from 'node:net'
 import { createInterface } from 'node:readline'
-import { createLogger } from '@anton/logger'
+import { createLogger, type Logger } from '@anton/logger'
 
 const log = createLogger('mcp-ipc')
+const shimLog = createLogger('mcp-shim')
 
 export interface McpToolSchema {
   name: string
@@ -33,10 +52,10 @@ export interface McpToolResult {
 /**
  * Callback passed into `executeTool` when the MCP caller requested
  * progress updates (MCP's `_meta.progressToken` → our IPC
- * `_progressToken`). Tools call it zero or more times during
- * execution to surface intermediate status; the handler forwards each
- * call back over the IPC connection as a `method: "progress"` frame,
- * which the shim then emits as `notifications/progress` to the caller.
+ * `_progressToken`). Tools call it zero or more times during execution
+ * to surface intermediate status; the handler forwards each call back
+ * over the IPC connection as a `method: "progress"` frame, which the
+ * shim then emits as `notifications/progress` to the caller.
  */
 export type ProgressCallback = (message: string, progress?: number) => void
 
@@ -53,17 +72,32 @@ export interface IpcToolProvider {
 export interface McpIpcServer {
   readonly server: net.Server
   registerSession(sessionId: string, token: string): void
-  unregisterSession(sessionId: string): void
+  /**
+   * Unregister a session. Sends a `bye` notification to every authed
+   * shim connection bound to this session, then half-closes each. The
+   * shim treats the bye as a clean "server is done with me" and
+   * transitions out of `authed` — any in-flight call fails with
+   * `ipc disconnected: bye: …` instead of hanging.
+   */
+  unregisterSession(sessionId: string, reason?: string): void
+  /** Current connection snapshot — diagnostics only. */
+  debugStats(): { authedSessions: number; totalConns: number }
   close(): Promise<void>
 }
 
 interface JsonRpcRequest {
   jsonrpc: string
-  id: string | number
+  id?: string | number
   method: string
   params?: Record<string, unknown> & { _antonSession?: string }
 }
 
+interface ShimLogParams {
+  level?: 'debug' | 'info' | 'warn' | 'error'
+  msg?: string
+  fields?: Record<string, unknown>
+}
+
 /** Time a connection has to present its auth frame before being dropped. */
 const AUTH_TIMEOUT_MS = 5_000
 
@@ -72,19 +106,22 @@ const AUTH_TIMEOUT_MS = 5_000
  * from harness shim processes.
  */
 export function createMcpIpcServer(socketPath: string, provider: IpcToolProvider): McpIpcServer {
-  // Clean up stale socket file
   if (existsSync(socketPath)) {
     try {
       unlinkSync(socketPath)
     } catch {
-      // Ignore
+      /* stale socket; ignore */
     }
   }
 
   // sessionId → expected token
   const registeredSessions = new Map<string, string>()
+  // sessionId → authed connections bound to it. Tracked so
+  // `unregisterSession` can proactively bye + half-close each one.
+  const authedConnections = new Map<string, Set<net.Socket>>()
 
   const server = net.createServer((conn) => {
+    const connStartedAt = Date.now()
     log.debug('MCP shim connected')
 
     /** Set to the sessionId once the connection successfully authenticates. */
@@ -97,7 +134,7 @@ export function createMcpIpcServer(socketPath: string, provider: IpcToolProvider
         try {
           sendError(conn, null, -32001, 'unauthenticated: auth timeout')
         } catch {
-          // connection may already be dead
+          /* connection may already be dead */
         }
         conn.destroy()
       }
@@ -105,7 +142,23 @@ export function createMcpIpcServer(socketPath: string, provider: IpcToolProvider
 
     const rl = createInterface({ input: conn })
 
-    rl.on('line', async (line) => {
+    rl.on('line', (line) => {
+      // Outer boundary — any throw from the async body (including a
+      // sync throw in `sendError`/`sendResponse` when the peer socket
+      // is already destroyed) must not escape as an unhandledRejection.
+      void (async () => {
+        try {
+          await processLine(line)
+        } catch (err) {
+          log.error(
+            { err: (err as Error).message, sessionId: authedSessionId },
+            'IPC line handler threw — dropping frame',
+          )
+        }
+      })()
+    })
+
+    const processLine = async (line: string): Promise<void> => {
       let request: JsonRpcRequest
       try {
         request = JSON.parse(line)
@@ -116,7 +169,7 @@ export function createMcpIpcServer(socketPath: string, provider: IpcToolProvider
       // First frame must be auth
       if (!authedSessionId) {
         if (request.method !== 'auth') {
-          sendError(conn, request.id, -32001, 'unauthenticated: auth frame required first')
+          sendError(conn, request.id ?? null, -32001, 'unauthenticated: auth frame required first')
           conn.destroy()
           clearTimeout(authTimer)
           return
@@ -126,7 +179,12 @@ export function createMcpIpcServer(socketPath: string, provider: IpcToolProvider
         const claimedToken = request.params?.token as string | undefined
 
         if (!claimedSession || !claimedToken) {
-          sendError(conn, request.id, -32001, 'unauthenticated: missing sessionId or token')
+          sendError(
+            conn,
+            request.id ?? null,
+            -32001,
+            'unauthenticated: missing sessionId or token',
+          )
           conn.destroy()
           clearTimeout(authTimer)
           return
@@ -135,7 +193,12 @@ export function createMcpIpcServer(socketPath: string, provider: IpcToolProvider
         const expected = registeredSessions.get(claimedSession)
         if (!expected || expected !== claimedToken) {
           log.warn({ claimedSession }, 'MCP shim auth rejected: bad token or unknown session')
-          sendError(conn, request.id, -32001, 'unauthenticated: bad token or unknown session')
+          sendError(
+            conn,
+            request.id ?? null,
+            -32001,
+            'unauthenticated: bad token or unknown session',
+          )
           conn.destroy()
           clearTimeout(authTimer)
           return
@@ -143,8 +206,33 @@ export function createMcpIpcServer(socketPath: string, provider: IpcToolProvider
 
         authedSessionId = claimedSession
         clearTimeout(authTimer)
-        sendResponse(conn, request.id, { ok: true })
-        log.debug({ sessionId: authedSessionId }, 'MCP shim authenticated')
+
+        let set = authedConnections.get(authedSessionId)
+        if (!set) {
+          set = new Set()
+          authedConnections.set(authedSessionId, set)
+        }
+        set.add(conn)
+
+        sendResponse(conn, request.id ?? null, { ok: true })
+        log.info(
+          { sessionId: authedSessionId, handshakeMs: Date.now() - connStartedAt },
+          'MCP shim authenticated',
+        )
+        return
+      }
+
+      // Notifications (no id) from shim — handle before the request switch.
+      if (request.id === undefined || request.id === null) {
+        if (request.method === 'log') {
+          handleShimLog(authedSessionId, request.params as ShimLogParams | undefined)
+          return
+        }
+        // Unknown notification — log once at debug so we can see it.
+        log.debug(
+          { sessionId: authedSessionId, method: request.method },
+          'unknown shim notification',
+        )
         return
       }
 
@@ -165,6 +253,12 @@ export function createMcpIpcServer(socketPath: string, provider: IpcToolProvider
         let result: unknown
 
         switch (request.method) {
+          case 'ping': {
+            // Shim's liveness probe. Reply fast, no bookkeeping.
+            result = {}
+            break
+          }
+
           case 'tools/list': {
             const tools = provider.getTools(sessionId)
             log.info(
@@ -195,10 +289,19 @@ export function createMcpIpcServer(socketPath: string, provider: IpcToolProvider
               return
             }
 
+            log.info(
+              {
+                sessionId,
+                tool: toolName,
+                streaming: progressToken !== undefined,
+                requestId: request.id,
+              },
+              'tools/call received',
+            )
+
+            const callStartedAt = Date.now()
+
             // Build the progress callback only if the caller opted in.
-            // Streaming tools call this during execution; non-streaming
-            // tools ignore it. Failures to write are swallowed — the
-            // final response still gets sent over the same connection.
             const onProgress: ProgressCallback | undefined = progressToken
               ? (message: string, progress?: number) => {
                   try {
@@ -217,12 +320,31 @@ export function createMcpIpcServer(socketPath: string, provider: IpcToolProvider
                 }
               : undefined
 
-            const toolResult = await provider.executeTool(sessionId, toolName, toolArgs, onProgress)
+            const toolResult = await provider.executeTool(
+              sessionId,
+              toolName,
+              toolArgs,
+              onProgress,
+            )
+            log.info(
+              {
+                sessionId,
+                tool: toolName,
+                durationMs: Date.now() - callStartedAt,
+                isError: Boolean(toolResult.isError),
+                requestId: request.id,
+              },
+              'tools/call completed',
+            )
             result = toolResult
             break
           }
 
           default:
+            log.warn(
+              { sessionId, method: request.method, id: request.id },
+              'unknown MCP method from shim',
+            )
             sendError(conn, request.id, -32601, `Unknown method: ${request.method}`)
             return
         }
@@ -232,14 +354,29 @@ export function createMcpIpcServer(socketPath: string, provider: IpcToolProvider
         log.error({ err, method: request.method, sessionId }, 'IPC request failed')
         sendError(conn, request.id, -32000, (err as Error).message)
       }
-    })
+    }
 
-    conn.on('close', () => {
+    conn.on('close', (hadError) => {
       clearTimeout(authTimer)
+      if (authedSessionId) {
+        const set = authedConnections.get(authedSessionId)
+        if (set) {
+          set.delete(conn)
+          if (set.size === 0) authedConnections.delete(authedSessionId)
+        }
+        log.info(
+          {
+            sessionId: authedSessionId,
+            livedMs: Date.now() - connStartedAt,
+            hadError,
+          },
+          'MCP shim disconnected',
+        )
+      }
     })
 
     conn.on('error', (err) => {
-      log.debug({ err: err.message }, 'MCP shim connection error')
+      log.debug({ err: err.message, sessionId: authedSessionId }, 'MCP shim connection error')
     })
   })
 
@@ -255,25 +392,138 @@ export function createMcpIpcServer(socketPath: string, provider: IpcToolProvider
     server,
     registerSession(sessionId: string, token: string) {
       registeredSessions.set(sessionId, token)
+      log.debug({ sessionId }, 'session auth registered')
     },
-    unregisterSession(sessionId: string) {
+    unregisterSession(sessionId: string, reason = 'session_unregistered') {
       registeredSessions.delete(sessionId)
+      const set = authedConnections.get(sessionId)
+      if (!set || set.size === 0) {
+        log.debug({ sessionId, reason }, 'session unregistered (no live conn)')
+        return
+      }
+      const byeFrame = JSON.stringify({
+        jsonrpc: '2.0',
+        method: 'bye',
+        params: { reason },
+      })
+      let sent = 0
+      for (const conn of set) {
+        try {
+          conn.write(`${byeFrame}\n`)
+          sent += 1
+          // Half-close — gives the shim a chance to flush its own writes
+          // (including drain-on-lost log notifications) before the TCP/
+          // UDS RST. The shim will destroy its side on seeing `bye`.
+          conn.end()
+        } catch (err) {
+          log.debug(
+            { err: (err as Error).message, sessionId },
+            'write bye failed — shim likely already gone',
+          )
+        }
+      }
+      authedConnections.delete(sessionId)
+      log.info({ sessionId, reason, byeSent: sent }, 'session unregistered')
+    },
+    debugStats() {
+      let totalConns = 0
+      for (const set of authedConnections.values()) totalConns += set.size
+      return {
+        authedSessions: authedConnections.size,
+        totalConns,
+      }
     },
     close() {
       return new Promise<void>((resolve) => {
         registeredSessions.clear()
+        // Best-effort bye to every authed conn before we stop the server,
+        // so shims see a clean reason instead of a raw RST on shutdown.
+        const byeFrame = JSON.stringify({
+          jsonrpc: '2.0',
+          method: 'bye',
+          params: { reason: 'server_shutdown' },
+        })
+        for (const [sessionId, set] of authedConnections) {
+          for (const conn of set) {
+            try {
+              conn.write(`${byeFrame}\n`)
+              conn.end()
+            } catch {
+              /* already gone */
+            }
+          }
+          log.debug({ sessionId }, 'bye sent on server close')
+        }
+        authedConnections.clear()
         server.close(() => resolve())
       })
     },
   }
 }
 
-function sendResponse(conn: net.Socket, id: string | number | null, result: unknown) {
-  const msg = JSON.stringify({ jsonrpc: '2.0', id, result })
-  conn.write(`${msg}\n`)
+/**
+ * Fan a shim-sourced log event into our normal logger. Clamps the level
+ * to something known, stamps sessionId into the record so filtering
+ * works, and never throws — a malformed log frame shouldn't tear down
+ * the IPC connection.
+ */
+function handleShimLog(sessionId: string, params: ShimLogParams | undefined): void {
+  if (!params) return
+  const level = params.level ?? 'info'
+  const msg = typeof params.msg === 'string' ? params.msg : '(shim log)'
+  const fields = (params.fields ?? {}) as Record<string, unknown>
+  const fn = pickLevel(shimLog, level)
+  try {
+    fn.call(shimLog, { sessionId, ...fields }, msg)
+  } catch {
+    /* logger should never throw, but if it does, don't cascade */
+  }
+}
+
+function pickLevel(
+  logger: Logger,
+  level: 'debug' | 'info' | 'warn' | 'error',
+): Logger['info'] {
+  switch (level) {
+    case 'debug':
+      return logger.debug.bind(logger)
+    case 'warn':
+      return logger.warn.bind(logger)
+    case 'error':
+      return logger.error.bind(logger)
+    default:
+      return logger.info.bind(logger)
+  }
+}
+
+/**
+ * Write a JSON-RPC frame to a connection. Swallows errors because the
+ * write is meaningful only if the peer is still listening — if the shim
+ * hung up, we have no one to report to, and surfacing the throw through
+ * the async line-handler would turn a dead connection into an
+ * unhandledRejection crash.
+ */
+function safeWrite(conn: net.Socket, frame: string): void {
+  if (conn.destroyed || conn.writableEnded) return
+  try {
+    conn.write(`${frame}\n`)
+  } catch (err) {
+    log.debug(
+      { err: (err as Error).message },
+      'IPC frame write failed — peer likely gone',
+    )
+  }
+}
+
+function sendResponse(conn: net.Socket, id: string | number | null, result: unknown): void {
+  safeWrite(conn, JSON.stringify({ jsonrpc: '2.0', id, result }))
 }
 
-function sendError(conn: net.Socket, id: string | number | null, code: number, message: string) {
-  const msg = JSON.stringify({ jsonrpc: '2.0', id, error: { code, message } })
-  conn.write(`${msg}\n`)
+function sendError(
+  conn: net.Socket,
+  id: string | number | null,
+  code: number,
+  message: string,
+): void {
+  safeWrite(conn, JSON.stringify({ jsonrpc: '2.0', id, error: { code, message } }))
 }
diff --git a/packages/agent-core/src/harness/mcp-spawn-config.ts b/packages/agent-core/src/harness/mcp-spawn-config.ts
new file mode 100644
index 0000000..9647919
--- /dev/null
+++ b/packages/agent-core/src/harness/mcp-spawn-config.ts
@@ -0,0 +1,291 @@
+/**
+ * MCP shim spawn config — single source of truth for how to launch
+ * `anton-mcp-shim.js` as a subprocess of `codex app-server`.
+ *
+ * The shim path is resolved from this module's own `import.meta.url`,
+ * so it stays correct regardless of where the host process was started
+ * or how `@anton/agent-core` is installed (monorepo workspace, `pnpm
+ * deploy`, `rsync` to `/opt/anton`, etc.). The previous implementation
+ * composed the path from `homedir() + '../node_modules/...'`, which
+ * broke on VPS deployments where the server runs as user `anton` and
+ * HOME resolves to `/home/anton` but the actual install sits at
+ * `/opt/anton`.
+ *
+ * We also prefer `process.execPath` over the literal `'node'` — systemd
+ * services don't inherit PATH reliably, and `execPath` is guaranteed to
+ * be the node binary currently running the server.
+ */
+
+import { spawn } from 'node:child_process'
+import { readFileSync } from 'node:fs'
+import { dirname } from 'node:path'
+import { fileURLToPath } from 'node:url'
+import { createLogger } from '@anton/logger'
+
+const log = createLogger('mcp-spawn')
+
+/** Absolute path to `anton-mcp-shim.js` next to this module on disk. */
+const SHIM_PATH = fileURLToPath(new URL('./anton-mcp-shim.js', import.meta.url))
+
+/**
+ * Version of `@anton/agent-core` this shim ships with. Read once at module
+ * load from the nearest `package.json`; used in the initialize handshake
+ * so the server can log version skew on startup (e.g. if the shim on disk
+ * is older than the host binary because `make sync` raced a partial
+ * deploy).
+ */
+const PACKAGE_VERSION = (() => {
+  try {
+    const pkgPath = fileURLToPath(new URL('../../package.json', import.meta.url))
+    const raw = readFileSync(pkgPath, 'utf8')
+    const { version } = JSON.parse(raw) as { version?: string }
+    return typeof version === 'string' ? version : 'unknown'
+  } catch {
+    return 'unknown'
+  }
+})()
+
+export interface McpSpawnConfig {
+  /** Absolute path to the node binary to invoke. */
+  command: string
+  /** Argv after the command — shim path only; callers pass env separately. */
+  args: string[]
+  /**
+   * Absolute path to the shim on disk. Included for diagnostics — callers
+   * shouldn't need to touch it. Useful when surfacing "shim not found"
+   * errors or when printing the effective config.
+   */
+  shimPath: string
+}
+
+/**
+ * Build the spawn config. Cheap and stateless — call per-session or cache
+ * at the server level, behavior is identical either way.
+ */
+export function buildMcpSpawnConfig(): McpSpawnConfig {
+  return {
+    command: process.execPath,
+    args: [SHIM_PATH],
+    shimPath: SHIM_PATH,
+  }
+}
+
+/** Expected version reported by `anton-mcp-shim` during `initialize`. */
+export function getExpectedShimVersion(): string {
+  return PACKAGE_VERSION
+}
+
+export type ShimProbeOk = {
+  ok: true
+  version: string
+  protocolVersion: string
+  serverName: string
+  durationMs: number
+}
+export type ShimProbeErr = {
+  ok: false
+  error: string
+  stderrTail: string[]
+  durationMs: number
+}
+export type ShimProbeResult = ShimProbeOk | ShimProbeErr
+
+/**
+ * Health probe: spawn the shim in isolation, complete one `initialize`
+ * round-trip, tear it down. Returns within `timeoutMs` either way.
+ *
+ * This is the gate the server uses on boot (and every 60s) to decide
+ * whether to advertise MCP connectors to harness sessions. If the probe
+ * fails, we log the stderr tail and *omit* the capability block —
+ * harness CLIs won't hallucinate a connector that the shim can't reach.
+ *
+ * The probe uses ephemeral `ANTON_SESSION` / `ANTON_AUTH` values that
+ * aren't registered with the IPC server. The shim is permitted to reach
+ * `initialize` before it attempts to connect to the IPC socket, so the
+ * probe confirms:
+ *   1. The shim binary exists at the expected path.
+ *   2. Node can load it without syntax errors or missing imports.
+ *   3. JSON-RPC framing works end to end.
+ *   4. Reported version matches the one we shipped.
+ *
+ * It does NOT verify the IPC auth path — that's exercised by every live
+ * session as soon as a tool is called.
+ */
+export async function probeMcpShim(
+  spawnConfig: McpSpawnConfig = buildMcpSpawnConfig(),
+  timeoutMs = 5_000,
+): Promise<ShimProbeResult> {
+  const start = Date.now()
+  const stderrTail: string[] = []
+  const STDERR_TAIL_MAX = 20
+
+  return new Promise<ShimProbeResult>((resolve) => {
+    let settled = false
+    const done = (result: ShimProbeResult) => {
+      if (settled) return
+      settled = true
+      resolve(result)
+    }
+
+    const child = spawn(spawnConfig.command, spawnConfig.args, {
+      stdio: ['pipe', 'pipe', 'pipe'],
+      env: {
+        ...process.env,
+        // Deliberate dummy values — enough to pass the shim's env check.
+        // The probe completes before the shim tries to connect to the
+        // socket, so these never hit the wire.
+        ANTON_SOCK: process.env.ANTON_SOCK ?? '/tmp/anton-shim-probe.invalid',
+        ANTON_SESSION: '__probe__',
+        ANTON_AUTH: '__probe__',
+      },
+    })
+
+    child.on('error', (err) => {
+      done({
+        ok: false,
+        error: `spawn failed: ${err.message}`,
+        stderrTail: [],
+        durationMs: Date.now() - start,
+      })
+    })
+
+    child.stderr?.on('data', (chunk: Buffer) => {
+      const lines = chunk.toString('utf8').split(/\r?\n/).filter(Boolean)
+      for (const line of lines) {
+        stderrTail.push(line)
+        if (stderrTail.length > STDERR_TAIL_MAX) stderrTail.shift()
+      }
+    })
+
+    let buffer = ''
+    child.stdout?.on('data', (chunk: Buffer) => {
+      buffer += chunk.toString('utf8')
+      let newlineIdx = buffer.indexOf('\n')
+      while (newlineIdx !== -1) {
+        const line = buffer.slice(0, newlineIdx).trim()
+        buffer = buffer.slice(newlineIdx + 1)
+        newlineIdx = buffer.indexOf('\n')
+        if (!line) continue
+
+        try {
+          const msg = JSON.parse(line) as {
+            id?: number
+            result?: {
+              serverInfo?: { name?: string; version?: string }
+              protocolVersion?: string
+            }
+          }
+          if (msg.id === 1 && msg.result) {
+            const serverInfo = msg.result.serverInfo ?? {}
+            done({
+              ok: true,
+              version: serverInfo.version ?? 'unknown',
+              protocolVersion: msg.result.protocolVersion ?? 'unknown',
+              serverName: serverInfo.name ?? 'unknown',
+              durationMs: Date.now() - start,
+            })
+            try {
+              child.kill('SIGTERM')
+            } catch {
+              /* best-effort */
+            }
+            return
+          }
+        } catch {
+          // Non-JSON stdout is a real problem for MCP — servers that log
+          // to stdout corrupt the frame. We don't fail the probe on it,
+          // but it surfaces in stderrTail on mismatch.
+        }
+      }
+    })
+
+    const timer = setTimeout(() => {
+      done({
+        ok: false,
+        error: `probe timed out after ${timeoutMs}ms`,
+        stderrTail: [...stderrTail],
+        durationMs: Date.now() - start,
+      })
+      try {
+        child.kill('SIGKILL')
+      } catch {
+        /* best-effort */
+      }
+    }, timeoutMs)
+
+    child.on('exit', (code, signal) => {
+      clearTimeout(timer)
+      if (!settled) {
+        done({
+          ok: false,
+          error: `shim exited before initialize completed (code=${code}, signal=${signal})`,
+          stderrTail: [...stderrTail],
+          durationMs: Date.now() - start,
+        })
+      }
+    })
+
+    // Fire the initialize request.
+    const initMsg = `${JSON.stringify({
+      jsonrpc: '2.0',
+      id: 1,
+      method: 'initialize',
+      params: {
+        protocolVersion: '2024-11-05',
+        capabilities: {},
+        clientInfo: { name: 'anton-probe', version: PACKAGE_VERSION },
+      },
+    })}\n`
+    // Route async stdin errors (EPIPE if the shim closes stdin after write
+    // returned synchronously) through `done` so they can't surface as an
+    // unhandled 'error' event on the stream.
+    child.stdin?.on('error', (err) => {
+      done({
+        ok: false,
+        error: `stdin error: ${(err as Error).message}`,
+        stderrTail: [...stderrTail],
+        durationMs: Date.now() - start,
+      })
+    })
+    try {
+      child.stdin?.write(initMsg)
+    } catch (err) {
+      done({
+        ok: false,
+        error: `stdin write failed: ${(err as Error).message}`,
+        stderrTail: [...stderrTail],
+        durationMs: Date.now() - start,
+      })
+    }
+  }).then((result) => {
+    if (result.ok) {
+      log.info(
+        {
+          shimPath: spawnConfig.shimPath,
+          version: result.version,
+          protocolVersion: result.protocolVersion,
+          durationMs: result.durationMs,
+        },
+        'MCP shim probe ok',
+      )
+      if (result.version !== PACKAGE_VERSION) {
+        log.warn(
+          { expected: PACKAGE_VERSION, got: result.version },
+          'MCP shim version mismatch — likely a partial deploy',
+        )
+      }
+    } else {
+      log.error(
+        {
+          shimPath: spawnConfig.shimPath,
+          error: result.error,
+          stderrTail: result.stderrTail,
+          durationMs: result.durationMs,
+          shimDir: dirname(spawnConfig.shimPath),
+        },
+        'MCP shim probe failed',
+      )
+    }
+    return result
+  })
+}
diff --git a/packages/agent-core/src/index.ts b/packages/agent-core/src/index.ts
index dcb282d..876cf14 100644
--- a/packages/agent-core/src/index.ts
+++ b/packages/agent-core/src/index.ts
@@ -73,6 +73,8 @@ export {
   HarnessSession,
   CodexHarnessSession,
   type CodexHarnessSessionOpts,
+  type CodexHarnessMcpOpts,
+  type HarnessMcpOpts,
   isHarnessSession,
   type HarnessSessionOpts,
   createMcpIpcServer,
@@ -95,5 +97,20 @@ export {
   readHarnessHistory,
   buildReplaySeed,
   extractHarnessMemoriesFromMirror,
+  buildMcpSpawnConfig,
+  getExpectedShimVersion,
+  probeMcpShim,
+  type McpSpawnConfig,
+  type ShimProbeResult,
+  type ShimProbeOk,
+  type ShimProbeErr,
 } from './harness/index.js'
 export { resolveModel } from './session.js'
+export {
+  SessionRegistry,
+  DEFAULT_POOLS,
+  type SessionCategory,
+  type PoolConfig,
+  type SessionRegistryOpts,
+  type Shutdownable,
+} from './session-registry.js'
diff --git a/packages/agent-core/src/session-registry.ts b/packages/agent-core/src/session-registry.ts
new file mode 100644
index 0000000..57a5fb0
--- /dev/null
+++ b/packages/agent-core/src/session-registry.ts
@@ -0,0 +1,335 @@
+/**
+ * SessionRegistry — a bounded, LRU-ordered registry for every live
+ * session object the server holds.
+ *
+ * Pre-registry, `server.ts` stored sessions in an unbounded `Map` and
+ * relied on explicit `session_destroy` messages from the client to clean
+ * up. Two problems fell out of that:
+ *
+ *   1. `handleSessionDestroy` deleted the Map entry but never awaited
+ *      `session.shutdown()`, so the codex app-server subprocess (and its
+ *      MCP shim child) lived until the host process died. Over hours of
+ *      normal use a VPS would accumulate tens of orphaned node + codex
+ *      processes.
+ *
+ *   2. Nothing ever evicted sessions the client forgot about. Crashed
+ *      browsers, abandoned tabs, or simply long-running desktops would
+ *      keep the in-memory set growing without bound — each harness
+ *      session costing ~30 MB RSS between codex + shim + our own state.
+ *
+ * The registry solves both with the same contract:
+ *   - `put(id, session, category)` inserts, evicting the least-recently
+ *     used entry in the same pool if capacity is reached.
+ *   - `get(id)` touches recency; `peek(id)` doesn't.
+ *   - `delete(id)` always awaits `session.shutdown()` when present.
+ *   - `pin(id)` / `unpin(id)` mark active turns unevictable for the
+ *     window they're streaming.
+ *
+ * Pools are partitioned so a burst of routines can't evict live
+ * conversations (and vice-versa). Each category has an independent
+ * capacity and recency floor.
+ *
+ * This file has no dependency on `Session` or `HarnessSession` —
+ * callers pass an opaque `Shutdownable` so agent-core stays free to
+ * introduce new session types without touching the registry.
+ */
+
+import { createLogger } from '@anton/logger'
+
+const log = createLogger('session-registry')
+
+/**
+ * The subset of the session API the registry cares about. Anything that
+ * owns resources we need to free on eviction implements the optional
+ * `shutdown()`. Pi SDK `Session` doesn't today; harness sessions do.
+ */
+export interface Shutdownable {
+  readonly id: string
+  shutdown?: () => Promise<void> | void
+}
+
+/**
+ * Why a session is in the registry. Category is a required discriminator
+ * — the caller states intent at `put()` time rather than the registry
+ * inferring it. This keeps eviction policy explicit and testable.
+ *
+ * - `conversation`: user-driven chat session (Pi SDK or harness).
+ * - `routine`: agent-manager routine / scheduled job run.
+ * - `ephemeral`: sub-agent spawn, publish job, fork — short-lived,
+ *   typically destroyed by its own finally-block before eviction
+ *   matters.
+ */
+export type SessionCategory = 'conversation' | 'routine' | 'ephemeral'
+
+export interface PoolConfig {
+  maxSessions: number
+  /**
+   * Minimum age (in ms) a session must have to be eligible for eviction.
+   * Prevents thrashing when the pool is at capacity and a new entry
+   * arrives in the same breath as the last one. 0 disables the floor.
+   */
+  recencyFloorMs: number
+}
+
+export const DEFAULT_POOLS: Record<SessionCategory, PoolConfig> = {
+  conversation: { maxSessions: 40, recencyFloorMs: 30_000 },
+  routine: { maxSessions: 40, recencyFloorMs: 30_000 },
+  ephemeral: { maxSessions: 20, recencyFloorMs: 10_000 },
+}
+
+interface Entry<T extends Shutdownable> {
+  session: T
+  category: SessionCategory
+  lastAccess: number
+  pinned: boolean
+}
+
+export interface SessionRegistryOpts<T extends Shutdownable = Shutdownable> {
+  pools?: Partial<Record<SessionCategory, PoolConfig>>
+  /**
+   * Called when an entry is removed by LRU eviction (NOT by explicit
+   * `delete()` or `shutdownAll()`). The registry always runs
+   * `session.shutdown()` on the evicted entry itself; `onEvict` is the
+   * hook for callers that hold *external* bookkeeping keyed on the
+   * session id and need to clean it up symmetrically — e.g. the server's
+   * mcp IPC auth map, harness context map, activeTurns set. Fire-and-
+   * forget: a throw or rejected promise is caught and logged.
+   *
+   * Not called on `delete()` because explicit-delete callers already own
+   * the full cleanup path (see `handleSessionDestroy` in server.ts).
+   */
+  onEvict?: (id: string, session: T) => void | Promise<void>
+}
+
+export class SessionRegistry<T extends Shutdownable = Shutdownable> {
+  private readonly entries = new Map<string, Entry<T>>()
+  private readonly pools: Record<SessionCategory, PoolConfig>
+  private readonly onEvict?: (id: string, session: T) => void | Promise<void>
+
+  constructor(opts: SessionRegistryOpts<T> = {}) {
+    this.pools = {
+      ...DEFAULT_POOLS,
+      ...opts.pools,
+    }
+    this.onEvict = opts.onEvict
+  }
+
+  size(): number {
+    return this.entries.size
+  }
+
+  /** Count sessions in a single pool — useful for diagnostics / tests. */
+  sizeOf(category: SessionCategory): number {
+    let n = 0
+    for (const entry of this.entries.values()) {
+      if (entry.category === category) n += 1
+    }
+    return n
+  }
+
+  /**
+   * Insert a session. If the target pool is at capacity, the least-recently
+   * accessed eligible entry is evicted in the background (its
+   * `shutdown()` runs without blocking the new put). If no entry is
+   * eligible (everything pinned or within the recency floor), the insert
+   * still succeeds — the pool temporarily goes over capacity and we log
+   * a warning. This avoids blocking the foreground turn on eviction.
+   */
+  put(id: string, session: T, category: SessionCategory): void {
+    const now = Date.now()
+
+    // Replace-in-place if the same id was already registered. Don't
+    // shutdown the prior entry — callers use put() for fresh inserts;
+    // replacements come through the dedicated switch path in server.ts
+    // which already calls shutdown() explicitly.
+    if (this.entries.has(id)) {
+      const entry = this.entries.get(id)!
+      entry.session = session
+      entry.category = category
+      entry.lastAccess = now
+      entry.pinned = false
+      return
+    }
+
+    this.entries.set(id, { session, category, lastAccess: now, pinned: false })
+
+    const pool = this.pools[category]
+    if (this.sizeOf(category) > pool.maxSessions) {
+      const victim = this.pickEvictionVictim(category, now)
+      if (victim) {
+        this.entries.delete(victim.id)
+        // Fire-and-forget cleanup. Eviction logic must never block the
+        // foreground insert path. onEvict gives the caller a chance to
+        // drop external bookkeeping keyed on this id (IPC auth, context
+        // maps, active-turn sets) BEFORE we kill the subprocess, so any
+        // in-flight MCP call from the soon-to-be-dead session fails
+        // with a clean "unknown session" instead of hitting a dangling
+        // auth entry.
+        void this.runEviction(victim.id, victim.entry.session)
+      } else {
+        log.warn(
+          {
+            category,
+            poolSize: this.sizeOf(category),
+            cap: pool.maxSessions,
+          },
+          'session pool over capacity but no eligible victim (all pinned or within recency floor)',
+        )
+      }
+    }
+  }
+
+  /** Read and bump LRU recency. */
+  get(id: string): T | undefined {
+    const entry = this.entries.get(id)
+    if (!entry) return undefined
+    entry.lastAccess = Date.now()
+    return entry.session
+  }
+
+  /** Read without touching LRU order — for diagnostics / server shutdown. */
+  peek(id: string): T | undefined {
+    return this.entries.get(id)?.session
+  }
+
+  /** Whether an id is registered, without side-effects. */
+  has(id: string): boolean {
+    return this.entries.has(id)
+  }
+
+  /** Pin a session so it's ineligible for eviction. Safe to call on unknown ids. */
+  pin(id: string): void {
+    const entry = this.entries.get(id)
+    if (entry) entry.pinned = true
+  }
+
+  unpin(id: string): void {
+    const entry = this.entries.get(id)
+    if (entry) entry.pinned = false
+  }
+
+  /**
+   * Remove and shut down a session. Awaits `shutdown()` so callers that
+   * want to block on cleanup (session_destroy handler, server shutdown)
+   * can do so. No-op if the id isn't registered.
+   */
+  async delete(id: string, reason: 'explicit' | 'server-shutdown' = 'explicit'): Promise<void> {
+    const entry = this.entries.get(id)
+    if (!entry) return
+    this.entries.delete(id)
+    await this.runShutdown(id, entry.session, reason)
+  }
+
+  /** Shut down every registered session. Used on graceful server shutdown. */
+  async shutdownAll(): Promise<void> {
+    const snapshot = [...this.entries]
+    this.entries.clear()
+    await Promise.all(
+      snapshot.map(([id, entry]) => this.runShutdown(id, entry.session, 'server-shutdown')),
+    )
+  }
+
+  /** Iterate over entries — used by server.ts where we need to scan all sessions (e.g. MCP probe fallout). */
+  *[Symbol.iterator](): IterableIterator<[string, T]> {
+    for (const [id, entry] of this.entries) yield [id, entry.session]
+  }
+
+  /**
+   * Iterate over just the session values. Mirrors `Map.values()` so code
+   * that previously held `Map<string, Session>` doesn't need to know it's
+   * talking to a registry. Does NOT touch LRU recency.
+   */
+  *values(): IterableIterator<T> {
+    for (const entry of this.entries.values()) yield entry.session
+  }
+
+  // ── internal ─────────────────────────────────────────────────
+
+  private pickEvictionVictim(
+    category: SessionCategory,
+    now: number,
+  ): { id: string; entry: Entry<T> } | null {
+    const floor = this.pools[category].recencyFloorMs
+    let victimId: string | null = null
+    let victimEntry: Entry<T> | null = null
+
+    // Iterate in insertion order — Map preserves it. The first non-pinned,
+    // old-enough entry in the target category wins. LRU ordering is
+    // maintained by having `get()` re-touch entries; we don't move them
+    // to the end of the Map explicitly (that's an O(n) operation for
+    // large maps), because the "age" signal (lastAccess) is kept on the
+    // entry itself and we pick the oldest by timestamp below.
+    for (const [id, entry] of this.entries) {
+      if (entry.category !== category) continue
+      if (entry.pinned) continue
+      if (now - entry.lastAccess < floor) continue
+      if (!victimEntry || entry.lastAccess < victimEntry.lastAccess) {
+        victimId = id
+        victimEntry = entry
+      }
+    }
+
+    if (victimId !== null && victimEntry !== null) {
+      return { id: victimId, entry: victimEntry }
+    }
+    return null
+  }
+
+  private async runEviction(id: string, session: T): Promise<void> {
+    // Yield to the microtask queue before checking entries.get(id). An
+    // async function runs synchronously up to its first await, so
+    // without this line `void this.runEviction(...)` inside put() would
+    // execute the replacement check AND onEvict synchronously — before
+    // any subsequent synchronous put(sameId) call has a chance to
+    // repopulate the slot. Deferring here is what lets the race guard
+    // below actually detect replacement.
+    await Promise.resolve()
+
+    // Race guard: between `entries.delete(id)` (synchronous in `put()`)
+    // and this async eviction body running, a caller could have
+    // re-registered the same id with a different session — e.g. the
+    // chat auto-resume path in server.ts. Running `onEvict(id, …)` in
+    // that case would blindly wipe the NEW session's bookkeeping
+    // (IPC auth, harness context maps) because the cleanup is keyed on
+    // id, not on the session instance. Detect replacement and skip the
+    // external cleanup hook. We still run shutdown() on the OLD session
+    // so its subprocess is reaped.
+    if (this.onEvict) {
+      const current = this.entries.get(id)
+      const replaced = current !== undefined && current.session !== session
+      if (replaced) {
+        log.info(
+          { sessionId: id },
+          'eviction target was re-registered before onEvict fired — skipping external cleanup; new session owns the id',
+        )
+      } else {
+        try {
+          await Promise.resolve(this.onEvict(id, session))
+        } catch (err) {
+          log.warn(
+            { err: (err as Error).message, sessionId: id },
+            'onEvict threw — continuing with shutdown',
+          )
+        }
+      }
+    }
+    await this.runShutdown(id, session, 'eviction')
+  }
+
+  private async runShutdown(id: string, session: T, reason: string): Promise<void> {
+    const fn = session.shutdown
+    if (typeof fn !== 'function') {
+      log.debug({ sessionId: id, reason }, 'session has no shutdown() — nothing to do')
+      return
+    }
+    try {
+      await Promise.resolve(fn.call(session))
+      log.info({ sessionId: id, reason }, 'session shutdown complete')
+    } catch (err) {
+      log.warn(
+        { err: (err as Error).message, sessionId: id, reason },
+        'session shutdown threw — continuing',
+      )
+    }
+  }
+}
diff --git a/packages/agent-server/src/server.ts b/packages/agent-server/src/server.ts
index 089aa7a..f0c7b51 100644
--- a/packages/agent-server/src/server.ts
+++ b/packages/agent-server/src/server.ts
@@ -94,11 +94,14 @@ import {
   McpManager,
   type McpServerConfig,
   type Session,
+  SessionRegistry,
   type SubAgentEventHandler,
+  type ShimProbeResult,
   appendHarnessTurn,
   assembleConversationContext,
   buildHarnessCapabilityBlock,
   buildHarnessContextPrompt,
+  buildMcpSpawnConfig,
   buildReplaySeed,
   createMcpIpcServer,
   createSession,
@@ -108,6 +111,7 @@ import {
   hashPromptVersion,
   isHarnessSession,
   matchesSurface,
+  probeMcpShim,
   readHarnessHistory,
   resolveModel,
   resumeSession,
@@ -262,8 +266,46 @@ function spawnFallback(shell: string, cols: number, rows: number, cwd: string):
 export class AgentServer {
   private wss: WebSocketServer | null = null
   private config: AgentConfig
-  private sessions: Map<string, Session | HarnessSession | CodexHarnessSession> = new Map()
+  /**
+   * Bounded, LRU-ordered registry for every live session. Replaces a
+   * plain Map. Guarantees (a) `shutdown()` runs on eviction and explicit
+   * delete, and (b) partitioned pools so routine bursts can't evict live
+   * conversations. Category is passed at `put()` time by the call site
+   * that knows intent best.
+   */
+  private sessions: SessionRegistry<Session | HarnessSession | CodexHarnessSession> =
+    new SessionRegistry<Session | HarnessSession | CodexHarnessSession>({
+      // LRU eviction only calls session.shutdown() on the evicted entry.
+      // We also own per-session bookkeeping outside the registry
+      // (mcpIpcServer auth, harness context maps, activeTurns) that
+      // would otherwise leak on eviction. Drop them here — deliberately
+      // NOT deletePersistedSession, because eviction is a memory-pressure
+      // signal, not a "user destroyed this session" signal, and the disk
+      // state must remain resumable.
+      onEvict: (id, session) => {
+        this.cleanupEvictedSessionState(id, isHarnessSession(session))
+      },
+    })
   private activeTurns: Set<string> = new Set() // sessions currently processing a turn
+  /**
+   * Latest result from `probeMcpShim()`. `null` = not yet probed. Gates
+   * the capability block on harness session creation so the model
+   * doesn't hallucinate connectors when the shim is unreachable.
+   */
+  private mcpHealth: ShimProbeResult | null = null
+  /** Epoch ms when `mcpHealth` was last written — used by the lazy freshness check. */
+  private mcpHealthAt = 0
+  /** Shared in-flight re-probe promise so concurrent callers collapse into one spawn. */
+  private mcpProbeInflight: Promise<void> | null = null
+  /**
+   * Resolves once the first `probeMcpShim()` finishes (ok or err) so
+   * `start()` can block on it before the WebSocket listener opens.
+   * Without this await, clients that connect in the first few hundred ms
+   * after boot see `mcpHealth === null` at session creation and lose the
+   * capability block for the session's entire lifetime — Codex bakes
+   * `developerInstructions` into `thread/start` and never rebuilds it.
+   */
+  private initialMcpProbe: Promise<void> | null = null
   private activeClient: WebSocket | null = null
   // Track pending interactive prompts so they can be re-sent on client reconnect
   private pendingPrompts: Map<string, { type: string; payload: Record<string, unknown> }> =
@@ -347,6 +389,54 @@ export class AgentServer {
     // Start MCP IPC server for harness sessions
     const socketPath = join(getAntonDir(), 'harness.sock')
     this.mcpIpcServer = createMcpIpcServer(socketPath, this.toolRegistry)
+
+    // Fire the boot probe. Subsequent re-probes happen lazily in
+    // `ensureMcpHealthFresh()` when a harness session is about to be
+    // created and the last probe is older than MCP_HEALTH_STALE_MS. We
+    // don't poll on an interval — the shim binary only changes on
+    // deploy, so idle re-probes just burn cycles and fill logs.
+    this.initialMcpProbe = this.runMcpProbe()
+  }
+
+  /**
+   * Run one probe and update `mcpHealth` / `mcpHealthAt`. Concurrent
+   * callers share the in-flight promise via `mcpProbeInflight` so we
+   * never spawn two probe subprocesses in parallel.
+   */
+  private runMcpProbe(): Promise<void> {
+    if (this.mcpProbeInflight) return this.mcpProbeInflight
+    this.mcpProbeInflight = (async () => {
+      try {
+        this.mcpHealth = await probeMcpShim()
+      } catch (err) {
+        log.error({ err }, 'probeMcpShim threw — treating as unhealthy')
+        this.mcpHealth = {
+          ok: false,
+          error: (err as Error).message,
+          stderrTail: [],
+          durationMs: 0,
+        }
+      } finally {
+        this.mcpHealthAt = Date.now()
+        this.mcpProbeInflight = null
+      }
+    })()
+    return this.mcpProbeInflight
+  }
+
+  /**
+   * Ensure `mcpHealth` is no older than `staleMs`. If stale (or absent),
+   * re-probe inline. Callers await this before consulting `mcpHealth` to
+   * gate the capability block on harness session creation. Safe to call
+   * concurrently — shares the in-flight promise.
+   *
+   * Default staleness: 5 minutes. The probe only protects against partial
+   * deploys / broken builds, which are discrete events, so a short-ish
+   * freshness window bounds exposure without polling idly.
+   */
+  private async ensureMcpHealthFresh(staleMs = 5 * 60_000): Promise<void> {
+    if (this.mcpHealth && Date.now() - this.mcpHealthAt < staleMs) return
+    await this.runMcpProbe()
   }
 
   setScheduler(scheduler: Scheduler) {
@@ -370,16 +460,11 @@ export class AgentServer {
       }
       this.ptys.delete(id)
     }
-    // Shutdown harness sessions and IPC server
-    for (const [, session] of this.sessions) {
-      if (isHarnessSession(session)) {
-        try {
-          await session.shutdown()
-        } catch {
-          /* best-effort */
-        }
-      }
-    }
+    // Shutdown every registered session via the registry — harness
+    // sessions get SIGTERM→SIGKILL on their CLI; Pi SDK sessions are
+    // currently a no-op (no shutdown() method) but will pick it up for
+    // free if one is added later.
+    await this.sessions.shutdownAll()
     if (this.mcpIpcServer) {
       await this.mcpIpcServer.close()
       this.mcpIpcServer = null
@@ -409,7 +494,10 @@ export class AgentServer {
 
         // Agent sessions are autonomous — auto-approve everything
         this.wireAgentAutoHandlers(session)
-        this.sessions.set(runSessionId, session)
+        // Ephemeral: agent runs are one-shot; the finally-block below
+        // deletes explicitly, but if an exception races that path the
+        // registry's ephemeral pool caps the damage.
+        this.sessions.put(runSessionId, session, 'ephemeral')
 
         log.info({ runSessionId, agentSessionId }, 'Created fresh agent run session')
 
@@ -447,7 +535,7 @@ export class AgentServer {
           throw err
         } finally {
           // Always clean up cached session (persisted to disk by the session itself)
-          this.sessions.delete(runSessionId)
+          await this.sessions.delete(runSessionId)
         }
 
         return { eventCount, summary, runSessionId }
@@ -456,6 +544,15 @@ export class AgentServer {
   }
 
   async start(): Promise<void> {
+    // Block on the initial MCP shim probe so every harness session
+    // created after `start()` returns sees a definitive mcpHealth
+    // result. Probe timeout is 5s (see probeMcpShim), so startup is
+    // bounded even if the shim is broken.
+    if (this.initialMcpProbe) {
+      log.info('awaiting initial MCP shim probe before opening WebSocket listener')
+      await this.initialMcpProbe
+    }
+
     const { port } = this.config
     const tlsPort = port + 1
 
@@ -1370,7 +1467,9 @@ export class AgentServer {
     switch (msg.type) {
       // ── Session lifecycle ──
       case 'session_create':
-        this.handleSessionCreate(msg)
+        this.handleSessionCreate(msg).catch((err) => {
+          log.error({ err, sessionId: msg.id }, 'handleSessionCreate rejected unexpectedly')
+        })
         break
 
       case 'sessions_list':
@@ -1386,7 +1485,7 @@ export class AgentServer {
         break
 
       case 'session_destroy':
-        this.handleSessionDestroy(msg)
+        void this.handleSessionDestroy(msg)
         break
 
       case 'session_provider_switch':
@@ -1766,7 +1865,7 @@ export class AgentServer {
 
   // ── Session handlers ────────────────────────────────────────────
 
-  private handleSessionCreate(msg: {
+  private async handleSessionCreate(msg: {
     id: string
     provider?: string
     model?: string
@@ -1780,6 +1879,11 @@ export class AgentServer {
       const providerConfig = this.config.providers[providerName] || DEFAULT_PROVIDERS[providerName]
 
       if (providerConfig?.type === 'harness') {
+        // Re-probe if the last probe is stale. createHarnessSession bakes
+        // the capability-block decision into the CLI's system prompt at
+        // creation time — a stale "ok" result would ship connector tools
+        // the shim can't actually serve.
+        await this.ensureMcpHealthFresh()
         // Harness sessions are created via createHarnessSession() so the
         // same setup code runs for both fresh starts and provider-switch
         // rebuilds. See that method for the full wiring.
@@ -1807,7 +1911,7 @@ export class AgentServer {
         this.wireSessionConfirmHandler(session)
         this.wirePlanConfirmHandler(session)
         this.wireAskUserHandler(session)
-        this.sessions.set(msg.id, session)
+        this.sessions.put(msg.id, session, 'conversation')
 
         this.sendToClient(Channel.AI, {
           type: 'session_created',
@@ -1907,16 +2011,12 @@ export class AgentServer {
     // don't touch this variable — so we only build it for non-codex.
     const adapter = providerName === 'codex' ? null : new ClaudeAdapter()
     const socketPath = join(getAntonDir(), 'harness.sock')
-    const shimPath = join(
-      getAntonDir(),
-      '..',
-      'node_modules',
-      '@anton',
-      'agent-core',
-      'dist',
-      'harness',
-      'anton-mcp-shim.js',
-    )
+    // Shim path + node binary come from buildMcpSpawnConfig(), which
+    // resolves the shim via import.meta.url (package-owned) and uses
+    // process.execPath for the node binary. This replaces the previous
+    // `homedir() + '../node_modules/...'` construction which broke on
+    // VPS deployments where HOME != install root.
+    const mcpSpawn = buildMcpSpawnConfig()
 
     // Resolve workspace path from the project (if any)
     let cwd: string | undefined
@@ -2110,8 +2210,22 @@ export class AgentServer {
     }
 
     const liveConnectorsAtStart = this.collectLiveConnectorsForPrompt(surfaceLabel)
-    const capabilityBlock = buildHarnessCapabilityBlock(liveConnectorsAtStart, ANTON_MCP_NAMESPACE)
-    const liveConnectorIdsAtStart = liveConnectorsAtStart.map((c) => c.id)
+    // Gate on MCP shim health. If the shim probe failed (or hasn't run
+    // yet), we skip emitting the capability block entirely — the model
+    // would otherwise believe it can call connector tools when the
+    // transport is dead. We log once at creation so the operator can
+    // correlate missing tools with a known probe failure.
+    const mcpReady = this.mcpHealth?.ok === true
+    if (!mcpReady) {
+      log.warn(
+        { sessionId: id, mcpHealth: this.mcpHealth },
+        'MCP shim probe not ok — omitting capability block from harness session',
+      )
+    }
+    const capabilityBlock = mcpReady
+      ? buildHarnessCapabilityBlock(liveConnectorsAtStart, ANTON_MCP_NAMESPACE)
+      : ''
+    const liveConnectorIdsAtStart = mcpReady ? liveConnectorsAtStart.map((c) => c.id) : []
 
     const session: HarnessSession | CodexHarnessSession =
       providerName === 'codex'
@@ -2119,9 +2233,7 @@ export class AgentServer {
             id,
             provider: providerName,
             model,
-            socketPath,
-            shimPath,
-            authToken,
+            mcp: { socketPath, authToken, spawn: mcpSpawn },
             cwd,
             buildSystemPrompt,
             capabilityBlock,
@@ -2135,14 +2247,14 @@ export class AgentServer {
             // Non-codex branch: adapter is guaranteed non-null by
             // construction above (only the codex branch sets it to null).
             adapter: adapter ?? new ClaudeAdapter(),
-            socketPath,
-            shimPath,
-            authToken,
+            mcp: { socketPath, authToken, spawn: mcpSpawn },
             cwd,
             buildSystemPrompt,
             onTurnEnd,
           })
-    this.sessions.set(id, session)
+    // Harness sessions share the `conversation` pool with Pi SDK chats.
+    // Agent-run variants come through createSession with 'ephemeral'.
+    this.sessions.put(id, session, 'conversation')
 
     this.sendToClient(Channel.AI, {
       type: 'session_created',
@@ -2211,23 +2323,22 @@ export class AgentServer {
 
     const projectId = this.harnessSessionContexts.get(msg.id)?.projectId
 
-    // Tear down the old harness session. shutdown() waits for the CLI
-    // to exit gracefully (SIGTERM → SIGKILL ladder) so we don't leak
-    // the subprocess on a hot swap.
-    try {
-      await existing.shutdown()
-    } catch (err) {
-      log.warn(
-        { err, sessionId: msg.id },
-        'harness shutdown errored during provider switch — continuing',
-      )
-    }
+    // Drop external bookkeeping BEFORE the registry tears down the
+    // subprocess, mirroring the onEvict ordering — any in-flight MCP
+    // call from the soon-to-be-dead session fails with a clean "unknown
+    // session" instead of racing a dangling auth entry.
     if (this.mcpIpcServer) {
       this.mcpIpcServer.unregisterSession(msg.id)
     }
     this.harnessSessionContexts.delete(msg.id)
-    this.sessions.delete(msg.id)
     this.activeTurns.delete(msg.id)
+    // registry.delete() runs shutdown() — the SIGTERM → SIGKILL ladder
+    // inside HarnessSession.shutdown already waits for the CLI to exit.
+    // We deliberately don't call shutdown() ourselves first; it isn't
+    // cheap (proc.killed doesn't flip fast enough to short-circuit the
+    // second call, so a manual call + registry.delete would double the
+    // ~7s delay ladder on every provider switch).
+    await this.sessions.delete(msg.id)
 
     // Build the replay seed from the mirror. Empty string is fine —
     // means no prior history yet (switch before the first turn).
@@ -2242,6 +2353,10 @@ export class AgentServer {
       )
     }
 
+    // Re-probe if the last probe is stale — the rebuild below bakes the
+    // capability block into the new CLI's system prompt.
+    await this.ensureMcpHealthFresh()
+
     // Rebuild. createHarnessSession writes meta.json via
     // ensureHarnessSessionInit, which is a no-op if the file exists —
     // so the meta.json still reflects the ORIGINAL provider/model.
@@ -2625,24 +2740,65 @@ export class AgentServer {
     })
   }
 
-  private handleSessionDestroy(msg: { id: string }) {
+  /**
+   * In-memory bookkeeping that must be dropped when a session leaves the
+   * registry via LRU eviction. Intentionally does NOT touch disk state —
+   * eviction is a memory-pressure signal and the persisted file must
+   * remain resumable.
+   */
+  private cleanupEvictedSessionState(id: string, wasHarness: boolean): void {
+    this.activeTurns.delete(id)
+    if (wasHarness && this.mcpIpcServer) {
+      this.mcpIpcServer.unregisterSession(id)
+    }
+    if (wasHarness) {
+      this.harnessSessionContexts.delete(id)
+      this.harnessExtractionCursor.delete(id)
+    }
+  }
+
+  private async handleSessionDestroy(msg: { id: string }) {
     // Extract projectId before deleting so we can update stats
-    const session = this.sessions.get(msg.id)
+    const session = this.sessions.peek(msg.id)
     const projectId =
       (session && !isHarnessSession(session) ? session.contextInfo?.projectId : undefined) ??
       this.extractProjectId(msg.id)
     const wasHarness = session ? isHarnessSession(session) : false
 
+    let idReused = false
     try {
-      this.sessions.delete(msg.id)
-      this.activeTurns.delete(msg.id)
-      deletePersistedSession(msg.id, projectId)
-      if (wasHarness && this.mcpIpcServer) {
-        this.mcpIpcServer.unregisterSession(msg.id)
-      }
-      if (wasHarness) {
-        this.harnessSessionContexts.delete(msg.id)
-        this.harnessExtractionCursor.delete(msg.id)
+      // Registry.delete() awaits session.shutdown() when present — this
+      // is the leak fix. Previously the harness codex app-server (and
+      // its shim child) leaked until the host process exited, because
+      // only `this.sessions.delete(id)` was called.
+      await this.sessions.delete(msg.id)
+
+      // Race guard: while the delete() await was suspended on
+      // session.shutdown() (which for codex harness can take several
+      // seconds), another WebSocket message — typically session_create
+      // reusing the same id — could have run to completion and re-populated
+      // the registry + harnessSessionContexts + mcpIpcServer auth under
+      // this id. Running the id-keyed cleanup below would then clobber
+      // the live new session. If the id is back in the registry, the new
+      // session already owns these maps, so we skip all tail ops
+      // (including deletePersistedSession, which would delete the new
+      // session's on-disk meta.json / messages.jsonl).
+      idReused = this.sessions.has(msg.id)
+      if (idReused) {
+        log.info(
+          { sessionId: msg.id },
+          'session id reused during destroy — skipping tail cleanup; new session owns these maps',
+        )
+      } else {
+        this.activeTurns.delete(msg.id)
+        deletePersistedSession(msg.id, projectId)
+        if (wasHarness && this.mcpIpcServer) {
+          this.mcpIpcServer.unregisterSession(msg.id)
+        }
+        if (wasHarness) {
+          this.harnessSessionContexts.delete(msg.id)
+          this.harnessExtractionCursor.delete(msg.id)
+        }
       }
     } catch (err: unknown) {
       log.error({ err, sessionId: msg.id }, 'Error destroying session')
@@ -2653,8 +2809,10 @@ export class AgentServer {
       id: msg.id,
     })
 
-    // Update project stats so session count reflects the deletion
-    if (projectId) {
+    // Update project stats so session count reflects the deletion. Skip
+    // on id-reuse: the new session's create path will refresh stats on
+    // its own, and recalculating here would race with its disk writes.
+    if (projectId && !idReused) {
       try {
         updateProjectStats(projectId)
         const project = loadProject(projectId)
@@ -2672,7 +2830,7 @@ export class AgentServer {
       }
     }
 
-    log.info({ sessionId: msg.id }, 'Session destroyed')
+    log.info({ sessionId: msg.id, idReused }, 'Session destroyed')
   }
 
   private handleSessionHistory(msg: {
@@ -2751,7 +2909,7 @@ export class AgentServer {
         this.wireSessionConfirmHandler(session)
         this.wirePlanConfirmHandler(session)
         this.wireAskUserHandler(session)
-        this.sessions.set(msg.id, session)
+        this.sessions.put(msg.id, session, 'conversation')
       }
 
       // Pi SDK session: continue with existing logic.
@@ -4354,7 +4512,7 @@ export class AgentServer {
         this.wireSessionConfirmHandler(session)
         this.wirePlanConfirmHandler(session)
         this.wireAskUserHandler(session)
-        this.sessions.set(DEFAULT_SESSION_ID, session)
+        this.sessions.put(DEFAULT_SESSION_ID, session, 'conversation')
       } else {
         // Try to resume from disk automatically
         const projectId = this.extractProjectId(sessionId)
@@ -4390,7 +4548,10 @@ export class AgentServer {
             this.wirePlanConfirmHandler(session)
             this.wireAskUserHandler(session)
           }
-          this.sessions.set(sessionId, session)
+          // Agent sessions land in `routine` pool — they run on a schedule
+          // and naturally come and go; conversations go to the `conversation`
+          // pool where they compete for chat capacity.
+          this.sessions.put(sessionId, session, isAgentSession ? 'routine' : 'conversation')
           log.info({ sessionId }, 'Auto-resumed session from disk')
         } else {
           this.sendToClient(Channel.AI, {
@@ -4467,6 +4628,9 @@ export class AgentServer {
     })
 
     try {
+      // Pin inside the try so the finally's unpin always pairs with a pin,
+      // even if a throw fires between the two statements.
+      this.sessions.pin(sessionId)
       const turnStartMs = Date.now()
       let accumulatedText = ''
       let toolCallCount = 0
@@ -4690,6 +4854,7 @@ export class AgentServer {
       // the try block exited without reaching textBuffer.destroy()
       textBuffer.destroy()
       this.activeTurns.delete(sessionId)
+      this.sessions.unpin(sessionId)
       this.sendToClient(Channel.EVENTS, {
         type: 'routine_status',
         status: 'idle',
@@ -5081,15 +5246,35 @@ export class AgentServer {
         },
         // Harness session factory — lets the runner build Codex /
         // Claude Code sessions for Slack/Telegram with the same wiring
-        // desktop sessions use (IPC auth, tool registry, mirror, etc.)
-        ({ sessionId, providerName, model, projectId, surface }) =>
-          this.createHarnessSession({
+        // desktop sessions use (IPC auth, tool registry, mirror, etc.).
+        // Async so we can freshen mcpHealth before baking the capability
+        // block into the CLI's system prompt, matching the desktop path.
+        async ({ sessionId, providerName, model, projectId, surface }) => {
+          await this.ensureMcpHealthFresh()
+          return this.createHarnessSession({
             id: sessionId,
             providerName,
             model,
             projectId,
             surface,
-          }),
+          })
+        },
+        // Session disposer — mirrors handleSessionProviderSwitch's
+        // teardown order: drop IPC auth + harness context map BEFORE
+        // awaiting registry.delete (which runs session.shutdown()).
+        // Without this, any webhook eviction path (/model switch,
+        // switchAllSessionModels, /reset) orphans the codex/claude-code
+        // subprocess because the webhook runner's own Map entry was the
+        // only thing keeping tightly-scoped state, but the real session
+        // owner is the server's SessionRegistry.
+        async (sessionId) => {
+          if (this.mcpIpcServer) {
+            this.mcpIpcServer.unregisterSession(sessionId)
+          }
+          this.harnessSessionContexts.delete(sessionId)
+          this.activeTurns.delete(sessionId)
+          await this.sessions.delete(sessionId)
+        },
       )
       // Wire scheduler access so /agents command works on Telegram/Slack
       if (this.scheduler) {
diff --git a/packages/agent-server/src/webhooks/agent-runner.ts b/packages/agent-server/src/webhooks/agent-runner.ts
index b50454f..d5fbccd 100644
--- a/packages/agent-server/src/webhooks/agent-runner.ts
+++ b/packages/agent-server/src/webhooks/agent-runner.ts
@@ -138,7 +138,19 @@ export type HarnessSessionFactory = (opts: {
   model: string
   projectId?: string
   surface: string
-}) => HarnessSession | CodexHarnessSession
+}) => Promise<HarnessSession | CodexHarnessSession>
+
+/**
+ * Server-side session teardown hook. The runner holds its own sessions
+ * Map for per-surface state (chains, pending interactions, progress),
+ * but harness subprocesses and IPC auth live in the server's
+ * SessionRegistry + harnessSessionContexts. Dropping the Map entry
+ * alone leaks the subprocess; this callback asks the server to run
+ * `registry.delete(id)` (which awaits `shutdown()`) and evict the
+ * associated IPC/context bookkeeping. Fire-and-forget callers should
+ * `void` the returned promise.
+ */
+export type SessionDisposer = (sessionId: string) => Promise<void>
 
 export class WebhookAgentRunner {
   private sessions = new Map<string, Session | HarnessSession | CodexHarnessSession>()
@@ -182,8 +194,61 @@ export class WebhookAgentRunner {
      * on harness-only models like "gpt-5.4".
      */
     private harnessSessionFactory?: HarnessSessionFactory,
+    /**
+     * Optional — when set, session eviction paths (/model switch,
+     * switchAllSessionModels, /reset, etc.) call this to have the
+     * server drop its SessionRegistry entry + IPC auth + harness
+     * context maps. Without it, harness sessions leak their codex/
+     * claude-code subprocess on every eviction.
+     */
+    private sessionDisposer?: SessionDisposer,
   ) {}
 
+  /**
+   * Tear down a webhook session on eviction. Handles:
+   *   - local Map entry (queue continues to run on the old reference if
+   *     mid-turn, which is fine — JS keeps the object alive and the
+   *     shutdown() call sequenced by the server registry only triggers
+   *     after the queue drains if the subprocess still has in-flight
+   *     work)
+   *   - pending interactive prompts (clearTimeout + resolve with a
+   *     rejection so the blocked generator unblocks cleanly)
+   *   - throttled progress-message timers
+   *   - server-side registry + IPC auth, via `sessionDisposer`
+   *
+   * Safe to call on unknown ids; all steps are no-ops when state is
+   * absent.
+   */
+  private async disposeSession(sessionId: string): Promise<void> {
+    this.sessions.delete(sessionId)
+
+    const pending = this.pendingInteractions.get(sessionId)
+    if (pending) {
+      clearTimeout(pending.timeout)
+      this.pendingInteractions.delete(sessionId)
+      // Resolve as rejection so the awaiting handler inside the session
+      // generator unblocks instead of hanging until the interaction
+      // timeout (up to 24h for plan/ask_user). No feedback text — for
+      // plan_confirm and ask_user the feedback field is forwarded to the
+      // model as plan-revision input / the first question's answer, and
+      // we don't want "Session evicted." to leak into the conversation
+      // the way it used to.
+      pending.resolve({ approved: false })
+    }
+
+    const progress = this.progressStates.get(sessionId)
+    if (progress?.pendingTimer) clearTimeout(progress.pendingTimer)
+    this.progressStates.delete(sessionId)
+
+    if (this.sessionDisposer) {
+      try {
+        await this.sessionDisposer(sessionId)
+      } catch (err) {
+        log.warn({ err, sessionId }, 'sessionDisposer threw — continuing')
+      }
+    }
+  }
+
   /** Wire access to the scheduler's job list (called by the server after init). */
   setSchedulerJobsProvider(fn: typeof this.getSchedulerJobs): void {
     this.getSchedulerJobs = fn
@@ -197,7 +262,14 @@ export class WebhookAgentRunner {
     const bindingKey = extractBindingKey(sessionId)
     const ctx: CommandContext = {
       sessionId,
-      evictSession: () => this.sessions.delete(sessionId),
+      // CommandContext.evictSession is typed `() => void` because the
+      // slash-command handlers that call it are synchronous. Fire-and-
+      // forget the dispose — the next turn the user sends will create a
+      // fresh session, and the ~seconds-long harness shutdown runs in
+      // the background.
+      evictSession: () => {
+        void this.disposeSession(sessionId)
+      },
       // CommandContext.getSession is typed for Pi SDK Session — the
       // builtin /model and /status command handlers reach into
       // session.model/provider which both Session and HarnessSession
@@ -297,7 +369,7 @@ export class WebhookAgentRunner {
       () => this.runOne(event, provider),
     )
 
-    entry.tail = myTurn.finally(() => {
+    const settled = myTurn.finally(() => {
       entry.depth -= 1
       // Drop the chain entry once it's empty, so long-idle sessions don't
       // accumulate. Compare-and-set guards against a racing new arrival
@@ -306,6 +378,16 @@ export class WebhookAgentRunner {
         this.chains.delete(event.sessionId)
       }
     })
+    // Swallow rejections on the tail itself. The router awaits `myTurn`
+    // and handles its rejection; a subsequent event would attach an
+    // onRejected handler via `prevTail.then(..., ...)`. But if no next
+    // event arrives before the microtask drains, `settled` is a
+    // handler-less rejected promise and Node emits an
+    // unhandledRejection. The swallow here is decoupled from the chain:
+    // subsequent `.then(fn, fn)` still observes the rejection
+    // independently.
+    settled.catch(() => {})
+    entry.tail = settled
     this.chains.set(event.sessionId, entry)
 
     return myTurn
@@ -328,7 +410,7 @@ export class WebhookAgentRunner {
       'runOne: start',
     )
     try {
-      const { session, isNew } = this.getOrCreateSession(sessionId, event.surface)
+      const { session, isNew } = await this.getOrCreateSession(sessionId, event.surface)
 
       // Pre-flight: check if the session's provider has a usable API key.
       // Mirrors the env-var map in Session.resolveApiKey so we can catch this
@@ -652,12 +734,12 @@ export class WebhookAgentRunner {
           // Harness model lives on the session and is read at every CLI
           // spawn, so just mutating the field is enough — the next turn
           // picks it up. Switching providers (codex ↔ claude-code)
-          // would need a fresh HarnessSession; we drop the entry so the
-          // next message recreates it via the factory.
+          // would need a fresh HarnessSession; dispose so the subprocess
+          // is reaped and the next message rebuilds via the factory.
           if (session.provider === provider) {
             session.model = model
           } else {
-            this.sessions.delete(id)
+            void this.disposeSession(id)
           }
         } else {
           session.switchModel(provider, model)
@@ -739,7 +821,9 @@ export class WebhookAgentRunner {
       saveModelOverride(bindingKey, slug)
       // Drop the live session so the next message recreates it via the
       // override we just saved. Same eviction the text-based /model does.
-      this.sessions.delete(sessionId)
+      // Fire-and-forget: the user's ack message should post immediately;
+      // the background shutdown takes ~seconds for harness sessions.
+      void this.disposeSession(sessionId)
       next = {
         body: `\u2705 Model set to *${modelName}*${providerName ? ` on *${providerName}*` : ''}.\nYour next message will use it.`,
         rows: [[{ label: '\u2039\u2039 Back', action: 'm:open' }]],
@@ -886,10 +970,10 @@ export class WebhookAgentRunner {
     return !!(envVar && process.env[envVar])
   }
 
-  private getOrCreateSession(
+  private async getOrCreateSession(
     sessionId: string,
     surface?: SurfaceInfo,
-  ): { session: Session | HarnessSession | CodexHarnessSession; isNew: boolean } {
+  ): Promise<{ session: Session | HarnessSession | CodexHarnessSession; isNew: boolean }> {
     const existing = this.sessions.get(sessionId)
     if (existing) return { session: existing, isNew: false }
 
@@ -936,7 +1020,7 @@ export class WebhookAgentRunner {
         )
       }
       const surfaceLabel = surface?.kind ?? 'webhook'
-      const session = this.harnessSessionFactory({
+      const session = await this.harnessSessionFactory({
         sessionId,
         providerName,
         model,
diff --git a/specs/features/HARNESS_ARCHITECTURE.md b/specs/features/HARNESS_ARCHITECTURE.md
index 301ae57..590ba27 100644
--- a/specs/features/HARNESS_ARCHITECTURE.md
+++ b/specs/features/HARNESS_ARCHITECTURE.md
@@ -33,6 +33,8 @@
 │  transport: stdio ↔ Unix domain socket (no TCP, no port)    │
 │  relays: tools/list + tools/call to Anton's tool registry   │
 │  auth: per-session 32-byte token in ANTON_AUTH env          │
+│  spawn: buildMcpSpawnConfig() — execPath + import.meta.url  │
+│  health: probeMcpShim() on boot + every 60s                 │
 └─────────────────────────────────────────────────────────────┘
 
   *Gemini adapter is planned (Phase 5); not yet implemented.
@@ -115,6 +117,43 @@
 
 Shared block builders live in [`packages/agent-core/src/prompt-layers.ts`](../../packages/agent-core/src/prompt-layers.ts) — same module Pi SDK's `Session.getSystemPrompt()` uses for the overlap layers. Single source.
 
+## MCP shim spawn + health
+
+### Spawn config (single source of truth)
+
+`buildMcpSpawnConfig()` in [`packages/agent-core/src/harness/mcp-spawn-config.ts`](../../packages/agent-core/src/harness/mcp-spawn-config.ts) is the **only** place that constructs the shim command line. Both `HarnessSession` and `CodexHarnessSession` accept a `mcp.spawn: McpSpawnConfig` from the server, so there's no per-adapter path logic.
+
+```ts
+interface McpSpawnConfig {
+  command: string    // always process.execPath
+  args: string[]     // [ SHIM_PATH ]
+  shimPath: string   // absolute path to anton-mcp-shim.js on disk
+}
+```
+
+Two invariants the config enforces:
+
+- **`command = process.execPath`** — not the literal `"node"`. systemd services don't inherit PATH reliably; `execPath` is guaranteed to be the node binary currently running the server. This matters on VPS deployments running under `NodeJS` + systemd.
+- **`shimPath` via `import.meta.url`** — resolved from the module's own location on disk, independent of `HOME` or cwd. The previous implementation composed the path from `homedir() + '../node_modules/...'`, which broke on VPS where the host process runs as `anton@/home/anton` but the install actually lives at `/opt/anton`.
+
+### Health probe
+
+`probeMcpShim(spawnConfig, timeoutMs)` spawns the shim in isolation, completes one `initialize` JSON-RPC round-trip, and tears down. It's called on server boot and then every 60s (the interval is `.unref()`ed so it doesn't hold the event loop open).
+
+The probe confirms:
+1. The shim binary exists at the expected path.
+2. Node loads it without syntax errors or missing imports.
+3. JSON-RPC framing works end to end.
+4. Reported `serverInfo.version` matches `getExpectedShimVersion()` (warn on mismatch — symptom of a partial deploy).
+
+It does **not** verify the IPC auth path; that's exercised by every live session the first time a tool is called.
+
+### Capability-block gating
+
+If the most recent probe failed (`mcpHealth?.ok === false`), the server **omits the harness capability block** from the system prompt and passes an empty connector list to `buildHarnessContextPrompt`. The model therefore never believes it has tools it can't call — preventing the failure mode where a broken shim path caused harness CLIs to confidently attempt `anton:gmail_search_emails` against a shim that couldn't be spawned.
+
+The probe's stderr tail and resolved `shimPath` are logged at error level on every failed interval so ops can diagnose deploy issues without shelling into the box.
+
 ## MCP tool surface (what the CLI sees via `tools/list`)
 
 Shipped today — all served by `AntonToolRegistry` over the Unix socket:
@@ -208,6 +247,8 @@ Desktop `interactionHandler` decorates the rendered message with actionable pref
 | — | **Background memory extraction** — `runHarnessMemoryExtraction` fire-and-forget after each turn | ✅ Shipped |
 | — | **Provider switch** — `session_provider_switch` message, `buildReplaySeed`, `createHarnessSession` refactor, desktop `HarnessProviderSwitch` component | ✅ Shipped |
 | — | **Codex adapter — mcp_tool_call** — new item type + field names (server/tool/object args+result) | ✅ Shipped |
+| — | **MCP production hardening** — `buildMcpSpawnConfig` single-source spawn, `probeMcpShim` health check on boot + 60s, capability-block gating on probe failure, version handshake, `process.execPath` instead of `"node"` | ✅ Shipped |
+| — | **Session lifecycle** — `SessionRegistry` with partitioned LRU pools, pin-during-turn, awaited shutdown on `session_destroy`; see [SESSION_LIFECYCLE.md](./SESSION_LIFECYCLE.md) | ✅ Shipped |
 | 5 | **Gemini adapter + per-provider tuning** | ⏳ Planned |
 | — | **Per-provider prompt variants** | ⏳ Planned (revisit with usage telemetry) |
 | — | **External HTTP MCP server** | ⏳ Planned (separate spec) |
@@ -222,7 +263,9 @@ Desktop `interactionHandler` decorates the rendered message with actionable pref
 | Codex adapter | `packages/agent-core/src/harness/adapters/codex.ts` |
 | Session lifecycle | `packages/agent-core/src/harness/harness-session.ts` |
 | MCP shim (stdio↔IPC relay) | `packages/agent-core/src/harness/anton-mcp-shim.ts` |
+| MCP spawn config + health probe | `packages/agent-core/src/harness/mcp-spawn-config.ts` |
 | IPC server (auth + tool dispatch) | `packages/agent-core/src/harness/mcp-ipc-handler.ts` |
+| Session registry (LRU + shutdown) | `packages/agent-core/src/session-registry.ts` |
 | Tool registry | `packages/agent-core/src/harness/tool-registry.ts` |
 | Shared tool catalog | `packages/agent-core/src/tools/factories.ts` (`buildAntonCoreTools`) |
 | Per-tool factories | `packages/agent-core/src/tools/{memory,database,notification,publish,activate-workflow,update-project-context}.ts` |
diff --git a/specs/features/SESSION_LIFECYCLE.md b/specs/features/SESSION_LIFECYCLE.md
new file mode 100644
index 0000000..1484c03
--- /dev/null
+++ b/specs/features/SESSION_LIFECYCLE.md
@@ -0,0 +1,101 @@
+# Session Lifecycle — Registry, Eviction, Shutdown
+
+> **Status:** authoritative description as of Apr 2026.
+> **Companion to:** [HARNESS_ARCHITECTURE.md](./HARNESS_ARCHITECTURE.md) (session ownership and turn flow within a single conversation).
+> **Scope:** every live session object the server holds — Pi SDK `Session`, `HarnessSession`, `CodexHarnessSession`, and scheduled agent runs.
+
+## Why a registry exists
+
+Pre-registry, `server.ts` stored sessions in an unbounded `Map<string, Session | HarnessSession | CodexHarnessSession>` and relied on explicit `session_destroy` messages from the client to clean up. Two production failures fell out of that:
+
+1. **`handleSessionDestroy` leak.** The handler deleted the Map entry but never `await`ed `session.shutdown()`. For Pi SDK sessions this was harmless (no owned subprocess). For harness sessions this meant the `codex app-server` subprocess and its `anton-mcp-shim` child survived until the host process died. Over hours of normal use a VPS would accumulate tens of orphaned `node` + `codex` processes.
+
+2. **Unbounded accumulation.** Nothing evicted sessions the client forgot about — crashed browsers, abandoned tabs, long-running desktops. Each harness session holds ~30 MB RSS between codex + shim + our own state, so the set grew without bound until memory pressure ended the process.
+
+The registry solves both with the same contract, so every session type gets the same guarantees.
+
+## Contract
+
+`SessionRegistry<T extends Shutdownable>` lives in [`packages/agent-core/src/session-registry.ts`](../../packages/agent-core/src/session-registry.ts). The server instantiates one for all session types:
+
+```ts
+private sessions: SessionRegistry<Session | HarnessSession | CodexHarnessSession>
+```
+
+### Category (required at put-time)
+
+Every session declares its category when inserted. The category discriminates eviction pools so a burst of short-lived runs can't evict live conversations.
+
+| Category | Used for | Default pool |
+|---|---|---|
+| `conversation` | user-driven chat (Pi SDK or harness) | `{maxSessions: 40, recencyFloorMs: 30_000}` |
+| `routine` | agent-manager routines / scheduled jobs | `{maxSessions: 40, recencyFloorMs: 30_000}` |
+| `ephemeral` | sub-agent spawns, publish jobs, forks | `{maxSessions: 20, recencyFloorMs: 10_000}` |
+
+Defaults live in `DEFAULT_POOLS`. The server uses them as-is; tests override per-case.
+
+### Methods
+
+| Method | Touches LRU? | Awaits shutdown? | Blocking? |
+|---|---|---|---|
+| `put(id, session, category)` | yes (insert timestamp) | no | yes (eviction is fire-and-forget) |
+| `get(id)` | yes | n/a | no |
+| `peek(id)` | no | n/a | no |
+| `has(id)` | no | n/a | no |
+| `pin(id)` / `unpin(id)` | no | n/a | no |
+| `delete(id, reason?)` | n/a | **yes** | yes |
+| `shutdownAll()` | n/a | yes (in parallel) | yes |
+| `size()` / `sizeOf(category)` | no | n/a | no |
+
+### Invariants
+
+- **`delete()` awaits `shutdown()`.** Callers that need to block on cleanup (the `session_destroy` handler, graceful server shutdown) can do so by awaiting the return value.
+- **Eviction never blocks the foreground put.** When a pool hits capacity, the victim's `shutdown()` runs via `void`. Foreground turn latency is independent of eviction.
+- **Replace-in-place on duplicate id does NOT shut down the prior session.** Provider-switch paths explicitly `await shutdown()` before replacing.
+- **`shutdown()` throwing does not break eviction or delete.** Errors are logged at warn level and the session is still removed from the registry.
+- **A session without `shutdown()` (Pi SDK `Session`) is handled as a no-op.** The `Shutdownable` interface marks `shutdown` optional.
+
+## Eviction policy
+
+When `put()` would exceed a pool's `maxSessions`, the registry picks the **least-recently-accessed, non-pinned, older-than-floor** entry in the same pool and shuts it down in the background.
+
+If no entry is eligible (everything is pinned or within the recency floor), the `put()` still succeeds — the pool temporarily goes over capacity and the registry logs a warn. This avoids blocking the foreground turn on eviction and keeps the failure mode visible (ops sees the warn, capacity tuning follows).
+
+The recency floor prevents thrash when the pool is at capacity and a new entry arrives in the same breath as the last one. Without it, two quick turns could evict each other.
+
+## Pinning during active turns
+
+`server.ts` pins a session for the duration of `processMessage` / `runHarness` / agent-run flows:
+
+```ts
+this.activeTurns.add(sessionId)
+this.sessions.pin(sessionId)
+try {
+  // ... stream turn ...
+} finally {
+  this.activeTurns.delete(sessionId)
+  this.sessions.unpin(sessionId)
+}
+```
+
+This guarantees a mid-stream eviction can't rip the session out from under a turn that's still writing to stdout. Routines and ephemeral runs use the same pattern.
+
+## Server-shutdown path
+
+`this.sessions.shutdownAll()` fans out `shutdown()` across every category in parallel. This replaces the pre-registry loop that iterated the Map and awaited sequentially.
+
+## Interaction with the MCP probe
+
+The registry's `shutdown()` path ensures that the codex subprocess **and** its spawned shim child are torn down on every eviction or explicit destroy. This closes the socket connection and unregisters the per-session auth token in the IPC handler, so a stale registry entry can never be revived by a shim reconnect attempt. The MCP health probe documented in [HARNESS_ARCHITECTURE.md](./HARNESS_ARCHITECTURE.md#mcp-shim-spawn--health) is independent of the registry — it runs even when no sessions exist — but it shares the same `buildMcpSpawnConfig` source of truth, so a shim that fails the probe would also fail for any new session the registry accepts.
+
+## Integration tests
+
+- [`packages/agent-core/src/__fixtures__/check-session-registry.ts`](../../packages/agent-core/src/__fixtures__/check-session-registry.ts) — LRU ordering, pool partitioning, pinning, recency-floor warn path, `delete()` awaits shutdown, replace-in-place, `peek()` no-touch, `shutdown()` error handling.
+
+  Run with: `pnpm --filter @anton/agent-core check:session-registry`
+
+## Non-goals
+
+- **Cross-session LRU policy.** Pools are independent; we don't move sessions between categories.
+- **Persistence of the registry itself.** Sessions are re-instantiated lazily from disk storage (`messages.jsonl` / meta.json) when the client reconnects — the registry is a pure in-memory structure.
+- **Timer-based idle eviction.** Eviction is triggered only by `put()` pressure. Adding an idle timer is trivial if measurements show value; today there's no evidence it's needed once the bounded pools are in place.