From 73458b45f0133dd3541ad55081fb452ec2b42b5b Mon Sep 17 00:00:00 2001 From: Rinse Date: Mon, 8 Jun 2026 07:30:54 +0000 Subject: [PATCH 1/5] fix(update): wait for daemon process exit, not RPC port, before restart (issue #70) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `bitsocial update install --restart-daemons` only waited for the old daemon's RPC port to free before restarting it. The RPC port is released by daemonServer.destroy() *before* the daemon finishes killing its kubo child, so the new daemon could be spawned while the old kubo still held the IPFS API port and die on startup with "IPFS API port already in use" — port 9138 never comes up. Seen in prod after an in-place update. Wait for the old daemon's PID to actually exit instead. The daemon's exit hook kills kubo before the process exits, so a gone PID guarantees the kubo API port is free before we restart. Adds an end-to-end test driving the real `update install` (same-version so it skips npm, a `bitsocial` PATH shim that records whether the kubo port is still bound at restart time) plus a deterministic timing hook (PKC_CLI_TEST_KUBO_SHUTDOWN_DELAY_MS). The test isolates the daemon-state directory by overriding HOME (env-paths derives it from HOME on every platform; XDG_DATA_HOME only applies on Linux) so it never enumerates or restarts other tests' daemons running in parallel. --- src/cli/commands/daemon.ts | 10 + src/cli/commands/update/install.ts | 42 ++++- test/cli/update-install-restart-race.test.ts | 181 +++++++++++++++++++ 3 files changed, 225 insertions(+), 8 deletions(-) create mode 100644 test/cli/update-install-restart-race.test.ts diff --git a/src/cli/commands/daemon.ts b/src/cli/commands/daemon.ts index c65a885..10d2eeb 100644 --- a/src/cli/commands/daemon.ts +++ b/src/cli/commands/daemon.ts @@ -538,6 +538,16 @@ export default class Daemon extends Command { }; const killKuboProcess = async () => { + // Test hook (issue #70): hold off the kubo teardown for a fixed delay so a test can + // deterministically reproduce the window where the daemon's RPC port is already free + // (daemonServer.destroy() runs in parallel) but kubo is still alive and bound. This is + // exactly the window `update install` must not restart into — see + // test/cli/update-install-restart-race.test.ts. The daemon process stays alive for the + // duration because the exit hook awaits killKuboProcess() before exiting. + const kuboShutdownDelayRaw = process.env["PKC_CLI_TEST_KUBO_SHUTDOWN_DELAY_MS"]; + const kuboShutdownDelay = kuboShutdownDelayRaw ? Number(kuboShutdownDelayRaw) : 0; + if (Number.isFinite(kuboShutdownDelay) && kuboShutdownDelay > 0) + await new Promise((resolve) => setTimeout(resolve, kuboShutdownDelay)); // Wait (bounded) for any in-flight start attempt so we kill the kubo it may still // spawn. Both promises settle on all failure paths (issue #70), but a spawned kubo // that wedges before "Daemon is ready" without exiting keeps them pending — the diff --git a/src/cli/commands/update/install.ts b/src/cli/commands/update/install.ts index 54922ae..50c2491 100644 --- a/src/cli/commands/update/install.ts +++ b/src/cli/commands/update/install.ts @@ -65,15 +65,17 @@ export default class Install extends Command { } } - // Wait for all daemon ports to be free + // Wait for each daemon process to fully exit — NOT just for its RPC port to free. + // The daemon releases its RPC port (daemonServer.destroy()) before it finishes killing + // its kubo child, so a port-only wait lets us restart while the old kubo still holds the + // IPFS API port; the new daemon then dies on startup with "IPFS API port already in use" + // (issue #70). The daemon's exit hook kills kubo before the process exits, so waiting for + // the PID to disappear guarantees the kubo port is free before we restart. for (const d of aliveDaemons) { - const url = new URL(d.pkcRpcUrl); - const port = Number(url.port); - const host = url.hostname; - this.log(`Waiting for port ${port} to be free...`); - const freed = await tcpPortUsed.waitUntilFree(port, 500, 30000).then(() => true).catch(() => false); - if (!freed) { - this.error(`Daemon (PID ${d.pid}) did not shut down within 30 seconds on port ${port}.`, { exit: 1 }); + this.log(`Waiting for daemon (PID ${d.pid}) to exit...`); + const exited = await this._waitForProcessExit(d.pid, 60000); + if (!exited) { + this.error(`Daemon (PID ${d.pid}) did not shut down within 60 seconds.`, { exit: 1 }); } } this.log("All daemons stopped."); @@ -134,6 +136,30 @@ export default class Install extends Command { } } + /** + * Poll until the given PID no longer exists (signal 0 throws ESRCH), or the timeout elapses. + * Returns true if the process exited, false on timeout. EPERM means the process is still alive + * but owned by another user, so we keep waiting. + */ + private async _waitForProcessExit(pid: number, timeoutMs: number): Promise { + const deadline = Date.now() + timeoutMs; + while (Date.now() < deadline) { + try { + process.kill(pid, 0); + } catch (e) { + if ((e as NodeJS.ErrnoException).code === "ESRCH") return true; // no such process — it exited + } + await new Promise((resolve) => setTimeout(resolve, 250)); + } + // Final check so a process that exits in the last interval isn't reported as a timeout + try { + process.kill(pid, 0); + } catch (e) { + if ((e as NodeJS.ErrnoException).code === "ESRCH") return true; + } + return false; + } + private async _restartDaemons(daemons: DaemonState[]): Promise { this.log(`Restarting ${daemons.length} daemon(s)...`); diff --git a/test/cli/update-install-restart-race.test.ts b/test/cli/update-install-restart-race.test.ts new file mode 100644 index 0000000..91d21b4 --- /dev/null +++ b/test/cli/update-install-restart-race.test.ts @@ -0,0 +1,181 @@ +// Reproduces the `bitsocial update install` restart race (issue #70). +// +// `update install` stops the running daemon, then restarts it with the new binary. The bug: +// it only waits for the daemon's *RPC port* to free before restarting. But the RPC port is +// released by daemonServer.destroy() — which happens *before* the daemon finishes killing its +// kubo child. So the new daemon can be spawned while the old kubo still holds the IPFS API +// port, and it dies on startup with "Cannot start IPFS daemon because the IPFS API port ... +// is already in use" — port 9138 never comes up. This is exactly what was observed in prod +// after updating in place. +// +// The fix makes `update install` wait for the old daemon's PID to actually exit before +// restarting. The daemon's exit hook kills kubo before the process exits, so "PID gone" +// guarantees the kubo API port is free. +// +// How this test pins the race down deterministically: +// * PKC_CLI_TEST_KUBO_SHUTDOWN_DELAY_MS makes the old daemon hold its kubo alive (and its +// process alive) for a fixed window after SIGINT, while the RPC port frees immediately. +// * A `bitsocial` PATH shim is what `update install` spawns for the restart. It records +// whether the kubo API port is still in use *at the moment of restart*, then exec's the +// real daemon. That marker — not the eventual daemon state, which self-heals via the +// watchdog — is the red/green discriminator. +// +// Isolation: the daemon-state directory `update install` enumerates lives under env-paths' +// data dir, which is derived from HOME on every platform (XDG_DATA_HOME only on Linux). We +// override HOME (and XDG_DATA_HOME) for the daemon and the install command so this test sees +// only its own daemon and never stops/restarts other tests' daemons running in parallel. + +import { spawn } from "child_process"; +import { describe, it, expect, afterEach } from "vitest"; +import { directory as randomDirectory } from "tempy"; +import { readFileSync } from "fs"; +import fs from "fs/promises"; +import path from "path"; +import { + stopPkcDaemon, + startPkcDaemon, + ensureKuboNodeStopped, + waitForKuboReady, + type ManagedChildProcess +} from "../helpers/daemon-helpers.js"; + +// Ports unique to this file (avoid collisions with other test files and external processes). +const RPC_PORT = 9468; +const KUBO_PORT = 50121; +const GATEWAY_PORT = 6581; +const RPC_URL = `ws://localhost:${RPC_PORT}`; +const KUBO_URL = `http://0.0.0.0:${KUBO_PORT}/api/v0`; +const KUBO_API_URL = `http://localhost:${KUBO_PORT}/api/v0`; +const GATEWAY_URL = `http://0.0.0.0:${GATEWAY_PORT}`; + +const CLI_VERSION = JSON.parse(readFileSync(path.join(process.cwd(), "package.json"), "utf-8")).version as string; + +// How long the old daemon holds kubo alive after SIGINT. Long enough that, with the buggy +// (RPC-port-only) wait, the restart is guaranteed to land inside the window. +const KUBO_SHUTDOWN_DELAY_MS = 8000; + +const runUpdateInstall = (env: Record): Promise<{ exitCode: number | null; stdout: string; stderr: string }> => { + return new Promise((resolve, reject) => { + const proc = spawn("node", ["./bin/run", "update", "install", CLI_VERSION], { + stdio: ["ignore", "pipe", "pipe"], + env: { ...process.env, ...env } + }); + let stdout = ""; + let stderr = ""; + proc.stdout.on("data", (d) => (stdout += d.toString())); + proc.stderr.on("data", (d) => (stderr += d.toString())); + proc.on("error", reject); + proc.on("close", (exitCode) => resolve({ exitCode, stdout, stderr })); + }); +}; + +describe("bitsocial update install restart race (issue #70)", async () => { + let daemonA: ManagedChildProcess | undefined; + let restartedPidFile: string | undefined; + + afterEach(async () => { + if (daemonA) await stopPkcDaemon(daemonA); + daemonA = undefined; + // The restarted daemon (B) is detached from update install — the shim recorded its PID. + if (restartedPidFile) { + const pids = (await fs.readFile(restartedPidFile, "utf-8").catch(() => "")) + .split("\n") + .map((s) => Number(s.trim())) + .filter((n) => Number.isInteger(n) && n > 0); + for (const pid of pids) { + try { + process.kill(pid, "SIGINT"); + } catch { + /* already gone */ + } + } + } + restartedPidFile = undefined; + await ensureKuboNodeStopped(KUBO_API_URL); + }); + + it.skipIf(process.platform === "win32")( + "does not restart the daemon until the old kubo has released the IPFS API port", + { timeout: 120000 }, + async () => { + // Isolate the daemon-state directory cross-platform (see file header). + const isolatedHome = randomDirectory(); + const tmpBin = randomDirectory(); + const markerFile = path.join(randomDirectory(), "restart-kubo-port.marker"); + const pidFile = path.join(path.dirname(markerFile), "restart-daemon.pids"); + restartedPidFile = pidFile; + const repoRoot = process.cwd(); + + // The check script the shim runs: record whether the kubo API port is still bound + // at the instant update install spawns the restart. + const checkScript = path.join(tmpBin, "record-kubo-port.cjs"); + await fs.writeFile( + checkScript, + [ + `const net = require("net");`, + `const fs = require("fs");`, + `const port = Number(new URL(process.env.KUBO_RPC_URL).port);`, + `const marker = process.env.PKC_CLI_TEST_RESTART_MARKER;`, + `const s = net.connect(port, "127.0.0.1");`, + `s.on("connect", () => { fs.appendFileSync(marker, "inuse\\n"); s.destroy(); process.exit(0); });`, + `s.on("error", () => { fs.appendFileSync(marker, "free\\n"); process.exit(0); });`, + `setTimeout(() => { try { s.destroy(); } catch {} process.exit(0); }, 3000);` + ].join("\n") + ); + + // The `bitsocial` shim update install will spawn for the restart: record the kubo port + // state, record this process's PID (it becomes the real daemon via exec), then become + // the real daemon so it actually comes back up. + const shim = path.join(tmpBin, "bitsocial"); + await fs.writeFile( + shim, + [ + `#!/bin/sh`, + `node "${checkScript}"`, + `echo "$$" >> "${pidFile}"`, + `exec node "${repoRoot}/bin/run" "$@"` + ].join("\n") + "\n" + ); + await fs.chmod(shim, 0o755); + + const sharedEnv = { + HOME: isolatedHome, + XDG_DATA_HOME: path.join(isolatedHome, ".local", "share"), + KUBO_RPC_URL: KUBO_URL, + IPFS_GATEWAY_URL: GATEWAY_URL + }; + + await ensureKuboNodeStopped(KUBO_API_URL); + + // Start daemon A — a real daemon with a real kubo, writing its state file into the + // isolated home so update install discovers only this daemon. + daemonA = await startPkcDaemon(["--pkcRpcUrl", RPC_URL], { + ...sharedEnv, + PKC_CLI_TEST_KUBO_SHUTDOWN_DELAY_MS: String(KUBO_SHUTDOWN_DELAY_MS) + }); + expect(typeof daemonA.pid).toBe("number"); + expect(await waitForKuboReady(KUBO_API_URL, 45000)).toBe(true); + + // Run the real `bitsocial update install `: same version => skips npm, + // but runs the full stop + _restartDaemons path. The shim (first on PATH) is what it + // spawns for the restart. + const result = await runUpdateInstall({ + ...sharedEnv, + PATH: `${tmpBin}:${process.env.PATH}`, + PKC_CLI_TEST_RESTART_MARKER: markerFile + }); + + const marker = await fs.readFile(markerFile, "utf-8").catch(() => ""); + const observations = marker.trim().split("\n").filter(Boolean); + + // Sanity: exactly one daemon was discovered and restarted (the shim ran once). If this + // is >1 the state dir wasn't isolated and we picked up another test's daemon. + expect(observations.length, `update install output:\n${result.stdout}\n${result.stderr}`).toBe(1); + + // The fix's contract: when update install restarts the daemon, the old kubo must already + // be gone. With the bug (wait for RPC port only) this is "inuse" => the restarted daemon + // collides on the IPFS API port. + expect(observations[0]).toBe("free"); + } + ); +}); From 86803866ebc34df004dabf5d719b186dff020447 Mon Sep 17 00:00:00 2001 From: Rinse Date: Mon, 8 Jun 2026 08:31:59 +0000 Subject: [PATCH 2/5] fix(update): align restart wait budget with daemon shutdown budget The update-install restart flow waited only 60s for a stopped daemon's PID to exit, but the daemon's async exit hook is given 120s to shut down kubo + the RPC server. A slow-but-valid shutdown (60-120s) would abort `update install --restart-daemons` midway with daemons stopped and nothing installed. Hoist a shared DAEMON_SHUTDOWN_TIMEOUT_MS constant and drive both the daemon exit hook and the update-install wait from it, so the orchestrator's patience matches the daemon's contract. Folds the timeout into the error message so the two can't drift. Addresses CodeRabbit review on #74. --- src/cli/commands/daemon.ts | 4 ++-- src/cli/commands/update/install.ts | 9 ++++++--- src/common-utils/daemon-state.ts | 8 ++++++++ 3 files changed, 16 insertions(+), 5 deletions(-) diff --git a/src/cli/commands/daemon.ts b/src/cli/commands/daemon.ts index 10d2eeb..945146e 100644 --- a/src/cli/commands/daemon.ts +++ b/src/cli/commands/daemon.ts @@ -19,7 +19,7 @@ import { printBanner } from "../ascii-banner.js"; import { loadChallengesIntoPKC, formatChallengeNameVersion } from "../../challenge-packages/challenge-utils.js"; import { migrateDataDirectory } from "../../common-utils/data-migration.js"; import { createBsoResolvers, DEFAULT_PROVIDERS } from "../../common-utils/resolvers.js"; -import { pruneStaleStates, writeDaemonState, deleteDaemonState } from "../../common-utils/daemon-state.js"; +import { pruneStaleStates, writeDaemonState, deleteDaemonState, DAEMON_SHUTDOWN_TIMEOUT_MS } from "../../common-utils/daemon-state.js"; import { createDaemonFileLogger, type DaemonFileLogger } from "../../common-utils/daemon-file-logger.js"; import fs from "fs"; import fsPromise from "fs/promises"; @@ -621,7 +621,7 @@ export default class Daemon extends Command { await kuboKillPromise; }, - { wait: 120000 } // could take two minutes to shut down + { wait: DAEMON_SHUTDOWN_TIMEOUT_MS } // could take two minutes to shut down ); // Emergency cleanup: if the process force-exits (e.g. double Ctrl+C), diff --git a/src/cli/commands/update/install.ts b/src/cli/commands/update/install.ts index 50c2491..da2e60f 100644 --- a/src/cli/commands/update/install.ts +++ b/src/cli/commands/update/install.ts @@ -4,7 +4,7 @@ import tcpPortUsed from "tcp-port-used"; import { fetchLatestVersion, installGlobal } from "../../../update/npm-registry.js"; import { fastInstallGlobal } from "../../../update/fast-update.js"; import { compareVersions } from "../../../update/semver.js"; -import { getAliveDaemonStates, type DaemonState } from "../../../common-utils/daemon-state.js"; +import { getAliveDaemonStates, DAEMON_SHUTDOWN_TIMEOUT_MS, type DaemonState } from "../../../common-utils/daemon-state.js"; export default class Install extends Command { static override description = "Install a specific version of bitsocial from npm"; @@ -73,9 +73,12 @@ export default class Install extends Command { // the PID to disappear guarantees the kubo port is free before we restart. for (const d of aliveDaemons) { this.log(`Waiting for daemon (PID ${d.pid}) to exit...`); - const exited = await this._waitForProcessExit(d.pid, 60000); + const exited = await this._waitForProcessExit(d.pid, DAEMON_SHUTDOWN_TIMEOUT_MS); if (!exited) { - this.error(`Daemon (PID ${d.pid}) did not shut down within 60 seconds.`, { exit: 1 }); + this.error( + `Daemon (PID ${d.pid}) did not shut down within ${DAEMON_SHUTDOWN_TIMEOUT_MS / 1000} seconds.`, + { exit: 1 } + ); } } this.log("All daemons stopped."); diff --git a/src/common-utils/daemon-state.ts b/src/common-utils/daemon-state.ts index 70697f6..18c3aac 100644 --- a/src/common-utils/daemon-state.ts +++ b/src/common-utils/daemon-state.ts @@ -8,6 +8,14 @@ const execFileAsync = promisify(execFile); const DAEMON_STATES_DIR = path.join(defaults.PKC_DATA_PATH, ".daemon_states"); +/** + * Maximum time a daemon is allowed to shut down its kubo + RPC server during its + * async exit hook. The `update install --restart-daemons` orchestrator must wait at + * least this long for a stopped daemon's PID to disappear before giving up — otherwise + * a slow-but-valid shutdown (within the daemon's own contract) aborts the update midway. + */ +export const DAEMON_SHUTDOWN_TIMEOUT_MS = 120000; + export interface DaemonState { pid: number; startedAt: string; From f3f403ca32f8222ab14136fdbb7d48bf3ad225a2 Mon Sep 17 00:00:00 2001 From: Rinse Date: Mon, 8 Jun 2026 09:55:47 +0000 Subject: [PATCH 3/5] test(daemon-state): skip PID-reuse identity tests on Windows MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The PID-reuse scenario (issue #66) is a Docker-on-Linux problem and the identity check that detects it relies on Unix process introspection (/proc, ps) plus Unix tooling (sleep, bash). On Windows the identity is undeterminable, so the code intentionally degrades to liveness-only — the safe fallback. The test asserted Unix-only behavior, so it failed deterministically on windows-latest CI. --- test/common-utils/daemon-state.test.ts | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/test/common-utils/daemon-state.test.ts b/test/common-utils/daemon-state.test.ts index 2221839..a30d7f4 100644 --- a/test/common-utils/daemon-state.test.ts +++ b/test/common-utils/daemon-state.test.ts @@ -122,7 +122,14 @@ describe("daemon-state", () => { // and on the host PID 8 belongs to a kernel thread — an alive but unrelated process. The // bare `process.kill(pid, 0)` liveness check passed, so `update install` SIGINT'd the // unrelated process and restarted the daemon twice on the same port. - describe("getAliveDaemonStates — PID reused by an unrelated process", () => { + // + // Skipped on Windows: the PID-reuse scenario (issue #66) is a Docker-on-Linux problem + // (a container PID colliding with a host kernel thread), and the identity check that + // detects it relies on Unix process introspection (/proc, `ps`) plus Unix tooling + // (`sleep`, `bash`). On Windows the identity is undeterminable, so the code intentionally + // degrades to liveness-only — the conservative, safe fallback. There is nothing + // Windows-specific to assert here. + describe.skipIf(process.platform === "win32")("getAliveDaemonStates — PID reused by an unrelated process", () => { const DAEMON_STATES_DIR = path.join(defaults.PKC_DATA_PATH, ".daemon_states"); let child: ChildProcess | undefined; From 3c986e05270e818d4843d7130d39c29dff004655 Mon Sep 17 00:00:00 2001 From: Rinse Date: Mon, 8 Jun 2026 10:11:26 +0000 Subject: [PATCH 4/5] test(update): assert update install exits 0 before checking restart marker Addresses CodeRabbit review on PR #74: the restart-race test inferred success only indirectly via the marker observations. Add an explicit exitCode === 0 assertion so a non-zero exit from the command itself fails the test even when the marker happens to look right. --- test/cli/update-install-restart-race.test.ts | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/cli/update-install-restart-race.test.ts b/test/cli/update-install-restart-race.test.ts index 91d21b4..64c5c63 100644 --- a/test/cli/update-install-restart-race.test.ts +++ b/test/cli/update-install-restart-race.test.ts @@ -165,6 +165,10 @@ describe("bitsocial update install restart race (issue #70)", async () => { PKC_CLI_TEST_RESTART_MARKER: markerFile }); + // The command itself must have succeeded — a non-zero exit means the restart path + // failed even if the marker observations happen to look right. + expect(result.exitCode, `update install output:\n${result.stdout}\n${result.stderr}`).toBe(0); + const marker = await fs.readFile(markerFile, "utf-8").catch(() => ""); const observations = marker.trim().split("\n").filter(Boolean); From 475a9439dd173d838ee7dc5abff6b6976cd69492 Mon Sep 17 00:00:00 2001 From: Rinse Date: Mon, 8 Jun 2026 10:24:45 +0000 Subject: [PATCH 5/5] test(kubo): retry RPC+gateway integration startup on port race The kubo RPC + gateway integration beforeAll picked ephemeral ports with getAvailablePort(), which closes its probe socket before returning the port number. kubo binds the port itself afterwards, leaving a TOCTOU window where another process can claim it. On CI this surfaced as the gateway failing to bind (serveHTTPGateway: ... address already in use) and the suite failing flakily. Wrap startup in a bounded retry (4 attempts) that picks fresh ports and a fresh repo on any 'already in use' rejection, and re-throws every other error so real regressions still fail fast. --- test/kubo/kuboRpcGateway.integration.test.ts | 48 ++++++++++++++------ 1 file changed, 35 insertions(+), 13 deletions(-) diff --git a/test/kubo/kuboRpcGateway.integration.test.ts b/test/kubo/kuboRpcGateway.integration.test.ts index 04533e5..5f5d3a1 100644 --- a/test/kubo/kuboRpcGateway.integration.test.ts +++ b/test/kubo/kuboRpcGateway.integration.test.ts @@ -176,20 +176,42 @@ describe("kubo RPC + gateway integration", { timeout: 120_000 }, () => { let ipfsRepoPath: string; beforeAll(async () => { - const dataPath = tempDirectory(); - ipfsRepoPath = path.join(dataPath, "ipfs-repo"); - process.env.IPFS_PATH = ipfsRepoPath; - - const apiPort = await getAvailablePort(); - const gatewayPort = await getAvailablePort(); - apiUrl = new URL(`http://127.0.0.1:${apiPort}`); - gatewayUrl = new URL(`http://127.0.0.1:${gatewayPort}`); - - await preInitKuboWithEphemeralSwarm(ipfsRepoPath, apiUrl, gatewayUrl); + // kubo binds the API/gateway ports itself, so there's an unavoidable window between + // getAvailablePort() closing its probe socket and kubo binding the port where another + // process can claim it. The collision surfaces either as a pre-spawn port check throw + // ("...is already in use.") or as kubo losing the gateway bind on startup and exiting + // prematurely ("serveHTTPGateway: ... address already in use"). Both reject the awaited + // startKuboNode() promise with a message containing "already in use", so retry on that + // with a fresh repo + fresh ports. Anything else is a real failure and is re-thrown. + const maxAttempts = 4; + let lastError: unknown; + for (let attempt = 1; attempt <= maxAttempts; attempt++) { + const dataPath = tempDirectory(); + ipfsRepoPath = path.join(dataPath, "ipfs-repo"); + process.env.IPFS_PATH = ipfsRepoPath; + + const apiPort = await getAvailablePort(); + const gatewayPort = await getAvailablePort(); + apiUrl = new URL(`http://127.0.0.1:${apiPort}`); + gatewayUrl = new URL(`http://127.0.0.1:${gatewayPort}`); + + await preInitKuboWithEphemeralSwarm(ipfsRepoPath, apiUrl, gatewayUrl); - kuboProcess = await startKuboNode(apiUrl, gatewayUrl, dataPath); - - await waitForOkResponse(() => fetch(new URL("/api/v0/version", apiUrl), { method: "POST" })); + try { + kuboProcess = await startKuboNode(apiUrl, gatewayUrl, dataPath); + await waitForOkResponse(() => fetch(new URL("/api/v0/version", apiUrl), { method: "POST" })); + return; + } catch (error) { + lastError = error; + const message = error instanceof Error ? error.message : String(error); + // On rejection startKuboNode never leaks a live process (it either never spawned + // or already exited), so there's nothing to kill before the next attempt. + kuboProcess = undefined; + if (!/already in use/i.test(message) || attempt === maxAttempts) throw error; + } + } + // Unreachable: the loop either returns on success or throws. Keeps lastError referenced. + throw lastError; }); afterAll(async () => {