diff --git a/src/cli/commands/daemon.ts b/src/cli/commands/daemon.ts index c65a885..945146e 100644 --- a/src/cli/commands/daemon.ts +++ b/src/cli/commands/daemon.ts @@ -19,7 +19,7 @@ import { printBanner } from "../ascii-banner.js"; import { loadChallengesIntoPKC, formatChallengeNameVersion } from "../../challenge-packages/challenge-utils.js"; import { migrateDataDirectory } from "../../common-utils/data-migration.js"; import { createBsoResolvers, DEFAULT_PROVIDERS } from "../../common-utils/resolvers.js"; -import { pruneStaleStates, writeDaemonState, deleteDaemonState } from "../../common-utils/daemon-state.js"; +import { pruneStaleStates, writeDaemonState, deleteDaemonState, DAEMON_SHUTDOWN_TIMEOUT_MS } from "../../common-utils/daemon-state.js"; import { createDaemonFileLogger, type DaemonFileLogger } from "../../common-utils/daemon-file-logger.js"; import fs from "fs"; import fsPromise from "fs/promises"; @@ -538,6 +538,16 @@ export default class Daemon extends Command { }; const killKuboProcess = async () => { + // Test hook (issue #70): hold off the kubo teardown for a fixed delay so a test can + // deterministically reproduce the window where the daemon's RPC port is already free + // (daemonServer.destroy() runs in parallel) but kubo is still alive and bound. This is + // exactly the window `update install` must not restart into — see + // test/cli/update-install-restart-race.test.ts. The daemon process stays alive for the + // duration because the exit hook awaits killKuboProcess() before exiting. + const kuboShutdownDelayRaw = process.env["PKC_CLI_TEST_KUBO_SHUTDOWN_DELAY_MS"]; + const kuboShutdownDelay = kuboShutdownDelayRaw ? Number(kuboShutdownDelayRaw) : 0; + if (Number.isFinite(kuboShutdownDelay) && kuboShutdownDelay > 0) + await new Promise((resolve) => setTimeout(resolve, kuboShutdownDelay)); // Wait (bounded) for any in-flight start attempt so we kill the kubo it may still // spawn. Both promises settle on all failure paths (issue #70), but a spawned kubo // that wedges before "Daemon is ready" without exiting keeps them pending — the @@ -611,7 +621,7 @@ export default class Daemon extends Command { await kuboKillPromise; }, - { wait: 120000 } // could take two minutes to shut down + { wait: DAEMON_SHUTDOWN_TIMEOUT_MS } // could take two minutes to shut down ); // Emergency cleanup: if the process force-exits (e.g. double Ctrl+C), diff --git a/src/cli/commands/update/install.ts b/src/cli/commands/update/install.ts index 54922ae..da2e60f 100644 --- a/src/cli/commands/update/install.ts +++ b/src/cli/commands/update/install.ts @@ -4,7 +4,7 @@ import tcpPortUsed from "tcp-port-used"; import { fetchLatestVersion, installGlobal } from "../../../update/npm-registry.js"; import { fastInstallGlobal } from "../../../update/fast-update.js"; import { compareVersions } from "../../../update/semver.js"; -import { getAliveDaemonStates, type DaemonState } from "../../../common-utils/daemon-state.js"; +import { getAliveDaemonStates, DAEMON_SHUTDOWN_TIMEOUT_MS, type DaemonState } from "../../../common-utils/daemon-state.js"; export default class Install extends Command { static override description = "Install a specific version of bitsocial from npm"; @@ -65,15 +65,20 @@ export default class Install extends Command { } } - // Wait for all daemon ports to be free + // Wait for each daemon process to fully exit — NOT just for its RPC port to free. + // The daemon releases its RPC port (daemonServer.destroy()) before it finishes killing + // its kubo child, so a port-only wait lets us restart while the old kubo still holds the + // IPFS API port; the new daemon then dies on startup with "IPFS API port already in use" + // (issue #70). The daemon's exit hook kills kubo before the process exits, so waiting for + // the PID to disappear guarantees the kubo port is free before we restart. for (const d of aliveDaemons) { - const url = new URL(d.pkcRpcUrl); - const port = Number(url.port); - const host = url.hostname; - this.log(`Waiting for port ${port} to be free...`); - const freed = await tcpPortUsed.waitUntilFree(port, 500, 30000).then(() => true).catch(() => false); - if (!freed) { - this.error(`Daemon (PID ${d.pid}) did not shut down within 30 seconds on port ${port}.`, { exit: 1 }); + this.log(`Waiting for daemon (PID ${d.pid}) to exit...`); + const exited = await this._waitForProcessExit(d.pid, DAEMON_SHUTDOWN_TIMEOUT_MS); + if (!exited) { + this.error( + `Daemon (PID ${d.pid}) did not shut down within ${DAEMON_SHUTDOWN_TIMEOUT_MS / 1000} seconds.`, + { exit: 1 } + ); } } this.log("All daemons stopped."); @@ -134,6 +139,30 @@ export default class Install extends Command { } } + /** + * Poll until the given PID no longer exists (signal 0 throws ESRCH), or the timeout elapses. + * Returns true if the process exited, false on timeout. EPERM means the process is still alive + * but owned by another user, so we keep waiting. + */ + private async _waitForProcessExit(pid: number, timeoutMs: number): Promise { + const deadline = Date.now() + timeoutMs; + while (Date.now() < deadline) { + try { + process.kill(pid, 0); + } catch (e) { + if ((e as NodeJS.ErrnoException).code === "ESRCH") return true; // no such process — it exited + } + await new Promise((resolve) => setTimeout(resolve, 250)); + } + // Final check so a process that exits in the last interval isn't reported as a timeout + try { + process.kill(pid, 0); + } catch (e) { + if ((e as NodeJS.ErrnoException).code === "ESRCH") return true; + } + return false; + } + private async _restartDaemons(daemons: DaemonState[]): Promise { this.log(`Restarting ${daemons.length} daemon(s)...`); diff --git a/src/common-utils/daemon-state.ts b/src/common-utils/daemon-state.ts index 70697f6..18c3aac 100644 --- a/src/common-utils/daemon-state.ts +++ b/src/common-utils/daemon-state.ts @@ -8,6 +8,14 @@ const execFileAsync = promisify(execFile); const DAEMON_STATES_DIR = path.join(defaults.PKC_DATA_PATH, ".daemon_states"); +/** + * Maximum time a daemon is allowed to shut down its kubo + RPC server during its + * async exit hook. The `update install --restart-daemons` orchestrator must wait at + * least this long for a stopped daemon's PID to disappear before giving up — otherwise + * a slow-but-valid shutdown (within the daemon's own contract) aborts the update midway. + */ +export const DAEMON_SHUTDOWN_TIMEOUT_MS = 120000; + export interface DaemonState { pid: number; startedAt: string; diff --git a/test/cli/update-install-restart-race.test.ts b/test/cli/update-install-restart-race.test.ts new file mode 100644 index 0000000..64c5c63 --- /dev/null +++ b/test/cli/update-install-restart-race.test.ts @@ -0,0 +1,185 @@ +// Reproduces the `bitsocial update install` restart race (issue #70). +// +// `update install` stops the running daemon, then restarts it with the new binary. The bug: +// it only waits for the daemon's *RPC port* to free before restarting. But the RPC port is +// released by daemonServer.destroy() — which happens *before* the daemon finishes killing its +// kubo child. So the new daemon can be spawned while the old kubo still holds the IPFS API +// port, and it dies on startup with "Cannot start IPFS daemon because the IPFS API port ... +// is already in use" — port 9138 never comes up. This is exactly what was observed in prod +// after updating in place. +// +// The fix makes `update install` wait for the old daemon's PID to actually exit before +// restarting. The daemon's exit hook kills kubo before the process exits, so "PID gone" +// guarantees the kubo API port is free. +// +// How this test pins the race down deterministically: +// * PKC_CLI_TEST_KUBO_SHUTDOWN_DELAY_MS makes the old daemon hold its kubo alive (and its +// process alive) for a fixed window after SIGINT, while the RPC port frees immediately. +// * A `bitsocial` PATH shim is what `update install` spawns for the restart. It records +// whether the kubo API port is still in use *at the moment of restart*, then exec's the +// real daemon. That marker — not the eventual daemon state, which self-heals via the +// watchdog — is the red/green discriminator. +// +// Isolation: the daemon-state directory `update install` enumerates lives under env-paths' +// data dir, which is derived from HOME on every platform (XDG_DATA_HOME only on Linux). We +// override HOME (and XDG_DATA_HOME) for the daemon and the install command so this test sees +// only its own daemon and never stops/restarts other tests' daemons running in parallel. + +import { spawn } from "child_process"; +import { describe, it, expect, afterEach } from "vitest"; +import { directory as randomDirectory } from "tempy"; +import { readFileSync } from "fs"; +import fs from "fs/promises"; +import path from "path"; +import { + stopPkcDaemon, + startPkcDaemon, + ensureKuboNodeStopped, + waitForKuboReady, + type ManagedChildProcess +} from "../helpers/daemon-helpers.js"; + +// Ports unique to this file (avoid collisions with other test files and external processes). +const RPC_PORT = 9468; +const KUBO_PORT = 50121; +const GATEWAY_PORT = 6581; +const RPC_URL = `ws://localhost:${RPC_PORT}`; +const KUBO_URL = `http://0.0.0.0:${KUBO_PORT}/api/v0`; +const KUBO_API_URL = `http://localhost:${KUBO_PORT}/api/v0`; +const GATEWAY_URL = `http://0.0.0.0:${GATEWAY_PORT}`; + +const CLI_VERSION = JSON.parse(readFileSync(path.join(process.cwd(), "package.json"), "utf-8")).version as string; + +// How long the old daemon holds kubo alive after SIGINT. Long enough that, with the buggy +// (RPC-port-only) wait, the restart is guaranteed to land inside the window. +const KUBO_SHUTDOWN_DELAY_MS = 8000; + +const runUpdateInstall = (env: Record): Promise<{ exitCode: number | null; stdout: string; stderr: string }> => { + return new Promise((resolve, reject) => { + const proc = spawn("node", ["./bin/run", "update", "install", CLI_VERSION], { + stdio: ["ignore", "pipe", "pipe"], + env: { ...process.env, ...env } + }); + let stdout = ""; + let stderr = ""; + proc.stdout.on("data", (d) => (stdout += d.toString())); + proc.stderr.on("data", (d) => (stderr += d.toString())); + proc.on("error", reject); + proc.on("close", (exitCode) => resolve({ exitCode, stdout, stderr })); + }); +}; + +describe("bitsocial update install restart race (issue #70)", async () => { + let daemonA: ManagedChildProcess | undefined; + let restartedPidFile: string | undefined; + + afterEach(async () => { + if (daemonA) await stopPkcDaemon(daemonA); + daemonA = undefined; + // The restarted daemon (B) is detached from update install — the shim recorded its PID. + if (restartedPidFile) { + const pids = (await fs.readFile(restartedPidFile, "utf-8").catch(() => "")) + .split("\n") + .map((s) => Number(s.trim())) + .filter((n) => Number.isInteger(n) && n > 0); + for (const pid of pids) { + try { + process.kill(pid, "SIGINT"); + } catch { + /* already gone */ + } + } + } + restartedPidFile = undefined; + await ensureKuboNodeStopped(KUBO_API_URL); + }); + + it.skipIf(process.platform === "win32")( + "does not restart the daemon until the old kubo has released the IPFS API port", + { timeout: 120000 }, + async () => { + // Isolate the daemon-state directory cross-platform (see file header). + const isolatedHome = randomDirectory(); + const tmpBin = randomDirectory(); + const markerFile = path.join(randomDirectory(), "restart-kubo-port.marker"); + const pidFile = path.join(path.dirname(markerFile), "restart-daemon.pids"); + restartedPidFile = pidFile; + const repoRoot = process.cwd(); + + // The check script the shim runs: record whether the kubo API port is still bound + // at the instant update install spawns the restart. + const checkScript = path.join(tmpBin, "record-kubo-port.cjs"); + await fs.writeFile( + checkScript, + [ + `const net = require("net");`, + `const fs = require("fs");`, + `const port = Number(new URL(process.env.KUBO_RPC_URL).port);`, + `const marker = process.env.PKC_CLI_TEST_RESTART_MARKER;`, + `const s = net.connect(port, "127.0.0.1");`, + `s.on("connect", () => { fs.appendFileSync(marker, "inuse\\n"); s.destroy(); process.exit(0); });`, + `s.on("error", () => { fs.appendFileSync(marker, "free\\n"); process.exit(0); });`, + `setTimeout(() => { try { s.destroy(); } catch {} process.exit(0); }, 3000);` + ].join("\n") + ); + + // The `bitsocial` shim update install will spawn for the restart: record the kubo port + // state, record this process's PID (it becomes the real daemon via exec), then become + // the real daemon so it actually comes back up. + const shim = path.join(tmpBin, "bitsocial"); + await fs.writeFile( + shim, + [ + `#!/bin/sh`, + `node "${checkScript}"`, + `echo "$$" >> "${pidFile}"`, + `exec node "${repoRoot}/bin/run" "$@"` + ].join("\n") + "\n" + ); + await fs.chmod(shim, 0o755); + + const sharedEnv = { + HOME: isolatedHome, + XDG_DATA_HOME: path.join(isolatedHome, ".local", "share"), + KUBO_RPC_URL: KUBO_URL, + IPFS_GATEWAY_URL: GATEWAY_URL + }; + + await ensureKuboNodeStopped(KUBO_API_URL); + + // Start daemon A — a real daemon with a real kubo, writing its state file into the + // isolated home so update install discovers only this daemon. + daemonA = await startPkcDaemon(["--pkcRpcUrl", RPC_URL], { + ...sharedEnv, + PKC_CLI_TEST_KUBO_SHUTDOWN_DELAY_MS: String(KUBO_SHUTDOWN_DELAY_MS) + }); + expect(typeof daemonA.pid).toBe("number"); + expect(await waitForKuboReady(KUBO_API_URL, 45000)).toBe(true); + + // Run the real `bitsocial update install `: same version => skips npm, + // but runs the full stop + _restartDaemons path. The shim (first on PATH) is what it + // spawns for the restart. + const result = await runUpdateInstall({ + ...sharedEnv, + PATH: `${tmpBin}:${process.env.PATH}`, + PKC_CLI_TEST_RESTART_MARKER: markerFile + }); + + // The command itself must have succeeded — a non-zero exit means the restart path + // failed even if the marker observations happen to look right. + expect(result.exitCode, `update install output:\n${result.stdout}\n${result.stderr}`).toBe(0); + + const marker = await fs.readFile(markerFile, "utf-8").catch(() => ""); + const observations = marker.trim().split("\n").filter(Boolean); + + // Sanity: exactly one daemon was discovered and restarted (the shim ran once). If this + // is >1 the state dir wasn't isolated and we picked up another test's daemon. + expect(observations.length, `update install output:\n${result.stdout}\n${result.stderr}`).toBe(1); + + // The fix's contract: when update install restarts the daemon, the old kubo must already + // be gone. With the bug (wait for RPC port only) this is "inuse" => the restarted daemon + // collides on the IPFS API port. + expect(observations[0]).toBe("free"); + } + ); +}); diff --git a/test/common-utils/daemon-state.test.ts b/test/common-utils/daemon-state.test.ts index 2221839..a30d7f4 100644 --- a/test/common-utils/daemon-state.test.ts +++ b/test/common-utils/daemon-state.test.ts @@ -122,7 +122,14 @@ describe("daemon-state", () => { // and on the host PID 8 belongs to a kernel thread — an alive but unrelated process. The // bare `process.kill(pid, 0)` liveness check passed, so `update install` SIGINT'd the // unrelated process and restarted the daemon twice on the same port. - describe("getAliveDaemonStates — PID reused by an unrelated process", () => { + // + // Skipped on Windows: the PID-reuse scenario (issue #66) is a Docker-on-Linux problem + // (a container PID colliding with a host kernel thread), and the identity check that + // detects it relies on Unix process introspection (/proc, `ps`) plus Unix tooling + // (`sleep`, `bash`). On Windows the identity is undeterminable, so the code intentionally + // degrades to liveness-only — the conservative, safe fallback. There is nothing + // Windows-specific to assert here. + describe.skipIf(process.platform === "win32")("getAliveDaemonStates — PID reused by an unrelated process", () => { const DAEMON_STATES_DIR = path.join(defaults.PKC_DATA_PATH, ".daemon_states"); let child: ChildProcess | undefined; diff --git a/test/kubo/kuboRpcGateway.integration.test.ts b/test/kubo/kuboRpcGateway.integration.test.ts index 04533e5..5f5d3a1 100644 --- a/test/kubo/kuboRpcGateway.integration.test.ts +++ b/test/kubo/kuboRpcGateway.integration.test.ts @@ -176,20 +176,42 @@ describe("kubo RPC + gateway integration", { timeout: 120_000 }, () => { let ipfsRepoPath: string; beforeAll(async () => { - const dataPath = tempDirectory(); - ipfsRepoPath = path.join(dataPath, "ipfs-repo"); - process.env.IPFS_PATH = ipfsRepoPath; - - const apiPort = await getAvailablePort(); - const gatewayPort = await getAvailablePort(); - apiUrl = new URL(`http://127.0.0.1:${apiPort}`); - gatewayUrl = new URL(`http://127.0.0.1:${gatewayPort}`); - - await preInitKuboWithEphemeralSwarm(ipfsRepoPath, apiUrl, gatewayUrl); + // kubo binds the API/gateway ports itself, so there's an unavoidable window between + // getAvailablePort() closing its probe socket and kubo binding the port where another + // process can claim it. The collision surfaces either as a pre-spawn port check throw + // ("...is already in use.") or as kubo losing the gateway bind on startup and exiting + // prematurely ("serveHTTPGateway: ... address already in use"). Both reject the awaited + // startKuboNode() promise with a message containing "already in use", so retry on that + // with a fresh repo + fresh ports. Anything else is a real failure and is re-thrown. + const maxAttempts = 4; + let lastError: unknown; + for (let attempt = 1; attempt <= maxAttempts; attempt++) { + const dataPath = tempDirectory(); + ipfsRepoPath = path.join(dataPath, "ipfs-repo"); + process.env.IPFS_PATH = ipfsRepoPath; + + const apiPort = await getAvailablePort(); + const gatewayPort = await getAvailablePort(); + apiUrl = new URL(`http://127.0.0.1:${apiPort}`); + gatewayUrl = new URL(`http://127.0.0.1:${gatewayPort}`); + + await preInitKuboWithEphemeralSwarm(ipfsRepoPath, apiUrl, gatewayUrl); - kuboProcess = await startKuboNode(apiUrl, gatewayUrl, dataPath); - - await waitForOkResponse(() => fetch(new URL("/api/v0/version", apiUrl), { method: "POST" })); + try { + kuboProcess = await startKuboNode(apiUrl, gatewayUrl, dataPath); + await waitForOkResponse(() => fetch(new URL("/api/v0/version", apiUrl), { method: "POST" })); + return; + } catch (error) { + lastError = error; + const message = error instanceof Error ? error.message : String(error); + // On rejection startKuboNode never leaks a live process (it either never spawned + // or already exited), so there's nothing to kill before the next attempt. + kuboProcess = undefined; + if (!/already in use/i.test(message) || attempt === maxAttempts) throw error; + } + } + // Unreachable: the loop either returns on success or throws. Keeps lastError referenced. + throw lastError; }); afterAll(async () => {