Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 12 additions & 2 deletions src/cli/commands/daemon.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ import { printBanner } from "../ascii-banner.js";
import { loadChallengesIntoPKC, formatChallengeNameVersion } from "../../challenge-packages/challenge-utils.js";
import { migrateDataDirectory } from "../../common-utils/data-migration.js";
import { createBsoResolvers, DEFAULT_PROVIDERS } from "../../common-utils/resolvers.js";
import { pruneStaleStates, writeDaemonState, deleteDaemonState } from "../../common-utils/daemon-state.js";
import { pruneStaleStates, writeDaemonState, deleteDaemonState, DAEMON_SHUTDOWN_TIMEOUT_MS } from "../../common-utils/daemon-state.js";
import { createDaemonFileLogger, type DaemonFileLogger } from "../../common-utils/daemon-file-logger.js";
import fs from "fs";
import fsPromise from "fs/promises";
Expand Down Expand Up @@ -538,6 +538,16 @@ export default class Daemon extends Command {
};

const killKuboProcess = async () => {
// Test hook (issue #70): hold off the kubo teardown for a fixed delay so a test can
// deterministically reproduce the window where the daemon's RPC port is already free
// (daemonServer.destroy() runs in parallel) but kubo is still alive and bound. This is
// exactly the window `update install` must not restart into — see
// test/cli/update-install-restart-race.test.ts. The daemon process stays alive for the
// duration because the exit hook awaits killKuboProcess() before exiting.
const kuboShutdownDelayRaw = process.env["PKC_CLI_TEST_KUBO_SHUTDOWN_DELAY_MS"];
const kuboShutdownDelay = kuboShutdownDelayRaw ? Number(kuboShutdownDelayRaw) : 0;
if (Number.isFinite(kuboShutdownDelay) && kuboShutdownDelay > 0)
await new Promise((resolve) => setTimeout(resolve, kuboShutdownDelay));
// Wait (bounded) for any in-flight start attempt so we kill the kubo it may still
// spawn. Both promises settle on all failure paths (issue #70), but a spawned kubo
// that wedges before "Daemon is ready" without exiting keeps them pending — the
Expand Down Expand Up @@ -611,7 +621,7 @@ export default class Daemon extends Command {

await kuboKillPromise;
},
{ wait: 120000 } // could take two minutes to shut down
{ wait: DAEMON_SHUTDOWN_TIMEOUT_MS } // could take two minutes to shut down
);

// Emergency cleanup: if the process force-exits (e.g. double Ctrl+C),
Expand Down
47 changes: 38 additions & 9 deletions src/cli/commands/update/install.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ import tcpPortUsed from "tcp-port-used";
import { fetchLatestVersion, installGlobal } from "../../../update/npm-registry.js";
import { fastInstallGlobal } from "../../../update/fast-update.js";
import { compareVersions } from "../../../update/semver.js";
import { getAliveDaemonStates, type DaemonState } from "../../../common-utils/daemon-state.js";
import { getAliveDaemonStates, DAEMON_SHUTDOWN_TIMEOUT_MS, type DaemonState } from "../../../common-utils/daemon-state.js";

export default class Install extends Command {
static override description = "Install a specific version of bitsocial from npm";
Expand Down Expand Up @@ -65,15 +65,20 @@ export default class Install extends Command {
}
}

// Wait for all daemon ports to be free
// Wait for each daemon process to fully exit — NOT just for its RPC port to free.
// The daemon releases its RPC port (daemonServer.destroy()) before it finishes killing
// its kubo child, so a port-only wait lets us restart while the old kubo still holds the
// IPFS API port; the new daemon then dies on startup with "IPFS API port already in use"
// (issue #70). The daemon's exit hook kills kubo before the process exits, so waiting for
// the PID to disappear guarantees the kubo port is free before we restart.
for (const d of aliveDaemons) {
const url = new URL(d.pkcRpcUrl);
const port = Number(url.port);
const host = url.hostname;
this.log(`Waiting for port ${port} to be free...`);
const freed = await tcpPortUsed.waitUntilFree(port, 500, 30000).then(() => true).catch(() => false);
if (!freed) {
this.error(`Daemon (PID ${d.pid}) did not shut down within 30 seconds on port ${port}.`, { exit: 1 });
this.log(`Waiting for daemon (PID ${d.pid}) to exit...`);
const exited = await this._waitForProcessExit(d.pid, DAEMON_SHUTDOWN_TIMEOUT_MS);
if (!exited) {
this.error(
`Daemon (PID ${d.pid}) did not shut down within ${DAEMON_SHUTDOWN_TIMEOUT_MS / 1000} seconds.`,
{ exit: 1 }
);
}
}
this.log("All daemons stopped.");
Expand Down Expand Up @@ -134,6 +139,30 @@ export default class Install extends Command {
}
}

/**
* Poll until the given PID no longer exists (signal 0 throws ESRCH), or the timeout elapses.
* Returns true if the process exited, false on timeout. EPERM means the process is still alive
* but owned by another user, so we keep waiting.
*/
private async _waitForProcessExit(pid: number, timeoutMs: number): Promise<boolean> {
const deadline = Date.now() + timeoutMs;
while (Date.now() < deadline) {
try {
process.kill(pid, 0);
} catch (e) {
if ((e as NodeJS.ErrnoException).code === "ESRCH") return true; // no such process — it exited
}
await new Promise((resolve) => setTimeout(resolve, 250));
}
// Final check so a process that exits in the last interval isn't reported as a timeout
try {
process.kill(pid, 0);
} catch (e) {
if ((e as NodeJS.ErrnoException).code === "ESRCH") return true;
}
return false;
}

private async _restartDaemons(daemons: DaemonState[]): Promise<void> {
this.log(`Restarting ${daemons.length} daemon(s)...`);

Expand Down
8 changes: 8 additions & 0 deletions src/common-utils/daemon-state.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,14 @@ const execFileAsync = promisify(execFile);

const DAEMON_STATES_DIR = path.join(defaults.PKC_DATA_PATH, ".daemon_states");

/**
* Maximum time a daemon is allowed to shut down its kubo + RPC server during its
* async exit hook. The `update install --restart-daemons` orchestrator must wait at
* least this long for a stopped daemon's PID to disappear before giving up — otherwise
* a slow-but-valid shutdown (within the daemon's own contract) aborts the update midway.
*/
export const DAEMON_SHUTDOWN_TIMEOUT_MS = 120000;

export interface DaemonState {
pid: number;
startedAt: string;
Expand Down
185 changes: 185 additions & 0 deletions test/cli/update-install-restart-race.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,185 @@
// Reproduces the `bitsocial update install` restart race (issue #70).
//
// `update install` stops the running daemon, then restarts it with the new binary. The bug:
// it only waits for the daemon's *RPC port* to free before restarting. But the RPC port is
// released by daemonServer.destroy() — which happens *before* the daemon finishes killing its
// kubo child. So the new daemon can be spawned while the old kubo still holds the IPFS API
// port, and it dies on startup with "Cannot start IPFS daemon because the IPFS API port ...
// is already in use" — port 9138 never comes up. This is exactly what was observed in prod
// after updating in place.
//
// The fix makes `update install` wait for the old daemon's PID to actually exit before
// restarting. The daemon's exit hook kills kubo before the process exits, so "PID gone"
// guarantees the kubo API port is free.
//
// How this test pins the race down deterministically:
// * PKC_CLI_TEST_KUBO_SHUTDOWN_DELAY_MS makes the old daemon hold its kubo alive (and its
// process alive) for a fixed window after SIGINT, while the RPC port frees immediately.
// * A `bitsocial` PATH shim is what `update install` spawns for the restart. It records
// whether the kubo API port is still in use *at the moment of restart*, then exec's the
// real daemon. That marker — not the eventual daemon state, which self-heals via the
// watchdog — is the red/green discriminator.
//
// Isolation: the daemon-state directory `update install` enumerates lives under env-paths'
// data dir, which is derived from HOME on every platform (XDG_DATA_HOME only on Linux). We
// override HOME (and XDG_DATA_HOME) for the daemon and the install command so this test sees
// only its own daemon and never stops/restarts other tests' daemons running in parallel.

import { spawn } from "child_process";
import { describe, it, expect, afterEach } from "vitest";
import { directory as randomDirectory } from "tempy";
import { readFileSync } from "fs";
import fs from "fs/promises";
import path from "path";
import {
stopPkcDaemon,
startPkcDaemon,
ensureKuboNodeStopped,
waitForKuboReady,
type ManagedChildProcess
} from "../helpers/daemon-helpers.js";

// Ports unique to this file (avoid collisions with other test files and external processes).
const RPC_PORT = 9468;
const KUBO_PORT = 50121;
const GATEWAY_PORT = 6581;
const RPC_URL = `ws://localhost:${RPC_PORT}`;
const KUBO_URL = `http://0.0.0.0:${KUBO_PORT}/api/v0`;
const KUBO_API_URL = `http://localhost:${KUBO_PORT}/api/v0`;
const GATEWAY_URL = `http://0.0.0.0:${GATEWAY_PORT}`;

const CLI_VERSION = JSON.parse(readFileSync(path.join(process.cwd(), "package.json"), "utf-8")).version as string;

// How long the old daemon holds kubo alive after SIGINT. Long enough that, with the buggy
// (RPC-port-only) wait, the restart is guaranteed to land inside the window.
const KUBO_SHUTDOWN_DELAY_MS = 8000;

const runUpdateInstall = (env: Record<string, string>): Promise<{ exitCode: number | null; stdout: string; stderr: string }> => {
return new Promise((resolve, reject) => {
const proc = spawn("node", ["./bin/run", "update", "install", CLI_VERSION], {
stdio: ["ignore", "pipe", "pipe"],
env: { ...process.env, ...env }
});
let stdout = "";
let stderr = "";
proc.stdout.on("data", (d) => (stdout += d.toString()));
proc.stderr.on("data", (d) => (stderr += d.toString()));
proc.on("error", reject);
proc.on("close", (exitCode) => resolve({ exitCode, stdout, stderr }));
});
};

describe("bitsocial update install restart race (issue #70)", async () => {
let daemonA: ManagedChildProcess | undefined;
let restartedPidFile: string | undefined;

afterEach(async () => {
if (daemonA) await stopPkcDaemon(daemonA);
daemonA = undefined;
// The restarted daemon (B) is detached from update install — the shim recorded its PID.
if (restartedPidFile) {
const pids = (await fs.readFile(restartedPidFile, "utf-8").catch(() => ""))
.split("\n")
.map((s) => Number(s.trim()))
.filter((n) => Number.isInteger(n) && n > 0);
for (const pid of pids) {
try {
process.kill(pid, "SIGINT");
} catch {
/* already gone */
}
}
}
restartedPidFile = undefined;
await ensureKuboNodeStopped(KUBO_API_URL);
});

it.skipIf(process.platform === "win32")(
"does not restart the daemon until the old kubo has released the IPFS API port",
{ timeout: 120000 },
async () => {
// Isolate the daemon-state directory cross-platform (see file header).
const isolatedHome = randomDirectory();
const tmpBin = randomDirectory();
const markerFile = path.join(randomDirectory(), "restart-kubo-port.marker");
const pidFile = path.join(path.dirname(markerFile), "restart-daemon.pids");
restartedPidFile = pidFile;
const repoRoot = process.cwd();

// The check script the shim runs: record whether the kubo API port is still bound
// at the instant update install spawns the restart.
const checkScript = path.join(tmpBin, "record-kubo-port.cjs");
await fs.writeFile(
checkScript,
[
`const net = require("net");`,
`const fs = require("fs");`,
`const port = Number(new URL(process.env.KUBO_RPC_URL).port);`,
`const marker = process.env.PKC_CLI_TEST_RESTART_MARKER;`,
`const s = net.connect(port, "127.0.0.1");`,
`s.on("connect", () => { fs.appendFileSync(marker, "inuse\\n"); s.destroy(); process.exit(0); });`,
`s.on("error", () => { fs.appendFileSync(marker, "free\\n"); process.exit(0); });`,
`setTimeout(() => { try { s.destroy(); } catch {} process.exit(0); }, 3000);`
].join("\n")
);

// The `bitsocial` shim update install will spawn for the restart: record the kubo port
// state, record this process's PID (it becomes the real daemon via exec), then become
// the real daemon so it actually comes back up.
const shim = path.join(tmpBin, "bitsocial");
await fs.writeFile(
shim,
[
`#!/bin/sh`,
`node "${checkScript}"`,
`echo "$$" >> "${pidFile}"`,
`exec node "${repoRoot}/bin/run" "$@"`
].join("\n") + "\n"
);
await fs.chmod(shim, 0o755);

const sharedEnv = {
HOME: isolatedHome,
XDG_DATA_HOME: path.join(isolatedHome, ".local", "share"),
KUBO_RPC_URL: KUBO_URL,
IPFS_GATEWAY_URL: GATEWAY_URL
};

await ensureKuboNodeStopped(KUBO_API_URL);

// Start daemon A — a real daemon with a real kubo, writing its state file into the
// isolated home so update install discovers only this daemon.
daemonA = await startPkcDaemon(["--pkcRpcUrl", RPC_URL], {
...sharedEnv,
PKC_CLI_TEST_KUBO_SHUTDOWN_DELAY_MS: String(KUBO_SHUTDOWN_DELAY_MS)
});
expect(typeof daemonA.pid).toBe("number");
expect(await waitForKuboReady(KUBO_API_URL, 45000)).toBe(true);

// Run the real `bitsocial update install <currentVersion>`: same version => skips npm,
// but runs the full stop + _restartDaemons path. The shim (first on PATH) is what it
// spawns for the restart.
const result = await runUpdateInstall({
...sharedEnv,
PATH: `${tmpBin}:${process.env.PATH}`,
PKC_CLI_TEST_RESTART_MARKER: markerFile
});

// The command itself must have succeeded — a non-zero exit means the restart path
// failed even if the marker observations happen to look right.
expect(result.exitCode, `update install output:\n${result.stdout}\n${result.stderr}`).toBe(0);

const marker = await fs.readFile(markerFile, "utf-8").catch(() => "");
const observations = marker.trim().split("\n").filter(Boolean);

// Sanity: exactly one daemon was discovered and restarted (the shim ran once). If this
// is >1 the state dir wasn't isolated and we picked up another test's daemon.
expect(observations.length, `update install output:\n${result.stdout}\n${result.stderr}`).toBe(1);

// The fix's contract: when update install restarts the daemon, the old kubo must already
// be gone. With the bug (wait for RPC port only) this is "inuse" => the restarted daemon
// collides on the IPFS API port.
expect(observations[0]).toBe("free");
}
Comment thread
coderabbitai[bot] marked this conversation as resolved.
);
});
9 changes: 8 additions & 1 deletion test/common-utils/daemon-state.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,14 @@ describe("daemon-state", () => {
// and on the host PID 8 belongs to a kernel thread — an alive but unrelated process. The
// bare `process.kill(pid, 0)` liveness check passed, so `update install` SIGINT'd the
// unrelated process and restarted the daemon twice on the same port.
describe("getAliveDaemonStates — PID reused by an unrelated process", () => {
//
// Skipped on Windows: the PID-reuse scenario (issue #66) is a Docker-on-Linux problem
// (a container PID colliding with a host kernel thread), and the identity check that
// detects it relies on Unix process introspection (/proc, `ps`) plus Unix tooling
// (`sleep`, `bash`). On Windows the identity is undeterminable, so the code intentionally
// degrades to liveness-only — the conservative, safe fallback. There is nothing
// Windows-specific to assert here.
describe.skipIf(process.platform === "win32")("getAliveDaemonStates — PID reused by an unrelated process", () => {
const DAEMON_STATES_DIR = path.join(defaults.PKC_DATA_PATH, ".daemon_states");
let child: ChildProcess | undefined;

Expand Down
48 changes: 35 additions & 13 deletions test/kubo/kuboRpcGateway.integration.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -176,20 +176,42 @@ describe("kubo RPC + gateway integration", { timeout: 120_000 }, () => {
let ipfsRepoPath: string;

beforeAll(async () => {
const dataPath = tempDirectory();
ipfsRepoPath = path.join(dataPath, "ipfs-repo");
process.env.IPFS_PATH = ipfsRepoPath;

const apiPort = await getAvailablePort();
const gatewayPort = await getAvailablePort();
apiUrl = new URL(`http://127.0.0.1:${apiPort}`);
gatewayUrl = new URL(`http://127.0.0.1:${gatewayPort}`);

await preInitKuboWithEphemeralSwarm(ipfsRepoPath, apiUrl, gatewayUrl);
// kubo binds the API/gateway ports itself, so there's an unavoidable window between
// getAvailablePort() closing its probe socket and kubo binding the port where another
// process can claim it. The collision surfaces either as a pre-spawn port check throw
// ("...is already in use.") or as kubo losing the gateway bind on startup and exiting
// prematurely ("serveHTTPGateway: ... address already in use"). Both reject the awaited
// startKuboNode() promise with a message containing "already in use", so retry on that
// with a fresh repo + fresh ports. Anything else is a real failure and is re-thrown.
const maxAttempts = 4;
let lastError: unknown;
for (let attempt = 1; attempt <= maxAttempts; attempt++) {
const dataPath = tempDirectory();
ipfsRepoPath = path.join(dataPath, "ipfs-repo");
process.env.IPFS_PATH = ipfsRepoPath;

const apiPort = await getAvailablePort();
const gatewayPort = await getAvailablePort();
apiUrl = new URL(`http://127.0.0.1:${apiPort}`);
gatewayUrl = new URL(`http://127.0.0.1:${gatewayPort}`);

await preInitKuboWithEphemeralSwarm(ipfsRepoPath, apiUrl, gatewayUrl);

kuboProcess = await startKuboNode(apiUrl, gatewayUrl, dataPath);

await waitForOkResponse(() => fetch(new URL("/api/v0/version", apiUrl), { method: "POST" }));
try {
kuboProcess = await startKuboNode(apiUrl, gatewayUrl, dataPath);
await waitForOkResponse(() => fetch(new URL("/api/v0/version", apiUrl), { method: "POST" }));
return;
} catch (error) {
lastError = error;
const message = error instanceof Error ? error.message : String(error);
// On rejection startKuboNode never leaks a live process (it either never spawned
// or already exited), so there's nothing to kill before the next attempt.
kuboProcess = undefined;
if (!/already in use/i.test(message) || attempt === maxAttempts) throw error;
}
}
// Unreachable: the loop either returns on success or throws. Keeps lastError referenced.
throw lastError;
});

afterAll(async () => {
Expand Down
Loading