From db9ddbda923506af2ef6b984b76f1cc65acd5722 Mon Sep 17 00:00:00 2001 From: Rhuan Barreto Date: Thu, 16 Apr 2026 08:04:22 +0200 Subject: [PATCH 1/2] perf(cli): remove 3s exit tail and trim startup overhead MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On Windows (and anywhere the runtime honours timer handles), every command that exits naturally through `main()` returning — i.e. every command except `check` and `upgrade` — used to linger ~3 seconds after its output appeared. Root causes + fixes below. ## Timer leaks (Phase 1) `flushTelemetry`, `gitCredentialFill`, the WSL fallback in `resolveCommand`, and the per-rule timeout in the check runner all race a spawn/shutdown against a `setTimeout`/`Bun.sleep` that's never cancelled when the race winner resolves. The stray timer keeps the event loop alive for its full duration. Each site now captures the timer id and clears it in a `.finally`. `flushSentry` wraps `Sentry.flush` with the same cancellable race pattern as defence-in-depth against leaks inside the SDK. `main()` now finishes with an explicit `process.exit(0)` after flushes complete so a future leaked handle in a third-party SDK can't regress this class of bug. ## Lazy SDK loading (Phase 2 #6) `posthog-node` and `@sentry/node-core/light` are now loaded via dynamic `import()` inside `initTelemetry` / `initSentry`. The `ARCHGATE_TELEMETRY=0` path skips the parse and init cost entirely. Both inits run concurrently via `Promise.all` in `cli.ts`. ## Startup trimming (Phase 2) - `installGit` short-circuits on `Bun.which("git")` before falling back to the async cross-env resolver (avoids a WSL subprocess on Windows in the 99% happy path). - `getRepoContext` now fires all four git probes in parallel instead of gating on `rev-parse --is-inside-work-tree`, saving one serial spawn on git repos. ## Check hot path (Phase 3) - `getGitTrackedFiles` is cached per project root — `resolveScopedFiles` used to spawn `git ls-files` once per ADR (16 spawns in this repo). - `ensureRulesShim` compares existing `rules.d.ts` content and skips the write when unchanged. - New `parseAllAdrs(projectRoot)` caches the readdir + parse of every ADR once per process. `loadRuleAdrs`, `loadAllAdrs` (in context.ts), and `adr list` now share that cache, so `review-context --run-checks` no longer reads the ADR directory twice. - `adr list` uses the shared cache — one readdir instead of two. - `buildSummary` is built once in `check.ts` and passed into every reporter + `getExitCode` instead of being recomputed three times. ## Telemetry payload (Phase 4) - `getCommonProperties` now splits into a cached "static" snapshot (platform, install method, CI, locale) and a fresh "dynamic" overlay (project context, repo snapshot). Cuts redundant `getPlatformInfo` / `detectInstallMethod` / `Intl.DateTimeFormat` calls when multiple events are emitted per command. - The opportunistic update check at startup now uses a 5s fetch timeout (down from 15s) so a slow network never extends exit time. The explicit `archgate upgrade` path keeps the 15s default. ## Measured impact (Windows, this repo) | Command | Before | After | |------------------|---------|---------| | adr list | 3.60 s | 0.82 s | | adr show | 3.60 s | 0.72 s | | doctor | 3.90 s | 1.06 s | | telemetry status | 3.70 s | 0.72 s | | check | 0.88 s | 0.85 s | | adr list (tel off)| 0.59 s | 0.42 s | | doctor (tel off) | 3.50 s | 0.80 s | ## Not changed The `await initTelemetry()` in `main()` — PR #211 added it so `command_executed` / `command_completed` carry `repo_id`. Keeping the await; lazy-loading plus parallel init keeps the added startup cost bounded. ## Validation `bun run validate` passes (lint + typecheck + format + 662 tests + 22/22 ADR rules + build check). --- src/cli.ts | 16 ++++-- src/commands/adr/list.ts | 31 +++-------- src/commands/check.ts | 14 +++-- src/engine/context.ts | 37 +++--------- src/engine/git-files.ts | 48 +++++++++++----- src/engine/loader.ts | 96 ++++++++++++++++++++++---------- src/engine/reporter.ts | 40 +++++++++---- src/engine/runner.ts | 9 ++- src/helpers/binary-upgrade.ts | 12 +++- src/helpers/credential-store.ts | 17 ++++-- src/helpers/git.ts | 19 ++++++- src/helpers/platform.ts | 9 ++- src/helpers/repo.ts | 56 ++++++++++--------- src/helpers/rules-shim.ts | 27 +++++++-- src/helpers/sentry.ts | 33 ++++++++--- src/helpers/telemetry.ts | 99 +++++++++++++++++++++++---------- src/helpers/update-check.ts | 5 +- 17 files changed, 376 insertions(+), 192 deletions(-) diff --git a/src/cli.ts b/src/cli.ts index 3fc387a..d0a053d 100755 --- a/src/cli.ts +++ b/src/cli.ts @@ -63,12 +63,15 @@ async function main() { await installGit(); // Initialize error tracking and telemetry (no-ops if opted out). + // + // Both SDKs are lazy-loaded via dynamic `import()` inside these functions, + // so the `ARCHGATE_TELEMETRY=0` path skips the parse/init cost entirely. + // // Await telemetry so the repo context is resolved before the preAction // hook fires `command_executed` — otherwise that event always lands - // without `repo_id`. The repo lookup is a few cached git subprocesses - // (~5–20ms on a cold run) and runs exactly once per invocation. - initSentry(); - await initTelemetry(); + // without `repo_id` (see PR #211). The two init calls run concurrently, + // so the wall-clock cost is bounded by whichever is slowest. + await Promise.all([initSentry(), initTelemetry()]); const logLevelOption = new Option("--log-level ", "Set log verbosity") .choices(["error", "warn", "info", "debug"] as const) @@ -137,6 +140,11 @@ async function main() { // Flush telemetry and error tracking before exit await Promise.all([flushTelemetry(), flushSentry()]); + + // Belt-and-braces: force exit so any stray handle left by a third-party + // SDK (posthog-node, @sentry/node-core, etc.) can't linger and make the + // CLI feel laggy. All flushes above have already completed. + process.exit(0); } /** diff --git a/src/commands/adr/list.ts b/src/commands/adr/list.ts index a4ed3b0..7980cca 100644 --- a/src/commands/adr/list.ts +++ b/src/commands/adr/list.ts @@ -1,30 +1,14 @@ -import { existsSync, readdirSync } from "node:fs"; -import { join } from "node:path"; +import { existsSync } from "node:fs"; import { styleText } from "node:util"; import type { Command } from "@commander-js/extra-typings"; -import { parseAdr, type AdrDocument } from "../../formats/adr"; +import { parseAllAdrs } from "../../engine/loader"; import { exitWith } from "../../helpers/exit"; import { logError } from "../../helpers/log"; import { formatJSON, isAgentContext } from "../../helpers/output"; import { findProjectRoot, projectPaths } from "../../helpers/paths"; -async function loadAdrs(adrsDir: string): Promise { - const files = readdirSync(adrsDir).filter((f) => f.endsWith(".md")); - const results = await Promise.all( - files.map(async (file) => { - try { - const content = await Bun.file(join(adrsDir, file)).text(); - return parseAdr(content, file); - } catch { - return null; - } - }) - ); - return results.filter((r): r is AdrDocument => r !== null); -} - export function registerAdrListCommand(adr: Command) { adr .command("list") @@ -47,17 +31,16 @@ export function registerAdrListCommand(adr: Command) { return; } - const files = readdirSync(paths.adrsDir).filter((f) => - f.endsWith(".md") - ); + // parseAllAdrs is cached per-process and shared with the check / + // review-context engines, so we don't need a separate readdir pass + // to bail early on empty dirs. + const adrs = (await parseAllAdrs(projectRoot)).map((e) => e.adr); - if (files.length === 0) { + if (adrs.length === 0) { console.log("No ADRs found."); return; } - const adrs = await loadAdrs(paths.adrsDir); - // Filter by domain if specified const filtered = options.domain ? adrs.filter((a) => a.frontmatter.domain === options.domain) diff --git a/src/commands/check.ts b/src/commands/check.ts index 89a05f7..8ef8459 100644 --- a/src/commands/check.ts +++ b/src/commands/check.ts @@ -101,16 +101,20 @@ export function registerCheckCommand(program: Command) { // Determine output format for telemetry const outputFormat = opts.ci ? "ci" : useJson ? "json" : "console"; + // Build the summary once and share it with the reporters, telemetry, + // and exit-code resolver. Previously each of those built its own + // summary — 3 walks over the same result set. + const summary = buildSummary(result); + if (opts.ci) { - reportCI(result); + reportCI(result, summary); } else if (useJson) { - reportJSON(result, opts.json ? true : undefined); + reportJSON(result, opts.json ? true : undefined, summary); } else { - reportConsole(result, opts.verbose ?? false); + reportConsole(result, opts.verbose ?? false, summary); } // Track aggregate check results (no file paths or violation content) - const summary = buildSummary(result); trackCheckResult({ total_rules: summary.total, passed: summary.passed, @@ -128,7 +132,7 @@ export function registerCheckCommand(program: Command) { check_duration_ms: Math.round(result.totalDurationMs), }); - const exitCode = getExitCode(result); + const exitCode = getExitCode(result, summary); // Only 0, 1, and 2 are emitted by getExitCode() await exitWith(exitCode as 0 | 1 | 2); }); diff --git a/src/engine/context.ts b/src/engine/context.ts index b831fa9..596c90f 100644 --- a/src/engine/context.ts +++ b/src/engine/context.ts @@ -1,11 +1,6 @@ -import { readdirSync } from "node:fs"; -import { join } from "node:path"; - -import { parseAdr } from "../formats/adr"; import type { AdrDocument, AdrDomain } from "../formats/adr"; -import { projectPaths } from "../helpers/paths"; import { getChangedFiles, getStagedFiles } from "./git-files"; -import { loadRuleAdrs } from "./loader"; +import { loadRuleAdrs, parseAllAdrs } from "./loader"; import type { ReportSummary } from "./reporter"; import { buildSummary } from "./reporter"; import { runChecks } from "./runner"; @@ -171,30 +166,14 @@ export function matchFilesToAdrs( return results.sort((a, b) => a.domain.localeCompare(b.domain)); } -/** Load all ADR documents (not just those with rules) from the project. */ +/** + * Load all ADR documents (not just those with rules) from the project. + * Shares the per-process parse cache with `loadRuleAdrs` so + * `review-context --run-checks` only reads the ADR directory once. + */ async function loadAllAdrs(projectRoot: string): Promise { - const pp = projectPaths(projectRoot); - - let files: string[]; - try { - files = readdirSync(pp.adrsDir).filter((f) => f.endsWith(".md")); - } catch { - return []; - } - - const results = await Promise.all( - files.map(async (file) => { - try { - const filePath = join(pp.adrsDir, file); - const content = await Bun.file(filePath).text(); - return parseAdr(content, filePath); - } catch { - return null; - } - }) - ); - - return results.filter((adr): adr is AdrDocument => adr !== null); + const parsed = await parseAllAdrs(projectRoot); + return parsed.map((e) => e.adr); } const EMPTY_SUMMARY: ReportSummary = { diff --git a/src/engine/git-files.ts b/src/engine/git-files.ts index 7a30072..a3d50fb 100644 --- a/src/engine/git-files.ts +++ b/src/engine/git-files.ts @@ -21,22 +21,44 @@ async function runGit(args: string[], cwd: string): Promise { return text; } +/** + * Cache of tracked-files lookups per project root. `archgate check` calls + * `resolveScopedFiles` once per ADR — without this cache that's N spawns of + * `git ls-files` (one per ADR) instead of 1, which adds ~25ms × N on Windows. + * The in-memory lifetime matches the process; file changes during a single + * CLI invocation are not expected. + */ +const trackedFilesCache = new Map | null>>(); + /** Get all git-tracked (non-ignored) files in the project. */ -export async function getGitTrackedFiles( +export function getGitTrackedFiles( projectRoot: string ): Promise | null> { - try { - const result = await runGit( - ["ls-files", "--cached", "--others", "--exclude-standard"], - projectRoot - ); - const files = new Set(result.trim().split("\n").filter(Boolean)); - logDebug("Git tracked files:", files.size); - return files; - } catch { - logDebug("Git tracked files lookup failed (not a git repo?)"); - return null; - } + const cached = trackedFilesCache.get(projectRoot); + if (cached) return cached; + + const promise = (async () => { + try { + const result = await runGit( + ["ls-files", "--cached", "--others", "--exclude-standard"], + projectRoot + ); + const files = new Set(result.trim().split("\n").filter(Boolean)); + logDebug("Git tracked files:", files.size); + return files; + } catch { + logDebug("Git tracked files lookup failed (not a git repo?)"); + return null; + } + })(); + + trackedFilesCache.set(projectRoot, promise); + return promise; +} + +/** Reset the tracked-files cache. For testing only. */ +export function _resetGitFilesCache(): void { + trackedFilesCache.clear(); } /** Resolve scoped files for an ADR based on its files globs. Respects .gitignore. */ diff --git a/src/engine/loader.ts b/src/engine/loader.ts index 0310a0b..c61863c 100644 --- a/src/engine/loader.ts +++ b/src/engine/loader.ts @@ -137,6 +137,67 @@ function checkRuleSyntax(source: string): SyntaxViolation[] { return violations; } +export interface ParsedAdrEntry { + file: string; + adr: AdrDocument; +} + +/** + * Process-level cache of `readdir + read + parse` for each project root. + * `archgate review-context --run-checks` used to parse every ADR twice + * (once for briefings, once for rule loading); the cache lets both paths + * share the I/O. `archgate check` + `adr list` benefit too. + * + * Cache lifetime is per-process — consistent with other per-invocation + * caches in this codebase (git ls-files, repo context, install method). + */ +const parsedAdrsCache = new Map>(); + +/** Reset the parsed-ADRs cache. For testing only. */ +export function _resetAdrParseCache(): void { + parsedAdrsCache.clear(); +} + +/** + * Read and parse every ADR markdown file in the project, caching the result + * per-process. Returns entries in directory order. Unparseable files are + * silently skipped (logged at debug level). + */ +export function parseAllAdrs(projectRoot: string): Promise { + const cached = parsedAdrsCache.get(projectRoot); + if (cached) return cached; + + const pp = projectPaths(projectRoot); + const adrsDir = pp.adrsDir; + + const promise = (async () => { + let files: string[]; + try { + files = readdirSync(adrsDir).filter((f) => f.endsWith(".md")); + } catch { + return []; + } + + const parsed = await Promise.all( + files.map(async (file): Promise => { + const filePath = join(adrsDir, file); + try { + const content = await Bun.file(filePath).text(); + return { file, adr: parseAdr(content, filePath) }; + } catch (err) { + logDebug(`Skipping unparseable ADR: ${filePath}`, err); + return null; + } + }) + ); + + return parsed.filter((e): e is ParsedAdrEntry => e !== null); + })(); + + parsedAdrsCache.set(projectRoot, promise); + return promise; +} + /** * Discover ADRs with rules: true and dynamically import their companion .rules.ts files. */ @@ -152,36 +213,15 @@ export async function loadRuleAdrs( const adrsDir = pp.adrsDir; - let files: string[]; - try { - files = readdirSync(adrsDir).filter((f) => f.endsWith(".md")); - } catch { - return []; - } - - // Phase 1: Read and parse all ADR files in parallel - const parsedAdrs = await Promise.all( - files.map(async (file) => { - const filePath = join(adrsDir, file); - try { - const content = await Bun.file(filePath).text(); - return { file, adr: parseAdr(content, filePath) }; - } catch (err) { - logDebug(`Skipping unparseable ADR: ${filePath}`, err); - return null; - } - }) - ); + // Phase 1: Read and parse all ADR files in parallel (cached per process) + const parsedAdrs = await parseAllAdrs(projectRoot); // Filter to ADRs that have rules enabled - const ruleAdrs = parsedAdrs.filter( - (entry): entry is NonNullable => { - if (entry === null) return false; - if (!entry.adr.frontmatter.rules) return false; - if (filterAdrId && entry.adr.frontmatter.id !== filterAdrId) return false; - return true; - } - ); + const ruleAdrs = parsedAdrs.filter((entry) => { + if (!entry.adr.frontmatter.rules) return false; + if (filterAdrId && entry.adr.frontmatter.id !== filterAdrId) return false; + return true; + }); // Phase 2: Verify companion files exist and import rule sets in parallel const ruleResults = await Promise.all( diff --git a/src/engine/reporter.ts b/src/engine/reporter.ts index 3e88b93..26f8370 100644 --- a/src/engine/reporter.ts +++ b/src/engine/reporter.ts @@ -120,10 +120,14 @@ export function buildSummary( /** * Output results in console format with colors. + * Accepts an optional pre-built summary to avoid walking the results twice + * when the caller already computed one (e.g. for telemetry). */ -export function reportConsole(result: CheckResult, verbose: boolean): void { - const summary = buildSummary(result); - +export function reportConsole( + result: CheckResult, + verbose: boolean, + summary: ReportSummary = buildSummary(result) +): void { for (const r of summary.results) { const prefix = `${r.adrId}/${r.ruleId}`; @@ -204,17 +208,21 @@ export function reportConsole(result: CheckResult, verbose: boolean): void { * @param forcePretty - When true, always pretty-print (e.g., explicit --json flag). * When omitted, format is auto-detected based on TTY/CI context. */ -export function reportJSON(result: CheckResult, forcePretty?: boolean): void { - const summary = buildSummary(result); +export function reportJSON( + result: CheckResult, + forcePretty?: boolean, + summary: ReportSummary = buildSummary(result) +): void { console.log(formatJSON(summary, forcePretty)); } /** * Output results as GitHub Actions annotations. */ -export function reportCI(result: CheckResult): void { - const summary = buildSummary(result); - +export function reportCI( + result: CheckResult, + summary: ReportSummary = buildSummary(result) +): void { for (const r of summary.results) { if (r.error) { console.log( @@ -246,9 +254,21 @@ export function reportCI(result: CheckResult): void { /** * Determine the exit code from check results. - * 0 = pass, 1 = violations, 2 = rule execution errors + * 0 = pass, 1 = violations, 2 = rule execution errors. + * + * Prefer calling this with a pre-built summary — `check.ts` already computes + * one for telemetry, and walking `result.results` a second time here is pure + * duplication. */ -export function getExitCode(result: CheckResult): number { +export function getExitCode( + result: CheckResult, + summary?: ReportSummary +): number { + if (summary) { + if (summary.ruleErrors > 0) return 2; + if (summary.failed > 0) return 1; + return 0; + } const hasErrors = result.results.some((r) => r.error); if (hasErrors) return 2; diff --git a/src/engine/runner.ts b/src/engine/runner.ts index 09e8eeb..cb9f90c 100644 --- a/src/engine/runner.ts +++ b/src/engine/runner.ts @@ -239,11 +239,14 @@ export async function runChecks( ); try { + // Cancel the timeout when the rule resolves first — otherwise the + // timer keeps the event loop alive even after checks complete. + let timer: ReturnType | undefined; // oxlint-disable-next-line no-await-in-loop -- rules within an ADR run sequentially await Promise.race([ ruleConfig.check(ctx), new Promise((_, reject) => { - setTimeout( + timer = setTimeout( () => reject( new Error( @@ -253,7 +256,9 @@ export async function runChecks( RULE_TIMEOUT_MS ); }), - ]); + ]).finally(() => { + if (timer) clearTimeout(timer); + }); adrRuleResults.push({ ruleId, diff --git a/src/helpers/binary-upgrade.ts b/src/helpers/binary-upgrade.ts index 50d6cfb..7d63a29 100644 --- a/src/helpers/binary-upgrade.ts +++ b/src/helpers/binary-upgrade.ts @@ -66,12 +66,20 @@ const GITHUB_RELEASES_API = `https://api.github.com/repos/${GITHUB_REPO}/release /** * Fetch the latest version tag from GitHub Releases. * Returns the tag (e.g. "v0.13.1") or null on failure. + * + * @param timeoutMs Request timeout. Use a short value (e.g. 5s) for the + * opportunistic background update check at CLI startup so + * a slow network never delays the user's command. The + * longer default (15s) is reserved for the explicit + * `archgate upgrade` path where the user is waiting for it. */ -export async function fetchLatestGitHubVersion(): Promise { +export async function fetchLatestGitHubVersion( + timeoutMs = 15_000 +): Promise { logDebug("Fetching latest release from:", GITHUB_RELEASES_API); const response = await fetch(GITHUB_RELEASES_API, { headers: { "User-Agent": "archgate-cli" }, - signal: AbortSignal.timeout(15_000), + signal: AbortSignal.timeout(timeoutMs), }); if (!response.ok) { diff --git a/src/helpers/credential-store.ts b/src/helpers/credential-store.ts index 956697d..1b81e1d 100644 --- a/src/helpers/credential-store.ts +++ b/src/helpers/credential-store.ts @@ -84,17 +84,26 @@ async function gitCredentialFill(): Promise<{ env: gitCredentialEnv(), }); + // The timeout MUST be cancelled when the spawn wins the race — + // `Bun.sleep` / `setTimeout` both keep the event loop alive for + // their full duration, which used to add 3s of latency to + // commands that call `loadCredentials()` (e.g. `archgate doctor`). + let timer: ReturnType | undefined; const result = await Promise.race([ (async () => { const stdout = await new Response(proc.stdout).text(); const exitCode = await proc.exited; return { stdout, exitCode }; })(), - Bun.sleep(CREDENTIAL_TIMEOUT_MS).then(() => { - proc.kill(); - return null; + new Promise((resolve) => { + timer = setTimeout(() => { + proc.kill(); + resolve(null); + }, CREDENTIAL_TIMEOUT_MS); }), - ]); + ]).finally(() => { + if (timer) clearTimeout(timer); + }); if (!result || result.exitCode !== 0) return null; diff --git a/src/helpers/git.ts b/src/helpers/git.ts index 273491a..10b7c20 100644 --- a/src/helpers/git.ts +++ b/src/helpers/git.ts @@ -1,11 +1,28 @@ import { logDebug, logInfo } from "./log"; import { isWindows, isMacOS, resolveCommand } from "./platform"; +/** + * Ensure git is installed, installing via brew/apt when missing on Unix. + * + * Fast path: `Bun.which("git")` is a synchronous PATH lookup that short-circuits + * when git is on PATH — which is the 99%+ case for everyone except first-run + * users without git installed. This lets us skip the `resolveCommand` async + * ceremony (and its WSL fallback subprocess on Windows) from the startup path. + */ export async function installGit() { - if (await resolveCommand("git")) { + // Fast path: git on PATH — no subprocess, no await, no WSL fallback. + if (Bun.which("git")) { logDebug("Git is already installed"); return; } + + // Slow path: git wasn't found synchronously. Fall back to the full cross-env + // resolver (handles WSL `.exe` lookups and WSL-from-Windows availability). + if (await resolveCommand("git")) { + logDebug("Git is already installed (cross-env)"); + return; + } + logInfo("Git is not installed. Installing..."); if (isWindows()) { throw new Error( diff --git a/src/helpers/platform.ts b/src/helpers/platform.ts index db2fa5d..5a3734f 100644 --- a/src/helpers/platform.ts +++ b/src/helpers/platform.ts @@ -217,10 +217,15 @@ export async function resolveCommand(name: string): Promise { stdout: "pipe", stderr: "pipe", }); + // Timer MUST be cleared when the spawn wins — a dangling + // `setTimeout` keeps the event loop alive for its full duration. + let timer: ReturnType | undefined; const timeout = new Promise<"timeout">((resolve) => { - setTimeout(() => resolve("timeout"), 3000); + timer = setTimeout(() => resolve("timeout"), 3000); + }); + const result = await Promise.race([proc.exited, timeout]).finally(() => { + if (timer) clearTimeout(timer); }); - const result = await Promise.race([proc.exited, timeout]); if (result === "timeout") { proc.kill(); return null; diff --git a/src/helpers/repo.ts b/src/helpers/repo.ts index 51cf769..d5188aa 100644 --- a/src/helpers/repo.ts +++ b/src/helpers/repo.ts @@ -99,19 +99,25 @@ export async function getRepoContext(): Promise { if (cached) return cached; const cwd = process.cwd(); - const isGit = await runGitCheck(["rev-parse", "--is-inside-work-tree"], cwd); + + // Fire all four git probes concurrently. On Windows each subprocess costs + // ~25ms, so running them in parallel instead of gating on + // `rev-parse --is-inside-work-tree` saves one serial spawn on the happy + // path. In a non-git directory the extras exit non-zero quickly and their + // results are discarded. + const [isGit, remoteUrl, symRef, currentBranch] = await Promise.all([ + runGitCheck(["rev-parse", "--is-inside-work-tree"], cwd), + runGitCapture(["config", "--get", "remote.origin.url"], cwd), + runGitCapture(["symbolic-ref", "--short", "refs/remotes/origin/HEAD"], cwd), + runGitCapture(["rev-parse", "--abbrev-ref", "HEAD"], cwd), + ]); + if (!isGit) { cached = emptyContext(false); return cached; } - // Run the remaining probes concurrently — they're independent and the - // dominant cost on Windows is serial subprocess spawn (~25ms each), not - // the git work itself. - const [remoteUrl, defaultBranch] = await Promise.all([ - runGitCapture(["config", "--get", "remote.origin.url"], cwd), - resolveDefaultBranch(cwd), - ]); + const defaultBranch = pickDefaultBranch(symRef, currentBranch); if (!remoteUrl) { cached = { @@ -139,6 +145,22 @@ export async function getRepoContext(): Promise { return cached; } +/** + * Pick the most informative default-branch signal: prefer the remote HEAD + * symbolic ref (e.g. `origin/main` → `main`), fall back to the currently + * checked-out branch. + */ +function pickDefaultBranch( + symRef: string | null, + currentBranch: string | null +): string | null { + if (symRef) { + const slash = symRef.indexOf("/"); + return slash >= 0 ? symRef.slice(slash + 1) : symRef; + } + return currentBranch; +} + /** * Should the CLI include owner / name / full remote URL in the * `project_initialized` event? @@ -302,24 +324,6 @@ async function runGitCheck(args: string[], cwd: string): Promise { } } -async function resolveDefaultBranch(cwd: string): Promise { - // Fire the preferred lookup (remote HEAD symbolic ref) and the fallback - // (currently-checked-out branch) in parallel. The fallback subprocess is - // cheap and lets us hide its spawn cost behind the other concurrent git - // calls — serial, it was the tail on the "no remote HEAD" path. - const [symRef, currentBranch] = await Promise.all([ - runGitCapture(["symbolic-ref", "--short", "refs/remotes/origin/HEAD"], cwd), - runGitCapture(["rev-parse", "--abbrev-ref", "HEAD"], cwd), - ]); - if (symRef) { - const slash = symRef.indexOf("/"); - return slash >= 0 ? symRef.slice(slash + 1) : symRef; - } - // Fallback: not strictly the "default" branch, but better than null for - // a single-user local repo. - return currentBranch; -} - // --------------------------------------------------------------------------- // Internals // --------------------------------------------------------------------------- diff --git a/src/helpers/rules-shim.ts b/src/helpers/rules-shim.ts index fb50b43..2deb026 100644 --- a/src/helpers/rules-shim.ts +++ b/src/helpers/rules-shim.ts @@ -109,8 +109,8 @@ export default { } /** - * Write rules.d.ts to the .archgate/ directory. - * Always overwrites — idempotent. + * Write rules.d.ts to the .archgate/ directory, unconditionally. + * Use this when you want to guarantee a fresh write (e.g. init). */ export async function writeRulesShim(projectRoot: string): Promise { const dtsPath = projectPath(projectRoot, "rules.d.ts"); @@ -119,7 +119,24 @@ export async function writeRulesShim(projectRoot: string): Promise { } /** - * Alias for writeRulesShim — always overwrites to ensure freshness - * when the CLI version changes. + * Ensure rules.d.ts exists and is up-to-date. Skips the disk write when the + * on-disk content already matches — `archgate check` calls this every run, + * and the content only changes when the CLI version changes. */ -export const ensureRulesShim = writeRulesShim; +export async function ensureRulesShim(projectRoot: string): Promise { + const dtsPath = projectPath(projectRoot, "rules.d.ts"); + const expected = generateRulesDts(); + + try { + const existing = await Bun.file(dtsPath).text(); + if (existing === expected) { + logDebug("Rules type definitions up-to-date:", dtsPath); + return; + } + } catch { + // File missing or unreadable — fall through to the write. + } + + await Bun.write(dtsPath, expected); + logDebug("Rules type definitions written:", dtsPath); +} diff --git a/src/helpers/sentry.ts b/src/helpers/sentry.ts index 627203b..eabba7e 100644 --- a/src/helpers/sentry.ts +++ b/src/helpers/sentry.ts @@ -13,7 +13,7 @@ * are wrapped to never affect CLI behavior or exit codes. */ -import * as Sentry from "@sentry/node-core/light"; +import type * as SentryNs from "@sentry/node-core/light"; import packageJson from "../../package.json"; import { detectInstallMethod } from "./install-info"; @@ -51,6 +51,7 @@ function getArchgatePath(): string { // --------------------------------------------------------------------------- let initialized = false; +let Sentry: typeof SentryNs | null = null; // --------------------------------------------------------------------------- // Public API @@ -59,8 +60,11 @@ let initialized = false; /** * Initialize Sentry error tracking. Call once at CLI startup. * No-op if telemetry is disabled. + * + * Returns a promise so the Sentry SDK can be lazy-loaded — the ~600KB module + * is skipped entirely when `ARCHGATE_TELEMETRY=0`, shaving cold-start time. */ -export function initSentry(): void { +export async function initSentry(): Promise { if (!isTelemetryEnabled()) { logDebug("Sentry disabled — telemetry is off"); return; @@ -70,6 +74,8 @@ export function initSentry(): void { const { runtime } = getPlatformInfo(); try { + // Lazy-load so the disabled-telemetry path never pays the parse cost. + Sentry = await import("@sentry/node-core/light"); Sentry.init({ dsn: SENTRY_DSN, release: cliVersion, @@ -146,7 +152,7 @@ export function addBreadcrumb( data?: Record, level: "debug" | "info" | "warning" | "error" = "info" ): void { - if (!initialized) return; + if (!initialized || !Sentry) return; try { Sentry.addBreadcrumb({ @@ -171,7 +177,7 @@ export function captureException( error: unknown, context?: Record ): void { - if (!initialized) return; + if (!initialized || !Sentry) return; try { Sentry.captureException(error, { contexts: { cli: context } }); @@ -187,10 +193,22 @@ export function captureException( * @param timeoutMs Maximum time to wait for flush (default: 2000ms) */ export async function flushSentry(timeoutMs = 2000): Promise { - if (!initialized) return; + if (!initialized || !Sentry) return; try { - await Sentry.flush(timeoutMs); + // Defense-in-depth: race Sentry.flush against our own cancellable timer + // so a stuck flush can never keep the event loop alive past `timeoutMs`. + // Sentry manages its own internal timers, but this guards against any + // leaked socket / handle in the SDK. + let timer: ReturnType | undefined; + await Promise.race([ + Sentry.flush(timeoutMs), + new Promise((resolve) => { + timer = setTimeout(resolve, timeoutMs + 100); + }), + ]).finally(() => { + if (timer) clearTimeout(timer); + }); logDebug("Sentry flushed"); } catch { logDebug("Sentry flush failed (silently ignored)"); @@ -203,8 +221,9 @@ export async function flushSentry(timeoutMs = 2000): Promise { /** Reset Sentry state. For testing only. */ export function _resetSentry(): void { - if (initialized) { + if (initialized && Sentry) { Sentry.close(); } initialized = false; + Sentry = null; } diff --git a/src/helpers/telemetry.ts b/src/helpers/telemetry.ts index 64220e1..2c2acfd 100644 --- a/src/helpers/telemetry.ts +++ b/src/helpers/telemetry.ts @@ -14,7 +14,7 @@ import { basename } from "node:path"; -import { PostHog } from "posthog-node"; +import type { PostHog } from "posthog-node"; import packageJson from "../../package.json"; import { detectInstallMethod, getProjectContext } from "./install-info"; @@ -92,15 +92,28 @@ function detectLocale(): string | null { } // --------------------------------------------------------------------------- -// Shared properties (recomputed per event for freshness) +// Shared properties // --------------------------------------------------------------------------- -function getCommonProperties(): Record { - const { runtime, isWSL } = getPlatformInfo(); - const ctx = getProjectContext(); - const repo = repoContextSnapshot; +/** + * Cache of the slow-changing portion of the common event properties. Platform + * detection, install-method detection, CI detection, and locale resolution + * are all effectively constant for the lifetime of a CLI invocation, so we + * compute them once and reuse across events. + * + * The project context and repo snapshot are intentionally NOT cached here: + * - project context changes when `archgate init` creates the directory + * mid-command, so we always re-read it. + * - repo context lives in `repoContextSnapshot` which is written by + * `initTelemetry`. + */ +let staticPropertiesSnapshot: Record | null = null; - return { +function getStaticProperties(): Record { + if (staticPropertiesSnapshot) return staticPropertiesSnapshot; + + const { runtime, isWSL } = getPlatformInfo(); + staticPropertiesSnapshot = { // --- CLI / runtime --- cli_version: packageJson.version, os: runtime, @@ -114,7 +127,21 @@ function getCommonProperties(): Record { is_wsl: isWSL, shell: detectShell(), locale: detectLocale(), - // --- Project --- + // --- Geo privacy --- + // Signal PostHog to resolve geo then discard the IP + $ip: null, + }; + return staticPropertiesSnapshot; +} + +function getCommonProperties(): Record { + const ctx = getProjectContext(); + const repo = repoContextSnapshot; + + return { + ...getStaticProperties(), + // --- Project (re-read every event; `archgate init` can mutate this + // mid-invocation, and the read is a single readdirSync) --- has_project: ctx.hasProject, adr_count: ctx.adrCount, adr_with_rules_count: ctx.adrWithRulesCount, @@ -124,9 +151,6 @@ function getCommonProperties(): Record { repo_host: repo?.host ?? null, repo_id: repo?.repoId ?? null, git_default_branch: repo?.defaultBranch ?? null, - // --- Geo privacy --- - // Signal PostHog to resolve geo then discard the IP - $ip: null, }; } @@ -144,15 +168,29 @@ function getCommonProperties(): Record { * event ships without repo identity. The repo lookup runs a handful of git * subprocesses (cached per-process), so the added startup latency is small. */ -export function initTelemetry(): Promise { +export async function initTelemetry(): Promise { if (!isTelemetryEnabled()) { logDebug("Telemetry disabled — skipping init"); - return Promise.resolve(); + return; } distinctId = getInstallId(); + // Kick off the repo-context resolution in parallel with the dynamic SDK + // import. The caller awaits this whole function before `command_executed` + // is emitted (see PR #211) so the snapshot lands before the first event. + const repoContextPromise = getRepoContext() + .then((ctx) => { + repoContextSnapshot = ctx; + }) + .catch((err) => { + logDebug("Repo context resolution failed (ignored):", String(err)); + }); + try { + // Lazy-load the PostHog SDK so the `ARCHGATE_TELEMETRY=0` path never pays + // the module-parse cost (noticeable on cold starts / WSL). + const { PostHog } = await import("posthog-node"); client = new PostHog(POSTHOG_API_KEY, { host: POSTHOG_HOST, // Disable polling for feature flags — we don't use them in the CLI @@ -167,15 +205,7 @@ export function initTelemetry(): Promise { logDebug("Telemetry init failed (silently ignored)"); } - // Resolve the repo context asynchronously. The result lands in module state - // so subsequent events pick it up without blocking the CLI startup path. - return getRepoContext() - .then((ctx) => { - repoContextSnapshot = ctx; - }) - .catch((err) => { - logDebug("Repo context resolution failed (ignored):", String(err)); - }); + await repoContextPromise; } /** @@ -375,13 +405,23 @@ export async function flushTelemetry(timeoutMs = 3000): Promise { try { logDebug("Flushing telemetry events"); - // Race shutdown against a timeout to prevent hanging on exit - await Promise.race([ - client.shutdown(), - new Promise((resolve) => { - setTimeout(resolve, timeoutMs); - }), - ]); + // Race shutdown against a timeout to prevent hanging on exit. + // + // The timeout MUST be cancelled when shutdown wins — a dangling + // `setTimeout` keeps the Bun/Node event loop alive for its full + // duration, which used to add 3s of latency to every command that + // exited via `main()` returning naturally (instead of `process.exit`). + let timer: ReturnType | undefined; + try { + await Promise.race([ + client.shutdown(), + new Promise((resolve) => { + timer = setTimeout(resolve, timeoutMs); + }), + ]); + } finally { + if (timer) clearTimeout(timer); + } logDebug("Telemetry flushed"); } catch { // Silently ignore — telemetry must never affect CLI behavior @@ -402,6 +442,7 @@ export function _resetTelemetry(): void { initialized = false; distinctId = ""; repoContextSnapshot = null; + staticPropertiesSnapshot = null; } /** Get the PostHog client instance. For testing only. */ diff --git a/src/helpers/update-check.ts b/src/helpers/update-check.ts index 8fe1cdb..bcad81a 100644 --- a/src/helpers/update-check.ts +++ b/src/helpers/update-check.ts @@ -33,7 +33,10 @@ export async function checkForUpdatesIfNeeded( logDebug("Checking for updates..."); - const tag = await fetchLatestGitHubVersion(); + // Use a tight 5s timeout for the opportunistic background check so a + // slow network never extends exit time. The full 15s default is + // reserved for the explicit `archgate upgrade` path. + const tag = await fetchLatestGitHubVersion(5_000); if (!tag) { logDebug("Update check failed — could not fetch latest GitHub release"); return null; From 37bc0a9389332e01196221c9cf87d5622e199858 Mon Sep 17 00:00:00 2001 From: Rhuan Barreto Date: Thu, 16 Apr 2026 08:11:07 +0200 Subject: [PATCH 2/2] test(perf): regression guard for the 3s exit tail MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Spawns the real CLI via `bun run src/cli.ts --version` / `--help`, asserts the median of 3 runs stays under 4s. The historical regression (un-cancelled `setTimeout` / `Bun.sleep` in the telemetry / Sentry / credential / WSL flush paths) pushed these commands to ~3.5s on Windows; normal runs sit well under 2s even on slow CI. `NODE_ENV=test` suppresses real event delivery but leaves the telemetry + Sentry SDK init/flush path active, which is where the regression lives. `ARCHGATE_TELEMETRY` is intentionally NOT set to 0 so the test exercises the enabled code path. Threshold chosen to catch the regression (always ≥3000ms) with enough headroom that slow runners don't flake. --- tests/integration/cli-perf.test.ts | 125 +++++++++++++++++++++++++++++ 1 file changed, 125 insertions(+) create mode 100644 tests/integration/cli-perf.test.ts diff --git a/tests/integration/cli-perf.test.ts b/tests/integration/cli-perf.test.ts new file mode 100644 index 0000000..22d2b1b --- /dev/null +++ b/tests/integration/cli-perf.test.ts @@ -0,0 +1,125 @@ +/** + * Performance regression tests — guard against the "un-cancelled timer" + * class of bugs that added a 3-second tail to every command that exits + * naturally through `main()` returning. + * + * See PR #213 for the full list of call sites fixed. The specific + * regression this file protects against is any leaked `setTimeout` / + * `Bun.sleep` that keeps the Bun event loop alive past its intended + * shutdown — most commonly in telemetry / Sentry flush, the git + * credential helper, or the WSL fallback in `resolveCommand`. + * + * Strategy: run short commands end-to-end via the real CLI entry + * and assert wall-clock time stays under a generous threshold. A + * lingering timer immediately pushes the time past the threshold + * (the old tail was ~3s on Windows); genuinely slow runs on cold + * CI still fit well within the budget. + * + * The thresholds are intentionally generous so the test doesn't + * flake on slow runners — they're chosen to catch the *regression* + * (commands lingering 3s+) without failing on normal CI noise. + * Don't tighten these to assert absolute performance targets; do + * that elsewhere if needed. The job of these tests is one thing: + * catch the exit tail returning. + */ + +import { describe, expect, test } from "bun:test"; +import { resolve } from "node:path"; + +const CLI_PATH = resolve(import.meta.dir, "..", "..", "src", "cli.ts"); + +/** + * Ceiling for a trivially-fast command. The historical regression + * pushed this to 3.5–4s. Normal runs sit well under 2s even on slow + * CI. 4000ms catches the regression with plenty of headroom. + */ +const FAST_COMMAND_MAX_MS = 4000; + +/** + * Run the CLI with the given args and return the wall-clock duration. + * `NODE_ENV=test` suppresses actual telemetry event capture so no real + * traffic is sent, but the telemetry / Sentry SDKs still initialize + * and flush — which is exactly the path the timer-leak regression + * lived on. Leaving `ARCHGATE_TELEMETRY` unset means we exercise the + * enabled path; the `_=test` env guard inside `trackEvent` / + * `Sentry.init`'s `enabled` flag prevents real event delivery. + */ +async function timeCli(args: string[]): Promise { + const start = performance.now(); + const proc = Bun.spawn(["bun", "run", CLI_PATH, ...args], { + stdout: "pipe", + stderr: "pipe", + env: { + ...process.env, + NO_COLOR: "1", + NODE_ENV: "test", + // Intentionally NOT disabling telemetry — we want the SDK init + + // flush path to run so any leaked timer shows up as a wall-clock + // regression. The NODE_ENV=test guard suppresses event delivery. + }, + }); + + // Drain streams so the spawn can exit cleanly even if stdout/stderr + // fills a pipe buffer. We don't assert on content here — other tests + // cover correctness. + await Promise.all([ + new Response(proc.stdout).text(), + new Response(proc.stderr).text(), + proc.exited, + ]); + return performance.now() - start; +} + +/** Run a command N times and return the median duration. */ +async function medianDurationMs( + args: string[], + runs: number +): Promise<{ median: number; all: number[] }> { + const samples: number[] = []; + for (let i = 0; i < runs; i++) { + // oxlint-disable-next-line no-await-in-loop -- runs are serial on purpose + samples.push(await timeCli(args)); + } + const sorted = [...samples].sort((a, b) => a - b); + const median = sorted[Math.floor(sorted.length / 2)]; + return { median, all: samples }; +} + +describe("CLI performance — exit tail regression guard", () => { + test( + "`--version` finishes within budget (no leaked exit-path timer)", + async () => { + // 3 runs + median smooths out a single slow cold-start without + // letting a genuine regression slip through. + const { median, all } = await medianDurationMs(["--version"], 3); + if (median >= FAST_COMMAND_MAX_MS) { + // Rich failure message — makes debugging fast when CI flakes. + throw new Error( + `\`archgate --version\` took ${Math.round(median)}ms (median of ${all.map((m) => Math.round(m)).join(", ")}ms). ` + + `Budget is ${FAST_COMMAND_MAX_MS}ms. ` + + `This usually means a new un-cancelled \`setTimeout\` / \`Bun.sleep\` is keeping the event loop alive after the command completes. ` + + `Grep for \`Promise.race\` + \`setTimeout\` and make sure every timer id is captured and \`clearTimeout\`'d in a \`.finally\`.` + ); + } + expect(median).toBeLessThan(FAST_COMMAND_MAX_MS); + }, + // Per-test timeout: allow 4× budget so we report a clean failure + // rather than a timeout if something is badly wrong. + FAST_COMMAND_MAX_MS * 4 + ); + + test( + "`--help` finishes within budget (no leaked exit-path timer)", + async () => { + const { median, all } = await medianDurationMs(["--help"], 3); + if (median >= FAST_COMMAND_MAX_MS) { + throw new Error( + `\`archgate --help\` took ${Math.round(median)}ms (median of ${all.map((m) => Math.round(m)).join(", ")}ms). ` + + `Budget is ${FAST_COMMAND_MAX_MS}ms. See the \`--version\` test failure message for the likely cause.` + ); + } + expect(median).toBeLessThan(FAST_COMMAND_MAX_MS); + }, + FAST_COMMAND_MAX_MS * 4 + ); +});