From b62b6e8e0a43a9cecc56e3b4d950426f45bb1e5d Mon Sep 17 00:00:00 2001 From: Hassan Abdel-Rahman Date: Fri, 15 May 2026 10:26:27 -0400 Subject: [PATCH 1/3] createRealm: enqueue exactly one priority-10 index job MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Threads a skipFromScratchIndex option through reconciler.lookupOrMount → ensureMounted → realm.start → #startup. When set, #startup mounts the realm without enqueuing its own from-scratch-index job. createRealm now: 1. enqueues one from-scratch-index job at userInitiatedPriority 2. lookupOrMount({ skipFromScratchIndex: true }) — mounts without enqueuing a duplicate 3. awaits the priority-10 job's completion before returning Prior behaviour had two enqueue sites (explicit at priority 10, plus the implicit one via realm.start at default priority). They were intended to coalesce via chooseFromScratch keeping maxPriority, but a worker claim landing between the two inserts moved the first job into the in-flight bucket — which the from-scratch coalesce ignores — so the second job survived as a separate priority-0 row that could sit behind any backlog of system-priority indexing work. Tightened realm-lifecycle-test to assert exactly one job exists at userInitiatedPriority (was: "at least one, at least one at p10"). Co-Authored-By: Claude Opus 4.7 (1M context) --- .../realm-server/handlers/create-realm.ts | 41 +++++++++++-------- .../lib/realm-registry-reconciler.ts | 14 +++++-- .../server-endpoints/realm-lifecycle-test.ts | 15 +++---- packages/runtime-common/realm.ts | 13 ++++-- 4 files changed, 51 insertions(+), 32 deletions(-) diff --git a/packages/realm-server/handlers/create-realm.ts b/packages/realm-server/handlers/create-realm.ts index 6d985d340e4..82d1707da6d 100644 --- a/packages/realm-server/handlers/create-realm.ts +++ b/packages/realm-server/handlers/create-realm.ts @@ -219,12 +219,14 @@ export async function createRealm( virtualNetwork.addURLMapping(new URL(url), actualRealmURL); } - // Phase 3: enqueue the from-scratch-index job at userInitiatedPriority - // so the canonical (post-coalesce) job carries that priority — even - // if reconciler.lookupOrMount below also enqueues one at the default - // systemInitiatedPriority via realm.start(). The chooseFromScratch - // coalesce JOINs same-realm jobs and keeps maxPriority. - await enqueueReindexRealmJob( + // Enqueue exactly one from-scratch-index job for this new realm, at + // userInitiatedPriority so a backed-up queue of system-priority jobs + // (e.g. a deploy-triggered reindex storm) does not stall realm + // creation. lookupOrMount is told to skip its own from-scratch + // enqueue via skipFromScratchIndex so this remains the only job — + // independent of whether the from-scratch coalesce would have caught + // a duplicate. + let indexJob = await enqueueReindexRealmJob( url, ownerUsername, queue, @@ -232,23 +234,28 @@ export async function createRealm( userInitiatedPriority, ); - // Synchronously mount + start the realm on the *handling* instance. - // The 202 response with status:'pending' is for sibling instances — - // they pick up the realm via NOTIFY realm_registry and lazy-mount - // on first request. On this instance the realm is fully ready by - // the time we return: ensureMounted publishes into realms[] / - // virtualNetwork via prepareRealmFromRow and awaits realm.start(), - // which awaits the from-scratch-index job. Mounting eagerly here - // also drains the queue locally so the test framework's teardown - // (close server → drain runner → close DB) doesn't race a worker - // mid-fetch on the now-closed HTTP listener. - let realm = await reconciler.lookupOrMount(url); + // Synchronously mount the realm on the *handling* instance. The 202 + // response with status:'pending' is for sibling instances — they + // pick up the realm via NOTIFY realm_registry and lazy-mount on + // first request. Mounting eagerly here also drains the queue + // locally so the test framework's teardown (close server → drain + // runner → close DB) doesn't race a worker mid-fetch on the now- + // closed HTTP listener. + let realm = await reconciler.lookupOrMount(url, { + skipFromScratchIndex: true, + }); if (!realm) { throw new Error( `expected realm ${url} to be mounted after createRealm — registry row missing or mount failed`, ); } + // Wait for the priority-10 job to complete so the realm is fully + // indexed by the time we return — preserving the prior "fully ready + // on this instance" contract without the duplicate-enqueue + // workaround. + await indexJob.done; + return { url, realm, info }; } diff --git a/packages/realm-server/lib/realm-registry-reconciler.ts b/packages/realm-server/lib/realm-registry-reconciler.ts index 96ca413dd40..aef001c985e 100644 --- a/packages/realm-server/lib/realm-registry-reconciler.ts +++ b/packages/realm-server/lib/realm-registry-reconciler.ts @@ -291,7 +291,10 @@ export class RealmRegistryReconciler { // would receive a not-yet-started Realm; routing it through the // in-flight promise instead lets the caller await start() like the // original requester. - async lookupOrMount(url: string): Promise { + async lookupOrMount( + url: string, + opts?: { skipFromScratchIndex?: boolean }, + ): Promise { const inflight = this.pendingMounts.get(url); if (inflight) { return inflight; @@ -308,7 +311,7 @@ export class RealmRegistryReconciler { } this.knownByUrl.set(url, row); } - return this.ensureMounted(row); + return this.ensureMounted(row, opts); } async #lookupRow(url: string): Promise { @@ -347,7 +350,10 @@ export class RealmRegistryReconciler { // rollout safety relies on this signal — Loki/Grafana extract cold- // mount latency, mount failure rate, and pinned-vs-lazy ratios from // these lines. - async ensureMounted(row: RealmRegistryRow): Promise { + async ensureMounted( + row: RealmRegistryRow, + opts?: { skipFromScratchIndex?: boolean }, + ): Promise { // pendingMounts checked before mounted: see lookupOrMount() above. // The Realm is published into mounted synchronously before its // start() promise resolves, so a caller hitting the mounted @@ -386,7 +392,7 @@ export class RealmRegistryReconciler { this.#reconcilerOwned.add(row.url); const promise = (async () => { try { - await realm.start(); + await realm.start(opts); log.info( `mount ok url=%s kind=%s pinned=%s duration_ms=%d`, row.url, diff --git a/packages/realm-server/tests/server-endpoints/realm-lifecycle-test.ts b/packages/realm-server/tests/server-endpoints/realm-lifecycle-test.ts index 5acebfb65ed..cb23e4c6d73 100644 --- a/packages/realm-server/tests/server-endpoints/realm-lifecycle-test.ts +++ b/packages/realm-server/tests/server-endpoints/realm-lifecycle-test.ts @@ -133,13 +133,14 @@ module(`server-endpoints/${basename(__filename)}`, function () { let jobs = (await context.dbAdapter.execute( `SELECT priority FROM jobs WHERE job_type = 'from-scratch-index' AND args->>'realmURL' = '${json.data.id}'`, )) as { priority: number }[]; - assert.ok( - jobs.length > 0, - 'found from-scratch index job for created realm', - ); - assert.ok( - jobs.some((j) => j.priority === userInitiatedPriority), - 'user initiated realm indexing uses high priority queue', + // Contract: realm creation enqueues exactly one + // from-scratch-index job, at userInitiatedPriority. A second + // job at default priority would block creation behind any + // backlog of lower-priority indexing work. + assert.deepEqual( + jobs.map((j) => j.priority), + [userInitiatedPriority], + 'realm creation enqueues exactly one from-scratch index job at userInitiatedPriority', ); let permissions = await fetchRealmPermissions( diff --git a/packages/runtime-common/realm.ts b/packages/runtime-common/realm.ts index d04568faaf8..e3ab4503cc1 100644 --- a/packages/runtime-common/realm.ts +++ b/packages/runtime-common/realm.ts @@ -1296,8 +1296,13 @@ export class Realm { }); } - async start() { - this.#startedUp.fulfill((() => this.#startup())()); + // `skipFromScratchIndex` lets a caller that has already enqueued a + // from-scratch-index job for this realm mount the realm without + // `#startup` enqueuing its own duplicate. The realm still mounts, + // its #startedUp promise still resolves, and request handlers can + // route to it — but indexing is then the caller's responsibility. + async start(opts?: { skipFromScratchIndex?: boolean }) { + this.#startedUp.fulfill((() => this.#startup(opts))()); if (this.#adapter.fileWatcherEnabled) { await this.startFileWatcher(); @@ -2174,7 +2179,7 @@ export class Realm { await completed; } - async #startup() { + async #startup(opts?: { skipFromScratchIndex?: boolean }) { await Promise.resolve(); let startTime = Date.now(); if (this.#copiedFromRealm) { @@ -2185,7 +2190,7 @@ export class Realm { sourceRealmURL: this.#copiedFromRealm.href, realmURL: this.url, }); - } else { + } else if (!opts?.skipFromScratchIndex) { let isNewIndex = await this.#realmIndexUpdater.isNewIndex(); if (isNewIndex || this.#fullIndexOnStartup) { let promise = this.#realmIndexUpdater.fullIndex( From acd64ac50cb5bf9e80cc0c17ded5324cae2841a6 Mon Sep 17 00:00:00 2001 From: Hassan Abdel-Rahman Date: Fri, 15 May 2026 10:32:10 -0400 Subject: [PATCH 2/3] createRealm: mount before publishing the index job MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A worker claiming the priority-10 job between publish and the lookupOrMount call would fetch the new realm's _mtimes against this server-instance. That fetch would land in findOrMountRealm, whose lazy-mount path calls reconciler.lookupOrMount(url) without the skipFromScratchIndex option — re-introducing the priority-0 duplicate enqueue this PR is trying to eliminate. Reorder so the realm is mounted (via skipFromScratchIndex lookupOrMount) before the index job is published. Once the realm is in realms[] / virtualNetwork / the reconciler's `mounted` map, a worker fetch routed here resolves via the existing mount and never triggers the lazy-mount path. The sibling-instance race (reconciler NOTIFY in flight) remains but is the same pre-existing window that exists for any newly-created realm. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../realm-server/handlers/create-realm.ts | 47 ++++++++++--------- 1 file changed, 26 insertions(+), 21 deletions(-) diff --git a/packages/realm-server/handlers/create-realm.ts b/packages/realm-server/handlers/create-realm.ts index 82d1707da6d..a40c879a4c7 100644 --- a/packages/realm-server/handlers/create-realm.ts +++ b/packages/realm-server/handlers/create-realm.ts @@ -219,25 +219,20 @@ export async function createRealm( virtualNetwork.addURLMapping(new URL(url), actualRealmURL); } - // Enqueue exactly one from-scratch-index job for this new realm, at - // userInitiatedPriority so a backed-up queue of system-priority jobs - // (e.g. a deploy-triggered reindex storm) does not stall realm - // creation. lookupOrMount is told to skip its own from-scratch - // enqueue via skipFromScratchIndex so this remains the only job — - // independent of whether the from-scratch coalesce would have caught - // a duplicate. - let indexJob = await enqueueReindexRealmJob( - url, - ownerUsername, - queue, - dbAdapter, - userInitiatedPriority, - ); - - // Synchronously mount the realm on the *handling* instance. The 202 - // response with status:'pending' is for sibling instances — they - // pick up the realm via NOTIFY realm_registry and lazy-mount on - // first request. Mounting eagerly here also drains the queue + // Mount the realm on the *handling* instance BEFORE publishing the + // index job. If a worker claimed the job between publish and the + // mount below, the worker's first `_mtimes` fetch against this + // server-instance would land in findOrMountRealm, whose lazy-mount + // path calls lookupOrMount without `skipFromScratchIndex` — and + // that lookupOrMount would enqueue the duplicate priority-0 job + // this code is trying to avoid. Mounting first means the realm is + // already in `realms[]` / `virtualNetwork` / the reconciler's + // `mounted` map by the time any worker fetch can route here, so the + // lazy-mount path never fires for it. + // + // The 202 response with status:'pending' is for sibling instances — + // they pick up the realm via NOTIFY realm_registry and lazy-mount + // on first request. Mounting eagerly here also drains the queue // locally so the test framework's teardown (close server → drain // runner → close DB) doesn't race a worker mid-fetch on the now- // closed HTTP listener. @@ -250,10 +245,20 @@ export async function createRealm( ); } + // Enqueue exactly one from-scratch-index job at userInitiatedPriority + // so a backed-up queue of system-priority jobs (e.g. a deploy- + // triggered reindex storm) does not stall realm creation. + let indexJob = await enqueueReindexRealmJob( + url, + ownerUsername, + queue, + dbAdapter, + userInitiatedPriority, + ); + // Wait for the priority-10 job to complete so the realm is fully // indexed by the time we return — preserving the prior "fully ready - // on this instance" contract without the duplicate-enqueue - // workaround. + // on this instance" contract. await indexJob.done; return { url, realm, info }; From 392ea9da7cf5159b6b0cc7d2265924f990b266f0 Mon Sep 17 00:00:00 2001 From: Hassan Abdel-Rahman Date: Fri, 15 May 2026 10:37:24 -0400 Subject: [PATCH 3/3] createRealm: thread fromScratchIndexPriority instead of skipping MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Earlier shape (skipFromScratchIndex + explicit enqueueReindexRealmJob in createRealm) bypassed RealmIndexUpdater.publishFullIndex, so the mounted realm's #stats, #ignoreData, and #ignoreDataVersion never updated when the job completed. Replace skipFromScratchIndex with fromScratchIndexPriority. The mount pipeline itself drives the one-and-only from-scratch job at the chosen priority: lookupOrMount(url, { fromScratchIndexPriority }) → ensureMounted(row, opts) → realm.start(opts) → #startup(opts) → #realmIndexUpdater.fullIndex(opts.fromScratchIndexPriority ?? this.#fromScratchIndexPriority) → publishFullIndex(...) // .then updates #stats/#ignoreData/... publishFullIndex remains the single source of truth for full-index state updates; createRealm just picks the priority. prepareRealmFromRow publishes the realm into realms[] / virtualNetwork synchronously, so worker self-fetches that race the mount still resolve via the existing mount and never re-enter the lazy-mount path. Drop the now-unused enqueueReindexRealmJob + queue dep from create-realm.ts. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../realm-server/handlers/create-realm.ts | 41 +++++-------------- .../lib/realm-registry-reconciler.ts | 4 +- packages/realm-server/routes.ts | 1 - packages/runtime-common/realm.ts | 23 ++++++----- 4 files changed, 24 insertions(+), 45 deletions(-) diff --git a/packages/realm-server/handlers/create-realm.ts b/packages/realm-server/handlers/create-realm.ts index a40c879a4c7..7e9e5f7f425 100644 --- a/packages/realm-server/handlers/create-realm.ts +++ b/packages/realm-server/handlers/create-realm.ts @@ -4,7 +4,6 @@ import { ensureDirSync, writeJSONSync } from 'fs-extra'; import * as Sentry from '@sentry/node'; import type { DBAdapter, - QueuePublisher, Realm, RealmInfo, VirtualNetwork, @@ -19,7 +18,6 @@ import { SupportedMimeType, userInitiatedPriority, } from '@cardstack/runtime-common'; -import { enqueueReindexRealmJob } from '@cardstack/runtime-common/jobs/reindex-realm'; import { getMatrixUsername } from '@cardstack/runtime-common/matrix-client'; import { insertSourceRealmInRegistry } from '../lib/realm-registry-writes'; import type { RealmRegistryReconciler } from '../lib/realm-registry-reconciler'; @@ -37,7 +35,6 @@ export type CreateRealmDeps = { serverURL: URL; realms: Realm[]; dbAdapter: DBAdapter; - queue: QueuePublisher; virtualNetwork: VirtualNetwork; realmsRootPath: string; reconciler: RealmRegistryReconciler; @@ -78,7 +75,6 @@ export async function createRealm( serverURL, realms, dbAdapter, - queue, virtualNetwork, realmsRootPath, reconciler, @@ -219,16 +215,15 @@ export async function createRealm( virtualNetwork.addURLMapping(new URL(url), actualRealmURL); } - // Mount the realm on the *handling* instance BEFORE publishing the - // index job. If a worker claimed the job between publish and the - // mount below, the worker's first `_mtimes` fetch against this - // server-instance would land in findOrMountRealm, whose lazy-mount - // path calls lookupOrMount without `skipFromScratchIndex` — and - // that lookupOrMount would enqueue the duplicate priority-0 job - // this code is trying to avoid. Mounting first means the realm is - // already in `realms[]` / `virtualNetwork` / the reconciler's - // `mounted` map by the time any worker fetch can route here, so the - // lazy-mount path never fires for it. + // Mount the realm on the *handling* instance and let the mount + // pipeline itself drive the one-and-only from-scratch-index, at + // userInitiatedPriority so a backed-up queue of system-priority + // jobs (e.g. a deploy-triggered reindex storm) does not stall realm + // creation. lookupOrMount → ensureMounted → realm.start → #startup + // sees `isNewIndex = true` for a freshly-registered realm and + // enqueues exactly one job via publishFullIndex, which also updates + // the realm's in-memory #stats / #ignoreData / #ignoreDataVersion + // when the job completes. // // The 202 response with status:'pending' is for sibling instances — // they pick up the realm via NOTIFY realm_registry and lazy-mount @@ -237,7 +232,7 @@ export async function createRealm( // runner → close DB) doesn't race a worker mid-fetch on the now- // closed HTTP listener. let realm = await reconciler.lookupOrMount(url, { - skipFromScratchIndex: true, + fromScratchIndexPriority: userInitiatedPriority, }); if (!realm) { throw new Error( @@ -245,22 +240,6 @@ export async function createRealm( ); } - // Enqueue exactly one from-scratch-index job at userInitiatedPriority - // so a backed-up queue of system-priority jobs (e.g. a deploy- - // triggered reindex storm) does not stall realm creation. - let indexJob = await enqueueReindexRealmJob( - url, - ownerUsername, - queue, - dbAdapter, - userInitiatedPriority, - ); - - // Wait for the priority-10 job to complete so the realm is fully - // indexed by the time we return — preserving the prior "fully ready - // on this instance" contract. - await indexJob.done; - return { url, realm, info }; } diff --git a/packages/realm-server/lib/realm-registry-reconciler.ts b/packages/realm-server/lib/realm-registry-reconciler.ts index aef001c985e..a2e828fcd02 100644 --- a/packages/realm-server/lib/realm-registry-reconciler.ts +++ b/packages/realm-server/lib/realm-registry-reconciler.ts @@ -293,7 +293,7 @@ export class RealmRegistryReconciler { // original requester. async lookupOrMount( url: string, - opts?: { skipFromScratchIndex?: boolean }, + opts?: { fromScratchIndexPriority?: number }, ): Promise { const inflight = this.pendingMounts.get(url); if (inflight) { @@ -352,7 +352,7 @@ export class RealmRegistryReconciler { // these lines. async ensureMounted( row: RealmRegistryRow, - opts?: { skipFromScratchIndex?: boolean }, + opts?: { fromScratchIndexPriority?: number }, ): Promise { // pendingMounts checked before mounted: see lookupOrMount() above. // The Realm is published into mounted synchronously before its diff --git a/packages/realm-server/routes.ts b/packages/realm-server/routes.ts index 44bc4ee2f0e..f3bd90c4719 100644 --- a/packages/realm-server/routes.ts +++ b/packages/realm-server/routes.ts @@ -121,7 +121,6 @@ export function createRoutes(args: CreateRoutesArgs) { serverURL: new URL(args.serverURL), realms: args.realms, dbAdapter: args.dbAdapter, - queue: args.queue, virtualNetwork: args.virtualNetwork, realmsRootPath: args.realmsRootPath, reconciler: args.reconciler, diff --git a/packages/runtime-common/realm.ts b/packages/runtime-common/realm.ts index e3ab4503cc1..e6e024128d3 100644 --- a/packages/runtime-common/realm.ts +++ b/packages/runtime-common/realm.ts @@ -1296,12 +1296,13 @@ export class Realm { }); } - // `skipFromScratchIndex` lets a caller that has already enqueued a - // from-scratch-index job for this realm mount the realm without - // `#startup` enqueuing its own duplicate. The realm still mounts, - // its #startedUp promise still resolves, and request handlers can - // route to it — but indexing is then the caller's responsibility. - async start(opts?: { skipFromScratchIndex?: boolean }) { + // `fromScratchIndexPriority` overrides the realm's default priority + // for the from-scratch-index job that `#startup` enqueues when the + // realm has no prior index. Callers that mount-on-demand for a + // user-initiated flow (e.g. realm creation) pass + // `userInitiatedPriority` so the resulting job jumps ahead of any + // backlog of system-priority indexing work. + async start(opts?: { fromScratchIndexPriority?: number }) { this.#startedUp.fulfill((() => this.#startup(opts))()); if (this.#adapter.fileWatcherEnabled) { @@ -2179,7 +2180,7 @@ export class Realm { await completed; } - async #startup(opts?: { skipFromScratchIndex?: boolean }) { + async #startup(opts?: { fromScratchIndexPriority?: number }) { await Promise.resolve(); let startTime = Date.now(); if (this.#copiedFromRealm) { @@ -2190,12 +2191,12 @@ export class Realm { sourceRealmURL: this.#copiedFromRealm.href, realmURL: this.url, }); - } else if (!opts?.skipFromScratchIndex) { + } else { let isNewIndex = await this.#realmIndexUpdater.isNewIndex(); if (isNewIndex || this.#fullIndexOnStartup) { - let promise = this.#realmIndexUpdater.fullIndex( - this.#fromScratchIndexPriority, - ); + let priority = + opts?.fromScratchIndexPriority ?? this.#fromScratchIndexPriority; + let promise = this.#realmIndexUpdater.fullIndex(priority); if (isNewIndex) { // we only await the full indexing at boot if this is a brand new index await promise;