From 1e5fa195599e8bf8656ba809ae6f65a2718e1f45 Mon Sep 17 00:00:00 2001 From: andreinknv Date: Thu, 30 Apr 2026 19:52:45 -0400 Subject: [PATCH 1/2] fix: GC orphan summary rows after each summarize pass MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Surfaced by end-to-end measurement: after running `--force` with two different chat models (Phase A-MLX with qwen3-coder, then Phase A bridge+batched with claude-haiku-4-5), the codegraph self-repo's symbol_summaries table held 723 rows — 675 fresh haiku entries plus 48 stale qwen3-coder entries whose node_ids no longer exist in the current `nodes` table. The orphans accumulated because the cache- preservation work in bd26945 (clearStructural with FK toggle) left summary rows behind on every --force, and rows that the by-content- hash fallback in summarizeAll didn't reclaim onto a current node_id were never cleaned up. Bookkeeping noise (inflated row counts, misleading Summary coverage metrics) and slow content_hash index growth — no logical bug, but worth fixing while the design is fresh. ## Implementation New `QueryBuilder.pruneOrphanSummaries()` runs: DELETE FROM symbol_summaries WHERE node_id NOT IN (SELECT id FROM nodes) inside a transaction, with before/after row counts captured atomically in the same transaction. Returns `{ summariesDeleted, embeddingsDeleted }`. Embedding rows cascade-delete via the existing FK CASCADE on `symbol_embeddings.node_id REFERENCES symbol_summaries(node_id)`. Wired into `startBackgroundSummarization` to run after Phase 1 (summarize) completes, before Phase 2 (embed). Logged via logDebug only when something was actually deleted. ## Timing rationale MUST run AFTER summarizeAll completes — by then any genuinely relocated summaries (function moved files, line shifted, extraction- version drift) have been re-attached to their new node_ids via the content-hash fallback inside summarizeAll. Calling DURING summarizeAll would race the reclaim and lose cache hits. NOT called in no-chat configurations: orphans there might still be legitimately reclaimable by a FUTURE chat-enabled session via the content-hash fallback. Pruning prematurely would force the next chat session to regenerate from scratch. Acceptable cost (storage growth in configs that never gain a chat backend) for correctness in the more common case. Documented in the method JSDoc. ## Tests `__tests__/migrations-022.test.ts` — 2 new tests: - `pruneOrphanSummaries deletes only summaries whose node_id is gone` — sets up 3 nodes + 3 summaries + 2 embeddings, simulates clearStructural by toggling FKs OFF and DELETing 2 nodes (the exact post-clearStructural-then-reindex state), then prunes. Asserts: 2 summaries deleted, 1 embedding cascade-deleted, 1 live summary + 1 live embedding survive. - `pruneOrphanSummaries is a no-op when nothing is orphaned` — asserts zero deletions and zero state change when all summaries have matching nodes (idempotent on healthy DB). Suite: 1095 / 13 skip / 0 fail (was 1093). ## Reviewer trail Two passes. Pass 1: REQUEST_CHANGES + 1 substantive finding (the four COUNT(*) reads were taken outside the DELETE transaction, leaving an inter-process race window where the embeddingsDeleted count could be wrong) + 2 info findings (directory_summaries scope, no-chat limitation). Substantive fix: wrapped all four counts AND the DELETE inside a single this.db.transaction(() => { ... return obj; })() so the snapshot is atomic. Info finding 3 addressed by adding a JSDoc paragraph documenting the no-chat-session reasoning. Pass 2: APPROVE + 1 info finding (the WASM adapter doesn't support nested transactions — not actionable here since pruneOrphanSummaries is never called from another transaction). ## Out of scope (deferred to follow-up) - `directory_summaries` rows for deleted directories accumulate similarly. Different keying (dir_path, not node_id) means a different prune query. Worth a future PR. - WASM adapter savepoint support — the WasmDatabaseAdapter.transaction() uses bare BEGIN/COMMIT, no savepoints. Future callers that nest pruneOrphanSummaries inside another transaction would hit a SQLite "cannot start a transaction within a transaction" error on WASM only. Co-Authored-By: Claude Opus 4.7 (1M context) --- __tests__/migrations-022.test.ts | 82 ++++++++++++++++++++++++++++++++ src/db/queries.ts | 57 ++++++++++++++++++++++ src/index.ts | 17 +++++++ 3 files changed, 156 insertions(+) diff --git a/__tests__/migrations-022.test.ts b/__tests__/migrations-022.test.ts index 53a4e2e0..c5f695a7 100644 --- a/__tests__/migrations-022.test.ts +++ b/__tests__/migrations-022.test.ts @@ -158,6 +158,88 @@ describe('Migration 022 — content_hash index + structural-clear preservation', } }); + it('pruneOrphanSummaries deletes only summaries whose node_id is gone', () => { + const dbPath = path.join(dir, 'prune.db'); + const conn = DatabaseConnection.initialize(dbPath); + try { + const raw = conn.getDb(); + const queries = new QueryBuilder(raw); + + // All three nodes exist initially (so FK on summaries is happy). + const insertNode = raw.prepare(` + INSERT INTO nodes (id, kind, name, qualified_name, file_path, language, + start_line, end_line, start_column, end_column, updated_at) + VALUES (?, 'function', 'foo', 'foo', 'a.ts', 'typescript', 1, 5, 0, 0, 0) + `); + insertNode.run('n1'); + insertNode.run('n2'); + insertNode.run('n3'); + + raw.prepare(` + INSERT INTO symbol_summaries (node_id, content_hash, summary, model, generated_at) + VALUES ('n1', 'h1', 'live summary', 'm', 100), + ('n2', 'h2', 'orphan A', 'm', 100), + ('n3', 'h3', 'orphan B', 'm', 100) + `).run(); + + const liveVec = Buffer.from(new Float32Array([1, 0, 0]).buffer); + const orphanVec = Buffer.from(new Float32Array([0, 1, 0]).buffer); + raw.prepare(` + INSERT INTO symbol_embeddings (node_id, embedding, embedding_model, source_content_hash) + VALUES ('n1', ?, 'em', 'src-1'), + ('n2', ?, 'em', 'src-2') + `).run(liveVec, orphanVec); + + // Simulate clearStructural's effect: delete n2/n3 from `nodes` + // with FKs OFF, leaving the summary + embedding rows orphaned. + // This is the exact post-clearStructural-then-reindex state where + // pruneOrphanSummaries is meant to clean up. + raw.exec('PRAGMA foreign_keys = OFF'); + raw.prepare("DELETE FROM nodes WHERE id IN ('n2', 'n3')").run(); + raw.exec('PRAGMA foreign_keys = ON'); + + const result = queries.pruneOrphanSummaries(); + + // Two summaries removed (n2, n3) — n1 stayed. + expect(result.summariesDeleted).toBe(2); + // One embedding removed via FK cascade (n2 had an embedding). + expect(result.embeddingsDeleted).toBe(1); + + // Verify final state: only the live row remains. + expect((raw.prepare('SELECT COUNT(*) AS c FROM symbol_summaries').get() as { c: number }).c).toBe(1); + expect((raw.prepare('SELECT COUNT(*) AS c FROM symbol_embeddings').get() as { c: number }).c).toBe(1); + const survivor = raw.prepare("SELECT summary FROM symbol_summaries WHERE node_id = 'n1'").get() as { summary: string }; + expect(survivor.summary).toBe('live summary'); + } finally { + conn.close(); + } + }); + + it('pruneOrphanSummaries is a no-op when nothing is orphaned', () => { + const dbPath = path.join(dir, 'noop.db'); + const conn = DatabaseConnection.initialize(dbPath); + try { + const raw = conn.getDb(); + const queries = new QueryBuilder(raw); + raw.prepare(` + INSERT INTO nodes (id, kind, name, qualified_name, file_path, language, + start_line, end_line, start_column, end_column, updated_at) + VALUES ('n1', 'function', 'foo', 'foo', 'a.ts', 'typescript', 1, 5, 0, 0, 0) + `).run(); + raw.prepare(` + INSERT INTO symbol_summaries (node_id, content_hash, summary, model, generated_at) + VALUES ('n1', 'h', 's', 'm', 100) + `).run(); + + const result = queries.pruneOrphanSummaries(); + expect(result.summariesDeleted).toBe(0); + expect(result.embeddingsDeleted).toBe(0); + expect((raw.prepare('SELECT COUNT(*) AS c FROM symbol_summaries').get() as { c: number }).c).toBe(1); + } finally { + conn.close(); + } + }); + it('clear (full reset) wipes summaries too', () => { const dbPath = path.join(dir, 'wipe.db'); const conn = DatabaseConnection.initialize(dbPath); diff --git a/src/db/queries.ts b/src/db/queries.ts index 43ce5a35..3fc6ed6b 100644 --- a/src/db/queries.ts +++ b/src/db/queries.ts @@ -2788,6 +2788,63 @@ export class QueryBuilder { return rows.map(rowToNode); } + /** + * Garbage-collect summary rows whose `node_id` no longer exists in + * `nodes`. These accumulate across `--force` re-indexes when a symbol + * gets a new node_id (line shift, file move, or extraction-version + * bump). The `summarizeAll` content-hash fallback reclaims them onto + * the new node_id when the body matches, but rows that nobody claimed + * stick around forever otherwise — which inflates the row count, + * confuses `Summary coverage` reporting, and slows the + * `(content_hash, model)` index over time. + * + * Cascading FK on `symbol_embeddings` means deleted summaries take + * their corresponding embeddings with them in the same transaction. + * `directory_summaries` is keyed by dir_path (not node_id) and is + * untouched here. + * + * Safe to call AFTER `summarizeAll` completes — by then any genuinely + * relocated summaries have been re-attached to their new node_ids + * via the content-hash fallback. Calling DURING summarizeAll would + * race the reclaim and lose cache hits. + * + * NOT called in no-chat configurations: orphans there might still be + * legitimately reclaimable by a FUTURE chat-enabled session (the + * by-content-hash fallback inside summarizeAll is only triggered when + * there's a chat backend to call when reclaim misses). Pruning during + * a no-chat-only run would force the next chat session to regenerate + * those summaries from scratch. Acceptable cost (storage growth in + * configs that never gain a chat backend) for correctness in the more + * common case. + */ + pruneOrphanSummaries(): { summariesDeleted: number; embeddingsDeleted: number } { + // All four counts and the DELETE share one transaction so the + // before/after snapshot is atomic. Without this, a concurrent + // process writing or deleting embeddings between the SELECT and + // the DELETE would produce a wrong `embeddingsDeleted` count. + return this.db.transaction(() => { + const summariesBefore = (this.db + .prepare('SELECT COUNT(*) AS c FROM symbol_summaries') + .get() as { c: number }).c; + const embeddingsBefore = (this.db + .prepare('SELECT COUNT(*) AS c FROM symbol_embeddings') + .get() as { c: number }).c; + this.db.exec( + 'DELETE FROM symbol_summaries WHERE node_id NOT IN (SELECT id FROM nodes)' + ); + const summariesAfter = (this.db + .prepare('SELECT COUNT(*) AS c FROM symbol_summaries') + .get() as { c: number }).c; + const embeddingsAfter = (this.db + .prepare('SELECT COUNT(*) AS c FROM symbol_embeddings') + .get() as { c: number }).c; + return { + summariesDeleted: summariesBefore - summariesAfter, + embeddingsDeleted: embeddingsBefore - embeddingsAfter, + }; + })(); + } + /** * Read a single symbol's cached summary, or null if none exists. */ diff --git a/src/index.ts b/src/index.ts index 401cccfc..c9654613 100644 --- a/src/index.ts +++ b/src/index.ts @@ -1115,6 +1115,23 @@ export class CodeGraph { errors: result.errors, durationMs: result.durationMs, }); + + // Garbage-collect summary rows whose node_id no longer exists. + // These accumulate across --force re-indexes when symbols get + // new node_ids (line shift, file move, extraction-version + // bump). MUST run AFTER summarizeAll so the content-hash + // fallback inside summarizeAll has had a chance to reclaim + // renames onto current node_ids — anything left here is truly + // orphaned. Embeddings cascade-delete via FK. + if (!controller.signal.aborted) { + const pruned = this.queries.pruneOrphanSummaries(); + if (pruned.summariesDeleted > 0 || pruned.embeddingsDeleted > 0) { + logDebug('Pruned orphan summaries', { + summariesDeleted: pruned.summariesDeleted, + embeddingsDeleted: pruned.embeddingsDeleted, + }); + } + } } // Phase-2: embed the summaries so semantic search has data. From 9adac6338edd94296ca6f4c43fa92fba07ccec95 Mon Sep 17 00:00:00 2001 From: andreinknv Date: Thu, 30 Apr 2026 20:20:36 -0400 Subject: [PATCH 2/2] fix: prune orphan directory_summaries; WASM adapter savepoint nesting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes the two follow-ups noted in 1e5fa19's commit message: ## directory_summaries orphan pruning Same issue `pruneOrphanSummaries` fixed for symbol_summaries, but for directory_summaries — when a dir is deleted/renamed/excluded its summary lingers indefinitely (no FK anchor; dir_path is the PK). `QueryBuilder.pruneOrphanDirectorySummaries()`: - Reads all `files.path` rows. - For each file, computes the IMMEDIATE PARENT directory via `path.posix.dirname(filePath.replace(/\\/g, '/'))` — exactly mirroring what `dir-summarizer.ts:groupByDir` writes. Crucially does NOT walk the ancestor chain (the first draft did, which would let stale intermediate-dir summaries survive — e.g. a `src` summary from when files lived directly in `src/` but have since all moved into `src/core/`). - Inserts the live set into a `TEMP TABLE _live_dirs` and runs `DELETE FROM directory_summaries WHERE dir_path NOT IN (...)`. - All four COUNT(*) reads + the DELETE share one transaction so the before/after snapshot is atomic (same race fix as `pruneOrphanSummaries`). - Returns `{ directorySummariesDeleted }`. Wired into `startBackgroundSummarization` to run after Phase 3 (dir summarizer). Same timing rationale as symbol prune: must run AFTER the writer has had its chance to refresh stale entries, or we'd race its writes for newly-summarised dirs. ### Tests in `__tests__/migrations-022.test.ts` (3 new) - Live dirs preserved, orphan dirs pruned (mixed scenario with 4 indexed files spanning src/a, src/b, and src directly). - Empty files table prunes everything (degenerate case). - **Stale ancestor-only dir IS pruned** — regression test for the reviewer-caught under-pruning bug. Indexes only `src/core/foo.ts`, plants summaries for both `src/core` (live) and `src` (stale ancestor-only). Asserts only `src/core` survives. Pre-fix this would have left `src` alive due to the ancestor walk. ## WASM adapter savepoint nesting `node-sqlite3-wasm` doesn't transparently handle nested transactions like better-sqlite3 does. Pre-fix, a caller wrapping a `transaction()` inside another `transaction()` would get "cannot start a transaction within a transaction" — only on the WASM backend, silently OK on native. The reviewer flagged this in 1e5fa19's pass 1 as out-of-scope. `WasmDatabaseAdapter.transaction()` in `src/db/sqlite-adapter.ts`: - Added `private _txDepth = 0` field tracking nesting depth on the connection. - Outer call (depth 0) → BEGIN/COMMIT/ROLLBACK. - Inner call (depth > 0) → SAVEPOINT/RELEASE/ROLLBACK TO + RELEASE. - Savepoint name `cg_sp_${depth}` is depth-tagged so concurrent depth values produce stable distinct names; safe to inline (depth is a non-negative integer, no SQL-injection risk). - Rollback paths wrapped in try/catch — if the rollback itself fails, the original error is annotated with a "(NOTE: ROLLBACK also failed: ...)" suffix before re-throw. Operators reading the log get a signal when connection state may be uncertain. - `finally { this._txDepth-- }` restores depth on both success and failure paths. `WasmDatabaseAdapter` is now exported (was un-exported `class`) so tests can drive it directly on machines where better-sqlite3 is installed and `createDatabase` would otherwise pick native. Brief JSDoc on the export explains the test-only intent. ### Tests in `__tests__/wasm-savepoint.test.ts` (NEW, 8 tests) - Single-level commit - Nested commit (outer + inner both succeed) - Inner throw + outer recovers (writes survive even though inner's SAVEPOINT was rolled back) — this is THE case that errored pre-fix. - Outer throw → full rollback (zero rows survive) - Depth counter resets on success AND failure paths (third-run-after- failure case proves the `finally` restores depth on both branches). - Three-deep nesting with L3 failure rolled back inside L2. - transaction() return value passes through. - transaction() args forwarded to wrapped function. Suite: 1106 / 13 skip / 0 fail (was 1095, +11 new tests). ## Reviewer trail Two passes. Pass 1: REQUEST_CHANGES + 1 substantive (under-pruning ancestor-walk bug — would let stale `src` summary survive when files all moved into `src/core`) + 1 info (silently swallowed outer-rollback errors). Substantive: rewrote prune to use immediate-parent only + added regression test that would have failed pre-fix. Info: annotate caught rollback errors onto the original exception message. Pass 2: REQUEST_CHANGES + 1 finding (stale JSDoc still described the removed ancestor-walk behavior) + 1 info (frozen-Error edge case in error mutation — theoretical, not actionable). JSDoc fixed. Co-Authored-By: Claude Opus 4.7 (1M context) --- __tests__/migrations-022.test.ts | 101 ++++++++++++++++++++ __tests__/wasm-savepoint.test.ts | 156 +++++++++++++++++++++++++++++++ src/db/queries.ts | 66 +++++++++++++ src/db/sqlite-adapter.ts | 57 ++++++++++- src/index.ts | 11 +++ 5 files changed, 387 insertions(+), 4 deletions(-) create mode 100644 __tests__/wasm-savepoint.test.ts diff --git a/__tests__/migrations-022.test.ts b/__tests__/migrations-022.test.ts index c5f695a7..0eec2472 100644 --- a/__tests__/migrations-022.test.ts +++ b/__tests__/migrations-022.test.ts @@ -215,6 +215,107 @@ describe('Migration 022 — content_hash index + structural-clear preservation', } }); + it('pruneOrphanDirectorySummaries deletes only dirs with no indexed files', () => { + const dbPath = path.join(dir, 'prune-dirs.db'); + const conn = DatabaseConnection.initialize(dbPath); + try { + const raw = conn.getDb(); + const queries = new QueryBuilder(raw); + + // Indexed files: 2 in src/a, 1 in src/b, 1 directly in src/. + // Live = exactly the immediate parents the dir-summarizer would + // have written for: src/a, src/b, and src (only because main.ts + // lives directly in src/). NOT all ancestors — see the comment + // in pruneOrphanDirectorySummaries explaining why we don't walk + // the chain (would let stale ancestor-only summaries survive). + const insFile = raw.prepare(` + INSERT INTO files (path, language, content_hash, modified_at, size, indexed_at) + VALUES (?, 'typescript', 'h', 0, 100, 0) + `); + insFile.run('src/a/foo.ts'); + insFile.run('src/a/bar.ts'); + insFile.run('src/b/baz.ts'); + insFile.run('src/main.ts'); + + // Plant 5 dir summaries: 3 live, 2 orphan. + const insDir = raw.prepare(` + INSERT INTO directory_summaries (dir_path, summary, content_hash, model, generated_at) + VALUES (?, ?, 'h', 'm', 100) + `); + insDir.run('src/a', 'live a'); + insDir.run('src/b', 'live b'); + insDir.run('src', 'live root (has main.ts directly)'); + insDir.run('src/c', 'orphan deleted dir'); + insDir.run('vendor/lib', 'orphan outside tree'); + + const result = queries.pruneOrphanDirectorySummaries(); + + expect(result.directorySummariesDeleted).toBe(2); + + const remaining = (raw + .prepare('SELECT dir_path FROM directory_summaries ORDER BY dir_path') + .all() as Array<{ dir_path: string }>) + .map((r) => r.dir_path); + expect(remaining).toEqual(['src', 'src/a', 'src/b']); + } finally { + conn.close(); + } + }); + + it('pruneOrphanDirectorySummaries: stale ancestor-only dir IS pruned', () => { + // Regression for an under-pruning bug: if files used to live + // directly in `src/` (so the dir-summarizer wrote a `src` summary) + // but have all moved into subdirs like `src/core/`, the stale + // `src` summary must be pruned. The earlier draft walked the + // ancestor chain and would have kept it. + const dbPath = path.join(dir, 'prune-dirs-ancestor.db'); + const conn = DatabaseConnection.initialize(dbPath); + try { + const raw = conn.getDb(); + const queries = new QueryBuilder(raw); + + raw.prepare(` + INSERT INTO files (path, language, content_hash, modified_at, size, indexed_at) + VALUES ('src/core/foo.ts', 'typescript', 'h', 0, 100, 0) + `).run(); + + raw.prepare(` + INSERT INTO directory_summaries (dir_path, summary, content_hash, model, generated_at) + VALUES ('src/core', 'live', 'h', 'm', 100), + ('src', 'stale ancestor — no file lives directly here', 'h', 'm', 100) + `).run(); + + const result = queries.pruneOrphanDirectorySummaries(); + expect(result.directorySummariesDeleted).toBe(1); + const remaining = (raw + .prepare('SELECT dir_path FROM directory_summaries') + .all() as Array<{ dir_path: string }>) + .map((r) => r.dir_path); + expect(remaining).toEqual(['src/core']); + } finally { + conn.close(); + } + }); + + it('pruneOrphanDirectorySummaries: empty files table prunes everything', () => { + const dbPath = path.join(dir, 'prune-dirs-empty.db'); + const conn = DatabaseConnection.initialize(dbPath); + try { + const raw = conn.getDb(); + const queries = new QueryBuilder(raw); + raw.prepare(` + INSERT INTO directory_summaries (dir_path, summary, content_hash, model, generated_at) + VALUES ('src/a', 's', 'h', 'm', 100), ('src/b', 's', 'h', 'm', 100) + `).run(); + + const result = queries.pruneOrphanDirectorySummaries(); + expect(result.directorySummariesDeleted).toBe(2); + expect((raw.prepare('SELECT COUNT(*) AS c FROM directory_summaries').get() as { c: number }).c).toBe(0); + } finally { + conn.close(); + } + }); + it('pruneOrphanSummaries is a no-op when nothing is orphaned', () => { const dbPath = path.join(dir, 'noop.db'); const conn = DatabaseConnection.initialize(dbPath); diff --git a/__tests__/wasm-savepoint.test.ts b/__tests__/wasm-savepoint.test.ts new file mode 100644 index 00000000..18356373 --- /dev/null +++ b/__tests__/wasm-savepoint.test.ts @@ -0,0 +1,156 @@ +/** + * WASM adapter — nested transaction semantics via savepoints. + * + * `node-sqlite3-wasm` only supports a single top-level transaction at + * a time. Bare BEGIN inside an active transaction errors with "cannot + * start a transaction within a transaction". Our adapter wraps that + * into a SAVEPOINT/RELEASE/ROLLBACK TO scheme so callers can nest + * `transaction()` calls transparently — matching better-sqlite3's + * native nested-transaction support. + * + * These tests import `WasmDatabaseAdapter` directly so they exercise + * the WASM path even on machines where better-sqlite3 is also + * installed (where `createDatabase` would otherwise pick native). + */ +import { describe, it, expect, beforeEach, afterEach } from 'vitest'; +import * as fs from 'fs'; +import * as path from 'path'; +import * as os from 'os'; +import { WasmDatabaseAdapter } from '../src/db/sqlite-adapter'; + +function tempDir(): string { + return fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-wasm-')); +} + +function cleanup(dir: string): void { + if (fs.existsSync(dir)) fs.rmSync(dir, { recursive: true, force: true }); +} + +describe('WasmDatabaseAdapter — nested transactions via savepoints', () => { + let dir: string; + let db: WasmDatabaseAdapter; + beforeEach(() => { + dir = tempDir(); + db = new WasmDatabaseAdapter(path.join(dir, 'test.db')); + db.exec('CREATE TABLE t (id INTEGER PRIMARY KEY, val TEXT NOT NULL)'); + }); + afterEach(() => { + db.close(); + cleanup(dir); + }); + + it('single-level transaction commits writes', () => { + db.transaction(() => { + db.exec("INSERT INTO t (val) VALUES ('a')"); + })(); + const rows = db.prepare('SELECT val FROM t').all(); + expect(rows).toEqual([{ val: 'a' }]); + }); + + it('nested commit: outer + inner both commit', () => { + db.transaction(() => { + db.exec("INSERT INTO t (val) VALUES ('outer')"); + db.transaction(() => { + db.exec("INSERT INTO t (val) VALUES ('inner')"); + })(); + })(); + const vals = (db.prepare('SELECT val FROM t ORDER BY val').all() as Array<{ val: string }>) + .map((r) => r.val); + expect(vals).toEqual(['inner', 'outer']); + }); + + it('inner throw: only inner rolls back, outer commits', () => { + db.transaction(() => { + db.exec("INSERT INTO t (val) VALUES ('outer-keeps')"); + try { + db.transaction(() => { + db.exec("INSERT INTO t (val) VALUES ('inner-discards')"); + throw new Error('inner failure'); + })(); + } catch { + // swallow — testing that outer can recover from inner failure + } + db.exec("INSERT INTO t (val) VALUES ('outer-after-recovery')"); + })(); + const vals = (db.prepare('SELECT val FROM t ORDER BY val').all() as Array<{ val: string }>) + .map((r) => r.val); + // 'inner-discards' is gone; everything outer wrote (before AND + // after the inner failure) survives. Pre-fix this would have + // thrown on the inner BEGIN with "cannot start a transaction + // within a transaction". + expect(vals).toEqual(['outer-after-recovery', 'outer-keeps']); + }); + + it('outer throw: full rollback, no rows survive', () => { + expect(() => { + db.transaction(() => { + db.exec("INSERT INTO t (val) VALUES ('a')"); + db.transaction(() => { + db.exec("INSERT INTO t (val) VALUES ('b')"); + })(); + throw new Error('outer failure'); + })(); + }).toThrow('outer failure'); + expect((db.prepare('SELECT COUNT(*) AS c FROM t').get() as { c: number }).c).toBe(0); + }); + + it('depth counter resets after both success and failure paths', () => { + // First run: success → depth back to 0. + db.transaction(() => { + db.exec("INSERT INTO t (val) VALUES ('first')"); + })(); + // Second run: same depth-zero starting point, must use BEGIN + // (not SAVEPOINT). If the depth counter wasn't restored after the + // first run's finally, this would raise "no such savepoint". + expect(() => { + db.transaction(() => { + db.exec("INSERT INTO t (val) VALUES ('second')"); + throw new Error('second failure'); + })(); + }).toThrow('second failure'); + // Third run: again depth-zero. The error path's finally must have + // restored depth too. + db.transaction(() => { + db.exec("INSERT INTO t (val) VALUES ('third')"); + })(); + const vals = (db.prepare('SELECT val FROM t ORDER BY val').all() as Array<{ val: string }>) + .map((r) => r.val); + expect(vals).toEqual(['first', 'third']); + }); + + it('three-deep nesting: each level commits or rolls back independently', () => { + db.transaction(() => { + db.exec("INSERT INTO t (val) VALUES ('L1')"); + db.transaction(() => { + db.exec("INSERT INTO t (val) VALUES ('L2')"); + try { + db.transaction(() => { + db.exec("INSERT INTO t (val) VALUES ('L3-discards')"); + throw new Error('L3 fails'); + })(); + } catch { /* swallow */ } + db.exec("INSERT INTO t (val) VALUES ('L2-survives-L3-failure')"); + })(); + })(); + const vals = (db.prepare('SELECT val FROM t ORDER BY val').all() as Array<{ val: string }>) + .map((r) => r.val); + expect(vals).toEqual(['L1', 'L2', 'L2-survives-L3-failure']); + }); + + it('transaction() return value passes through', () => { + const result = db.transaction(() => { + db.exec("INSERT INTO t (val) VALUES ('x')"); + return 42; + })(); + expect(result).toBe(42); + }); + + it('transaction() forwards args to the wrapped function', () => { + const seen: number[] = []; + const tx = db.transaction((a: number, b: number) => { + seen.push(a, b); + }); + tx(7, 11); + expect(seen).toEqual([7, 11]); + }); +}); diff --git a/src/db/queries.ts b/src/db/queries.ts index 3fc6ed6b..8df02538 100644 --- a/src/db/queries.ts +++ b/src/db/queries.ts @@ -4,6 +4,7 @@ * Prepared statements for CRUD operations on the knowledge graph. */ +import * as path from 'path'; import { SqliteDatabase, SqliteStatement } from './sqlite-adapter'; import { Node, @@ -2845,6 +2846,71 @@ export class QueryBuilder { })(); } + /** + * Garbage-collect `directory_summaries` rows whose `dir_path` no + * longer corresponds to any indexed file. These accumulate when a + * directory is deleted from disk, renamed, or removed via include- + * pattern changes — the summary row sticks around forever otherwise + * because there's no FK anchor. + * + * Live directories are derived from `files.path`: each indexed file + * contributes ONLY its immediate parent directory (no ancestor + * walk). Paths are normalised to posix forward slashes to match + * how `dir-summarizer.ts:groupByDir` constructs `dir_path` + * (`path.posix.dirname(filePath.replace(/\\/g, '/'))`). Walking + * ancestors here would let stale intermediate-dir summaries survive + * — e.g. a `src` summary written when files lived directly in + * `src/` but have all since moved into `src/core/` etc. + * + * Safe to call AFTER `summarizeAllDirectories` completes. Calling + * during would race the dir-summarizer's own writes for newly-added + * dirs. + */ + pruneOrphanDirectorySummaries(): { directorySummariesDeleted: number } { + return this.db.transaction(() => { + const files = this.db + .prepare('SELECT path FROM files') + .all() as Array<{ path: string }>; + + // Live dirs are EXACTLY the immediate parents of indexed files. + // Don't walk the ancestor chain: `dir-summarizer.ts:groupByDir` + // (the only writer of directory_summaries) only emits a summary + // for `path.posix.dirname(filePath)` — i.e. the directory the + // file lives directly in. Walking ancestors here would mark + // intermediate dirs as live and miss orphans like a `src` + // summary that exists from when files lived directly in `src/` + // but have since all moved into `src/core/` etc. + const liveDirs = new Set(); + for (const { path: fp } of files) { + const d = path.posix.dirname(fp.replace(/\\/g, '/')); + if (d !== '.' && d !== '/' && d !== '') { + liveDirs.add(d); + } + } + + const before = (this.db + .prepare('SELECT COUNT(*) AS c FROM directory_summaries') + .get() as { c: number }).c; + + // Use a temp table to avoid building an enormous IN(?,?,...) clause + // for repos with thousands of directories. Drop on commit so it + // doesn't survive the transaction. + this.db.exec('DROP TABLE IF EXISTS _live_dirs'); + this.db.exec('CREATE TEMP TABLE _live_dirs (dir TEXT PRIMARY KEY)'); + const insLive = this.db.prepare('INSERT OR IGNORE INTO _live_dirs (dir) VALUES (?)'); + for (const d of liveDirs) insLive.run(d); + this.db.exec( + 'DELETE FROM directory_summaries WHERE dir_path NOT IN (SELECT dir FROM _live_dirs)' + ); + this.db.exec('DROP TABLE _live_dirs'); + + const after = (this.db + .prepare('SELECT COUNT(*) AS c FROM directory_summaries') + .get() as { c: number }).c; + return { directorySummariesDeleted: before - after }; + })(); + } + /** * Read a single symbol's cached summary, or null if none exists. */ diff --git a/src/db/sqlite-adapter.ts b/src/db/sqlite-adapter.ts index 01155fc4..f2a18c7a 100644 --- a/src/db/sqlite-adapter.ts +++ b/src/db/sqlite-adapter.ts @@ -83,11 +83,22 @@ function resolveParams(params: any[], paramOrder: string[] | null): any { * - node-sqlite3-wasm doesn't have a `pragma()` method * - node-sqlite3-wasm doesn't have a `transaction()` method */ -class WasmDatabaseAdapter implements SqliteDatabase { +/** + * Exported only so tests can drive the WASM transaction-nesting code + * path on machines where better-sqlite3 is also installed (the + * `createDatabase` factory always picks native there). Production + * callers should go through `createDatabase`. + */ +export class WasmDatabaseAdapter implements SqliteDatabase { private _db: any; // Track raw WASM statements so we can finalize them on close. // node-sqlite3-wasm won't release its file lock if statements are left open. private _openStmts = new Set(); + // Depth counter for nested transactions. node-sqlite3-wasm does NOT + // implement nesting natively (better-sqlite3 does it via savepoints + // transparently), so we use SAVEPOINT/RELEASE/ROLLBACK TO ourselves + // when an outer transaction is already active. + private _txDepth = 0; constructor(dbPath: string) { // eslint-disable-next-line @typescript-eslint/no-require-imports @@ -166,14 +177,52 @@ class WasmDatabaseAdapter implements SqliteDatabase { transaction(fn: (...args: any[]) => T): (...args: any[]) => T { return (...args: any[]) => { - this._db.exec('BEGIN'); + const isOuter = this._txDepth === 0; + // Use a depth-tagged name so a fn that itself opens transactions + // produces nested savepoints with stable, distinct names. Names + // are safe to inline into SQL: depth is an unsigned integer. + const sp = `cg_sp_${this._txDepth}`; + if (isOuter) { + this._db.exec('BEGIN'); + } else { + this._db.exec(`SAVEPOINT ${sp}`); + } + this._txDepth++; try { const result = fn(...args); - this._db.exec('COMMIT'); + if (isOuter) { + this._db.exec('COMMIT'); + } else { + this._db.exec(`RELEASE ${sp}`); + } return result; } catch (error) { - this._db.exec('ROLLBACK'); + // Wrap rollbacks in try/catch so a thrown rollback (e.g. + // transaction already aborted by SQLite) doesn't mask the + // original error. Annotate the original error with a hint + // when the rollback itself failed — operators reading the log + // need to know the connection state may be uncertain. + if (isOuter) { + try { + this._db.exec('ROLLBACK'); + } catch (rbErr) { + if (error instanceof Error) { + error.message += ` (NOTE: outer ROLLBACK also failed: ${rbErr instanceof Error ? rbErr.message : String(rbErr)} — connection state uncertain)`; + } + } + } else { + try { + this._db.exec(`ROLLBACK TO ${sp}`); + this._db.exec(`RELEASE ${sp}`); + } catch (rbErr) { + if (error instanceof Error) { + error.message += ` (NOTE: SAVEPOINT rollback also failed: ${rbErr instanceof Error ? rbErr.message : String(rbErr)})`; + } + } + } throw error; + } finally { + this._txDepth--; } }; } diff --git a/src/index.ts b/src/index.ts index c9654613..7737ea8f 100644 --- a/src/index.ts +++ b/src/index.ts @@ -1183,6 +1183,17 @@ export class CodeGraph { errors: dResult.errors, durationMs: dResult.durationMs, }); + + // Same orphan-GC pattern as symbol summaries (1e5fa19), but + // for directory_summaries. Live dirs derived from files.path. + // Runs after the dir-summarizer completes so we don't race + // its writes for newly-summarised dirs. + if (!controller.signal.aborted) { + const dirPruned = this.queries.pruneOrphanDirectorySummaries(); + if (dirPruned.directorySummariesDeleted > 0) { + logDebug('Pruned orphan directory summaries', dirPruned); + } + } } // Phase-4: role classification. One short call per symbol;