From 6c54f8dccf26ed4bab7cdc8d219d205f7d8e0bbd Mon Sep 17 00:00:00 2001 From: aniongithub Date: Sat, 23 May 2026 20:06:10 -0700 Subject: [PATCH 1/8] feat(wiki): add active-use recents LRU MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A doubly-linked-list + map ring (O(1) touch/remove/rename) tracking pages the user or agent actively used — Create, Update, Get, Move (both ends), Delete, GetBacklinks — rather than what disk mtime says was last changed. Distinguishes 'intent' from 'sync churn' for the upcoming digest's recents signal. Touches fire only on the success path: a failed CreatePage on an existing page, or a GetPage on a missing path, does not pollute the ring. MovePage renames in place so a move shows up as one continuous use rather than dropping the old name and freshly inserting the new. DeletePage drops the entry — a deleted page in recents would mislead the agent. Capacity defaults to 20 (the plan's recents_size default); a later step will swap this for a config-driven value. Persistence lives in state.go (next steps); recentsLRU itself is storage-agnostic, exposing load/snapshot/takeDirty for a ticker to consume. Step 1 of the digest plan (mind-map/plans/digest). --- internal/wiki/pages.go | 38 +++++- internal/wiki/recents.go | 198 +++++++++++++++++++++++++++++ internal/wiki/recents_test.go | 230 ++++++++++++++++++++++++++++++++++ internal/wiki/wiki.go | 20 ++- 4 files changed, 478 insertions(+), 8 deletions(-) create mode 100644 internal/wiki/recents.go create mode 100644 internal/wiki/recents_test.go diff --git a/internal/wiki/pages.go b/internal/wiki/pages.go index 38291ec..9e4de30 100644 --- a/internal/wiki/pages.go +++ b/internal/wiki/pages.go @@ -50,6 +50,11 @@ func (w *Wiki) GetPage(ctx context.Context, pagePath string) (*Page, error) { slog.Warn("failed to get backlinks", slog.String("page", pagePath), slog.Any("error", err)) } + // LRU touch reflects that the agent actually saw this page. 
We + // only reach here on a successful row scan, so a typo'd path that + // hit the "page not found" branch above will not pollute recents. + w.recents.touch(pagePath) + return &Page{ Path: pagePath, Title: title, @@ -148,7 +153,11 @@ func (w *Wiki) CreatePage(ctx context.Context, pagePath string, content string) } slog.Info("page created", slog.String("page", pagePath)) - return w.indexPage(ctx, pagePath) + if err := w.indexPage(ctx, pagePath); err != nil { + return err + } + w.recents.touch(pagePath) + return nil } // UpdatePage replaces the content of an existing page. @@ -178,7 +187,11 @@ func (w *Wiki) UpdatePage(ctx context.Context, pagePath string, content string) } slog.Info("page updated", slog.String("page", pagePath)) - return w.indexPage(ctx, pagePath) + if err := w.indexPage(ctx, pagePath); err != nil { + return err + } + w.recents.touch(pagePath) + return nil } // DeletePage removes a page from the filesystem and index. @@ -204,7 +217,13 @@ func (w *Wiki) DeletePage(ctx context.Context, pagePath string) error { } slog.Info("page deleted", slog.String("page", pagePath)) - return w.removePageIndex(ctx, pagePath) + if err := w.removePageIndex(ctx, pagePath); err != nil { + return err + } + // The page is gone; leaving it in recents would point the agent + // at a 404. Drop the entry rather than promote it. + w.recents.remove(pagePath) + return nil } // ErrDestinationExists is returned by MovePage when the destination @@ -310,6 +329,10 @@ func (w *Wiki) MovePage(ctx context.Context, fromPath, toPath string, opts MoveO return fmt.Errorf("index new page: %w", err) } + // Treat a move as one continuous "active use" rather than dropping + // the old name and freshly inserting the new one. See recentsLRU.rename. 
+ w.recents.rename(from, to) + slog.Info("page moved", slog.String("from", from), slog.String("to", to), @@ -359,7 +382,14 @@ func (w *Wiki) GetBacklinks(ctx context.Context, pagePath string) ([]string, err return nil, err } - return w.getBacklinks(ctx, pagePath) + backlinks, err := w.getBacklinks(ctx, pagePath) + if err != nil { + return nil, err + } + // GetBacklinks is "I'm looking at this page's incoming links" — + // an active use of the target page, even if its body wasn't read. + w.recents.touch(pagePath) + return backlinks, nil } // Link is a single source→target edge between two pages. diff --git a/internal/wiki/recents.go b/internal/wiki/recents.go new file mode 100644 index 0000000..2e70226 --- /dev/null +++ b/internal/wiki/recents.go @@ -0,0 +1,198 @@ +package wiki + +import ( + "container/list" + "sync" +) + +// recentsLRU is a fixed-capacity, most-recently-used-first ring of page +// paths. It tracks pages the user or agent has *actively* touched — +// Create, Update, Get, Move (both ends), Delete, GetBacklinks — rather +// than what disk mtime says was changed most recently. The distinction +// matters when sync's copyToWiki bumps mtimes for files the agent never +// looked at; an mtime-based "recents" would surface those, an LRU +// reflects intent. +// +// The structure is a doubly-linked list plus a path->element index, so +// touch / remove / rename are all O(1). It is safe for concurrent use. +// +// Persistence (snapshot to SQLite on a slow ticker) lives in state.go +// and the ticker lives in the wiki lifecycle code; recentsLRU itself +// is intentionally storage-agnostic. +type recentsLRU struct { + mu sync.Mutex + cap int + // ll holds paths with the most recently used at the front. + ll *list.List + // idx maps path -> list element for O(1) promote/remove. + idx map[string]*list.Element + // dirty is true when the in-memory state has diverged from the last + // persisted snapshot. The persistence ticker reads + clears it. 
+	dirty bool
+}
+
+// newRecentsLRU constructs an empty LRU with the given capacity.
+// A non-positive cap is clamped to the default (20).
+func newRecentsLRU(cap int) *recentsLRU {
+	if cap <= 0 {
+		cap = 20
+	}
+	return &recentsLRU{
+		cap: cap,
+		ll:  list.New(),
+		idx: make(map[string]*list.Element),
+	}
+}
+
+// touch records that the given page was just used. If the page is
+// already in the ring it's promoted to the front; otherwise it's
+// inserted at the front and the oldest entry is evicted if the ring
+// is at capacity.
+//
+// Empty paths are ignored — callers don't need to guard the call site.
+func (r *recentsLRU) touch(path string) {
+	if path == "" {
+		return
+	}
+	r.mu.Lock()
+	defer r.mu.Unlock()
+
+	if elem, ok := r.idx[path]; ok {
+		r.ll.MoveToFront(elem)
+		r.dirty = true
+		return
+	}
+	elem := r.ll.PushFront(path)
+	r.idx[path] = elem
+	if r.ll.Len() > r.cap {
+		oldest := r.ll.Back()
+		if oldest != nil {
+			r.ll.Remove(oldest)
+			delete(r.idx, oldest.Value.(string))
+		}
+	}
+	r.dirty = true
+}
+
+// remove drops a path from the ring. Called when a page is deleted;
+// the path is gone so including it in recents would mislead the agent.
+// No-op if the path is empty or isn't tracked; the dirty flag is only
+// set when an entry was actually removed.
+func (r *recentsLRU) remove(path string) {
+	if path == "" {
+		return
+	}
+	r.mu.Lock()
+	defer r.mu.Unlock()
+
+	elem, ok := r.idx[path]
+	if !ok {
+		return
+	}
+	r.ll.Remove(elem)
+	delete(r.idx, path)
+	r.dirty = true
+}
+
+// rename relabels an entry in place and promotes it to the front of
+// the ring. Called on MovePage so the move shows up as one touch (at
+// the new name) rather than two (old name drops out, new name is fresh).
+//
+// If `from` isn't tracked, behaves as touch(to). If `to` is already
+// tracked, the older `from` entry is removed and `to` is promoted —
+// this is the same as if the agent had used `to` directly.
+func (r *recentsLRU) rename(from, to string) {
+	// An empty destination can never name a page. touch, remove and
+	// load all refuse to track "", so ignore the call here as well
+	// instead of relabelling a tracked entry to the empty path.
+	if to == "" {
+		return
+	}
+	if from == to {
+		// Not really a move; count it as a plain use of the page.
+		r.touch(to)
+		return
+	}
+	r.mu.Lock()
+	defer r.mu.Unlock()
+
+	fromElem, hasFrom := r.idx[from]
+	toElem, hasTo := r.idx[to]
+
+	switch {
+	case hasFrom && hasTo:
+		// Both present: drop `from`, promote `to`.
+		r.ll.Remove(fromElem)
+		delete(r.idx, from)
+		r.ll.MoveToFront(toElem)
+	case hasFrom:
+		// Reuse the existing element under the new name (no remove +
+		// insert), then promote it: the move is an active use of `to`.
+		fromElem.Value = to
+		delete(r.idx, from)
+		r.idx[to] = fromElem
+		r.ll.MoveToFront(fromElem)
+	case hasTo:
+		// Only the destination is tracked: same as touching it.
+		r.ll.MoveToFront(toElem)
+	default:
+		// Neither tracked: insert `to` fresh, evicting the oldest
+		// entry if that pushes the ring over capacity.
+		elem := r.ll.PushFront(to)
+		r.idx[to] = elem
+		if r.ll.Len() > r.cap {
+			oldest := r.ll.Back()
+			if oldest != nil {
+				r.ll.Remove(oldest)
+				delete(r.idx, oldest.Value.(string))
+			}
+		}
+	}
+	r.dirty = true
+}
+
+// snapshot returns the tracked paths, most recent first. The returned
+// slice is freshly allocated on every call, so it is owned by the
+// caller and safe to mutate.
+func (r *recentsLRU) snapshot() []string {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+
+	out := make([]string, 0, r.ll.Len())
+	for e := r.ll.Front(); e != nil; e = e.Next() {
+		out = append(out, e.Value.(string))
+	}
+	return out
+}
+
+// load replaces the ring's contents with the given paths (treated as
+// most-recent-first). Used by the persistence layer on Wiki.Open to
+// restore the last snapshot. Empty and duplicate paths are skipped and
+// loading stops once the ring holds cap entries. Clears the dirty
+// flag — what we just loaded matches what's on disk.
+func (r *recentsLRU) load(paths []string) {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+
+	r.ll.Init()
+	r.idx = make(map[string]*list.Element, len(paths))
+	for _, p := range paths {
+		if p == "" {
+			continue
+		}
+		if _, dup := r.idx[p]; dup {
+			continue
+		}
+		// PushBack keeps the input order: paths[0] stays at the front.
+		elem := r.ll.PushBack(p)
+		r.idx[p] = elem
+		if r.ll.Len() >= r.cap {
+			break
+		}
+	}
+	r.dirty = false
+}
+
+// takeDirty returns whether the ring has unsaved changes and clears
+// the flag in the same operation. The persistence ticker uses this to
+// skip writes when nothing has changed.
+func (r *recentsLRU) takeDirty() bool { + r.mu.Lock() + defer r.mu.Unlock() + was := r.dirty + r.dirty = false + return was +} + +// len returns the number of tracked paths. Test helper. +func (r *recentsLRU) len() int { + r.mu.Lock() + defer r.mu.Unlock() + return r.ll.Len() +} diff --git a/internal/wiki/recents_test.go b/internal/wiki/recents_test.go new file mode 100644 index 0000000..e016a9e --- /dev/null +++ b/internal/wiki/recents_test.go @@ -0,0 +1,230 @@ +package wiki + +import ( + "context" + "reflect" + "testing" +) + +func TestRecentsLRU_TouchAndOrder(t *testing.T) { + r := newRecentsLRU(3) + + r.touch("a") + r.touch("b") + r.touch("c") + if got, want := r.snapshot(), []string{"c", "b", "a"}; !reflect.DeepEqual(got, want) { + t.Fatalf("after initial touches: got %v, want %v", got, want) + } + + // Re-touching an existing entry promotes it. + r.touch("a") + if got, want := r.snapshot(), []string{"a", "c", "b"}; !reflect.DeepEqual(got, want) { + t.Fatalf("after promote: got %v, want %v", got, want) + } +} + +func TestRecentsLRU_Eviction(t *testing.T) { + r := newRecentsLRU(2) + r.touch("a") + r.touch("b") + r.touch("c") // evicts "a" + + got := r.snapshot() + if len(got) != 2 || got[0] != "c" || got[1] != "b" { + t.Fatalf("expected [c b], got %v", got) + } +} + +func TestRecentsLRU_EmptyTouchIgnored(t *testing.T) { + r := newRecentsLRU(5) + r.touch("") + if r.len() != 0 { + t.Fatalf("empty path should not be tracked, len=%d", r.len()) + } +} + +func TestRecentsLRU_Remove(t *testing.T) { + r := newRecentsLRU(5) + r.touch("a") + r.touch("b") + r.touch("c") + + r.remove("b") + if got, want := r.snapshot(), []string{"c", "a"}; !reflect.DeepEqual(got, want) { + t.Fatalf("after remove b: got %v, want %v", got, want) + } + + // Remove of missing path is a no-op. 
+ r.remove("zzz") + if got, want := r.snapshot(), []string{"c", "a"}; !reflect.DeepEqual(got, want) { + t.Fatalf("noop remove changed state: got %v, want %v", got, want) + } +} + +func TestRecentsLRU_RenameInPlace(t *testing.T) { + r := newRecentsLRU(5) + r.touch("a") + r.touch("b") + r.touch("c") // order: c b a + + // Rename "b" -> "x": should land at the front (promoted) per the + // plan's "treat a move as active use of the new name" rule. + r.rename("b", "x") + if got, want := r.snapshot(), []string{"x", "c", "a"}; !reflect.DeepEqual(got, want) { + t.Fatalf("after rename b->x: got %v, want %v", got, want) + } +} + +func TestRecentsLRU_RenameDestExists(t *testing.T) { + r := newRecentsLRU(5) + r.touch("a") + r.touch("b") + r.touch("c") // c b a + + // Rename "a" -> "c" (overwrite move): the old "a" entry should + // drop out, "c" should be promoted. + r.rename("a", "c") + if got, want := r.snapshot(), []string{"c", "b"}; !reflect.DeepEqual(got, want) { + t.Fatalf("after rename a->c (dest exists): got %v, want %v", got, want) + } +} + +func TestRecentsLRU_RenameFromMissing(t *testing.T) { + r := newRecentsLRU(5) + r.touch("a") + // Rename of an untracked source: equivalent to touching the dest. 
+ r.rename("zzz", "b") + if got, want := r.snapshot(), []string{"b", "a"}; !reflect.DeepEqual(got, want) { + t.Fatalf("after rename zzz->b: got %v, want %v", got, want) + } +} + +func TestRecentsLRU_LoadSnapshotRoundtrip(t *testing.T) { + r := newRecentsLRU(5) + r.load([]string{"a", "b", "c"}) + + if got, want := r.snapshot(), []string{"a", "b", "c"}; !reflect.DeepEqual(got, want) { + t.Fatalf("after load: got %v, want %v", got, want) + } + if r.takeDirty() { + t.Fatalf("load should clear dirty flag") + } +} + +func TestRecentsLRU_LoadRespectsCapacity(t *testing.T) { + r := newRecentsLRU(2) + r.load([]string{"a", "b", "c", "d"}) + if r.len() != 2 { + t.Fatalf("load should stop at cap; len=%d", r.len()) + } +} + +func TestRecentsLRU_DirtyTracking(t *testing.T) { + r := newRecentsLRU(3) + if r.takeDirty() { + t.Fatalf("fresh ring should not be dirty") + } + r.touch("a") + if !r.takeDirty() { + t.Fatalf("touch should mark dirty") + } + if r.takeDirty() { + t.Fatalf("takeDirty should clear the flag") + } +} + +// --- integration: touches fire on the right Wiki ops --- + +func TestWiki_LRUIntegration(t *testing.T) { + w, _ := testWiki(t) + defer w.Close() + ctx := context.Background() + + // testWiki() seeded the wiki, and Reindex on Open() does not touch + // the LRU (indexing is plumbing, not "user used the page"). + if got := w.recents.snapshot(); len(got) != 0 { + t.Fatalf("LRU should be empty after Open; got %v", got) + } + + // GetPage touches. + if _, err := w.GetPage(ctx, "index"); err != nil { + t.Fatalf("GetPage: %v", err) + } + if got := w.recents.snapshot(); !reflect.DeepEqual(got, []string{"index"}) { + t.Fatalf("after GetPage: got %v", got) + } + + // Failed GetPage does NOT touch. + if _, err := w.GetPage(ctx, "does/not/exist"); err == nil { + t.Fatalf("expected error on missing page") + } + if got := w.recents.snapshot(); !reflect.DeepEqual(got, []string{"index"}) { + t.Fatalf("failed GetPage polluted LRU: %v", got) + } + + // CreatePage touches. 
+ if err := w.CreatePage(ctx, "scratch", "# Scratch\n"); err != nil { + t.Fatalf("CreatePage: %v", err) + } + if got := w.recents.snapshot(); got[0] != "scratch" { + t.Fatalf("CreatePage should put scratch at front: %v", got) + } + + // UpdatePage touches. + if err := w.UpdatePage(ctx, "index", "# Welcome (updated)\n"); err != nil { + t.Fatalf("UpdatePage: %v", err) + } + if got := w.recents.snapshot(); got[0] != "index" { + t.Fatalf("UpdatePage should promote index: %v", got) + } + + // GetBacklinks touches. + if _, err := w.GetBacklinks(ctx, "projects/mind-map"); err != nil { + t.Fatalf("GetBacklinks: %v", err) + } + if got := w.recents.snapshot(); got[0] != "projects/mind-map" { + t.Fatalf("GetBacklinks should promote target: %v", got) + } + + // MovePage renames in the LRU. + if err := w.MovePage(ctx, "scratch", "notes/scratch", MoveOptions{}); err != nil { + t.Fatalf("MovePage: %v", err) + } + snap := w.recents.snapshot() + for _, p := range snap { + if p == "scratch" { + t.Fatalf("old name still in LRU after move: %v", snap) + } + } + if snap[0] != "notes/scratch" { + t.Fatalf("move dest should be at front: %v", snap) + } + + // DeletePage removes. + if err := w.DeletePage(ctx, "notes/scratch"); err != nil { + t.Fatalf("DeletePage: %v", err) + } + for _, p := range w.recents.snapshot() { + if p == "notes/scratch" { + t.Fatalf("deleted page still in LRU: %v", w.recents.snapshot()) + } + } +} + +// CreatePage that fails (page already exists) must NOT touch. +func TestWiki_LRUNoTouchOnFailedCreate(t *testing.T) { + w, _ := testWiki(t) + defer w.Close() + ctx := context.Background() + + // Drain the LRU to a known state. + w.recents.load(nil) + + // "index" already exists in testWiki. 
+ if err := w.CreatePage(ctx, "index", "# dup\n"); err == nil { + t.Fatalf("expected CreatePage to fail on existing page") + } + if got := w.recents.snapshot(); len(got) != 0 { + t.Fatalf("failed CreatePage polluted LRU: %v", got) + } +} diff --git a/internal/wiki/wiki.go b/internal/wiki/wiki.go index 84b4778..967e8f4 100644 --- a/internal/wiki/wiki.go +++ b/internal/wiki/wiki.go @@ -52,9 +52,13 @@ type WikiContext struct { // Wiki is the core engine. Create one with Open(). type Wiki struct { - root string // absolute path to wiki directory - db *sql.DB // SQLite database with FTS5 - sessionID string // unique ID for this process, used for page locks + root string // absolute path to wiki directory + db *sql.DB // SQLite database with FTS5 + sessionID string // unique ID for this process, used for page locks + // recents tracks pages the user/agent has actively touched. See + // recents.go for the rationale (intent vs. disk mtime). Persistence + // to SQLite is layered on in state.go; here it just lives in memory. + recents *recentsLRU } // Open opens (or creates) a wiki rooted at the given directory. @@ -79,7 +83,15 @@ func Open(root string) (*Wiki, error) { } sessionID := fmt.Sprintf("pid-%d-%d", os.Getpid(), time.Now().UnixNano()) - w := &Wiki{root: absRoot, db: db, sessionID: sessionID} + w := &Wiki{ + root: absRoot, + db: db, + sessionID: sessionID, + // Capacity 20 matches the plan default. Step 4 will swap this + // for a config-driven value (digest.recents_size); the default + // keeps existing callers unaffected. + recents: newRecentsLRU(20), + } if err := w.initSchema(); err != nil { db.Close() return nil, fmt.Errorf("init schema: %w", err) From f9a7213e149f7f82eded6814d78d72378663df33 Mon Sep 17 00:00:00 2001 From: aniongithub Date: Sat, 23 May 2026 20:15:24 -0700 Subject: [PATCH 2/8] feat(wiki): add word/phrase cloud builder A deterministic, frequency-based summary of 'what is this wiki about'. 
One pass over pages.body produces unigram + bigram counts, filters
through a built-in English stopword list (plus user extras), and selects
top K with alphabetical tie-break for stable output across rebuilds.

Tokenizer design notes:

- Custom Go pass over pages.body rather than reaching into FTS5's C
  tokenizer. modernc.org/sqlite doesn't cleanly expose token sequences
  to Go, and reusing FTS5 would lose bigram ordering or drag in
  CGO-adjacent complexity. unicode61-equivalent for our purposes:
  lowercase, non-alnum split, with hyphens/underscores preserved
  mid-token so 'mind-map' and 'page_count' survive as one token each.
- Wikilink brackets stripped so [[projects/mind-map]] contributes its
  target words to the cloud naturally.
- Code fences and inline code are NOT stripped: identifiers in code are
  real 'about' signal in a technical wiki; dropping them would flatten
  the cloud.
- Bigrams require both endpoints to pass the stopword filter (the
  option the plan leans toward for open question #2): 'the wiki' must
  not appear just because 'the' is high-frequency.
- Single-char and all-digit tokens are dropped as low-information
  noise, in the same check as the stopword map lookup.

A single-slot cloudCache exposes Set/Get with defensive copies so the
upcoming 5-minute rebuild ticker (Step 6) can swap clouds without
readers racing on slice aliasing.

Frequency, not TF-IDF, for v1 (the option the plan leans toward for
open question #1). Easy swap later if the cloud reads noisy in practice.

Step 2 of the digest plan (mind-map/plans/digest).
--- internal/wiki/cloud.go | 281 ++++++++++++++++++++++++++++++++++++ internal/wiki/cloud_test.go | 212 +++++++++++++++++++++++++++ 2 files changed, 493 insertions(+) create mode 100644 internal/wiki/cloud.go create mode 100644 internal/wiki/cloud_test.go diff --git a/internal/wiki/cloud.go b/internal/wiki/cloud.go new file mode 100644 index 0000000..a6f362e --- /dev/null +++ b/internal/wiki/cloud.go @@ -0,0 +1,281 @@ +package wiki + +import ( + "context" + "sort" + "strings" + "sync" + "unicode" +) + +// CloudTerm is a single entry in the rendered word/phrase cloud. +type CloudTerm struct { + Term string `json:"term"` + Count int `json:"count"` +} + +// defaultStopwords is the built-in English stopword list applied to +// every wiki's cloud. Users add domain-specific extras via config +// (digest.stopwords_extra) which are merged on top. +// +// Kept intentionally conservative: only true function words and the +// most generic English filler. Domain terms (even common ones like +// "wiki" or "page") are left to the per-wiki frequency signal to +// dampen — a wiki that's literally *about* wikis should be allowed +// to say so. 
+var defaultStopwords = map[string]struct{}{
+	"a": {}, "an": {}, "and": {}, "are": {}, "as": {}, "at": {},
+	"be": {}, "but": {}, "by": {}, "can": {},
+	"do": {}, "does": {}, "for": {}, "from": {},
+	"had": {}, "has": {}, "have": {}, "he": {}, "her": {}, "here": {},
+	"hers": {}, "him": {}, "his": {}, "how": {},
+	"i": {}, "if": {}, "in": {}, "into": {}, "is": {}, "it": {}, "its": {},
+	"just": {}, "may": {}, "might": {}, "must": {},
+	"no": {}, "not": {}, "now": {}, "of": {}, "off": {}, "on": {}, "one": {},
+	"only": {}, "or": {}, "other": {}, "our": {}, "ours": {}, "out": {},
+	"over": {}, "own": {},
+	"s": {}, "she": {}, "should": {}, "so": {}, "some": {}, "such": {},
+	"t": {}, "than": {}, "that": {}, "the": {}, "their": {}, "them": {},
+	"then": {}, "there": {}, "these": {}, "they": {}, "this": {}, "those": {},
+	"to": {}, "too": {},
+	"under": {}, "until": {}, "up": {}, "upon": {},
+	"was": {}, "we": {}, "were": {}, "what": {}, "when": {}, "where": {},
+	"which": {}, "while": {}, "who": {}, "whom": {}, "why": {}, "will": {},
+	"with": {}, "would": {},
+	"you": {}, "your": {}, "yours": {},
+}
+
+// cloudBuilder accumulates unigram and bigram counts across pages.
+// It is reset and re-run from scratch on each rebuild; the plan's
+// 5-minute ticker (Step 6) calls Build() and stores the result.
+type cloudBuilder struct {
+	stopwords map[string]struct{}
+}
+
+// newCloudBuilder constructs a builder with the default stopword set
+// merged with the user's extras. Extras are case-folded to match the
+// tokenizer's lowercase output.
+func newCloudBuilder(extra []string) *cloudBuilder {
+	sw := make(map[string]struct{}, len(defaultStopwords)+len(extra))
+	for k := range defaultStopwords {
+		sw[k] = struct{}{}
+	}
+	for _, w := range extra {
+		w = strings.ToLower(strings.TrimSpace(w))
+		if w != "" {
+			sw[w] = struct{}{}
+		}
+	}
+	return &cloudBuilder{stopwords: sw}
+}
+
+// isStopword reports whether t is filtered out of the cloud. In
+// addition to the configured stopword set, single-character tokens
+// and pure-numeric tokens are dropped: neither carries useful "about"
+// signal and both massively inflate the long tail.
+//
+// "Single-character" is measured in runes, not bytes, so a lone
+// non-ASCII letter such as "я" or "é" is dropped exactly like "x".
+// The empty string is reported as a stopword too.
+func (b *cloudBuilder) isStopword(t string) bool {
+	if _, ok := b.stopwords[t]; ok {
+		return true
+	}
+	runes, allDigit := 0, true
+	for _, r := range t {
+		runes++
+		if !unicode.IsDigit(r) {
+			allDigit = false
+		}
+		// Two or more runes with at least one non-digit: keep it.
+		if runes >= 2 && !allDigit {
+			return false
+		}
+	}
+	// Fewer than two runes, or every rune was a digit.
+	return true
+}
+
+// tokenize splits a body into lowercase word tokens. The rules are
+// deliberately simple and deterministic:
+//
+//   - Lowercase everything.
+//   - A token is a maximal run of letters / digits / underscores /
+//     hyphens. Hyphens and underscores are kept because identifiers
+//     like "mind-map" or "page_count" are exactly the kinds of terms
+//     we want to surface intact.
+//   - Wikilink markup ([[...]]) is stripped but the target text
+//     inside is tokenized normally — a link to [[projects/mind-map]]
+//     contributes "projects" and "mind-map" to the page's tokens.
+//   - Markdown punctuation (#, *, `, etc.) becomes a separator.
+//     Hyphens and underscores are not separators, but leading and
+//     trailing ones are trimmed, so "_emphasis_" yields "emphasis".
+//   - Code fences and inline code are NOT stripped: code identifiers
+//     are part of what a technical wiki is about, and dropping them
+//     flattens the cloud.
+//
+// A run made only of hyphens/underscores (e.g. "---") trims down to an
+// empty token, which is returned as "" rather than removed.
+func (b *cloudBuilder) tokenize(body string) []string {
+	// Cheaply strip the wikilink delimiters so [[a/b]] surfaces both
+	// "a" and "b" without us having to special-case the parser. The
+	// pipe form [[display|target]] is left as-is; the tokenizer's
+	// non-alnum-split will handle both halves.
+	body = strings.ReplaceAll(body, "[[", " ")
+	body = strings.ReplaceAll(body, "]]", " ")
+
+	tokens := make([]string, 0, len(body)/6)
+	var cur strings.Builder
+	flush := func() {
+		if cur.Len() > 0 {
+			tokens = append(tokens, cur.String())
+			cur.Reset()
+		}
+	}
+	for _, r := range body {
+		switch {
+		case unicode.IsLetter(r) || unicode.IsDigit(r):
+			cur.WriteRune(unicode.ToLower(r))
+		case r == '-' || r == '_':
+			// Always appended to the current run, wherever it sits.
+			// Leading/trailing ones are trimmed below, so only the
+			// ones joining two alnum runs survive in the token.
+			cur.WriteRune(r)
+		default:
+			flush()
+		}
+	}
+	flush()
+
+	// Trim leading/trailing hyphens and underscores (e.g. "--foo")
+	// that survived the above without splitting cleanly.
+	for i, t := range tokens {
+		tokens[i] = strings.Trim(t, "-_")
+	}
+	return tokens
+}
+
+// addPage folds one page's tokens into the running unigram and bigram
+// counts.
+//
+// Bigrams require BOTH ends to pass the stopword filter (plan open
+// question #2 lean): otherwise common phrases like "the wiki" would
+// dominate purely because "the" is high-frequency, even though the
+// pair is no more informative than "wiki" alone.
+//
+// An empty token (see tokenize) resets the previous-token state, so
+// no bigram is formed across it.
+func (b *cloudBuilder) addPage(body string, unigrams, bigrams map[string]int) {
+	tokens := b.tokenize(body)
+
+	var prev string
+	for _, t := range tokens {
+		if t == "" {
+			prev = ""
+			continue
+		}
+		stop := b.isStopword(t)
+		if !stop {
+			unigrams[t]++
+		}
+		if prev != "" && !stop && !b.isStopword(prev) {
+			bigrams[prev+" "+t]++
+		}
+		prev = t
+	}
+}
+
+// topK selects the K highest-count entries from the given map. Ties
+// break alphabetically so the output is stable across rebuilds —
+// otherwise a digest cache invalidation could shuffle the cloud for
+// no reason a user would understand.
+func topK(counts map[string]int, k int) []CloudTerm {
+	// Nothing requested or nothing to rank: return nil, not an empty slice.
+	if k <= 0 || len(counts) == 0 {
+		return nil
+	}
+	terms := make([]CloudTerm, 0, len(counts))
+	for t, n := range counts {
+		terms = append(terms, CloudTerm{Term: t, Count: n})
+	}
+	// Highest count first; equal counts fall back to alphabetical order
+	// so map iteration order never leaks into the result.
+	sort.Slice(terms, func(i, j int) bool {
+		if terms[i].Count != terms[j].Count {
+			return terms[i].Count > terms[j].Count
+		}
+		return terms[i].Term < terms[j].Term
+	})
+	if len(terms) > k {
+		terms = terms[:k]
+	}
+	return terms
+}
+
+// buildCloud computes the top-K most frequent terms across all page
+// bodies. The result mixes unigrams and bigrams: bigrams are scored
+// by their own frequency (no boost), so a phrase only beats a single
+// word when it genuinely occurs more often.
+//
+// It returns an error if ctx is cancelled, if the query fails, or if
+// any row cannot be scanned; no partial cloud is returned in that case.
+//
+// Caller owns the goroutine and the slot it's stored in; this function
+// just does the work. Step 6 wires it to the 5-minute ticker.
+func (w *Wiki) buildCloud(ctx context.Context, k int, stopwordsExtra []string) ([]CloudTerm, error) {
+	if err := ctx.Err(); err != nil {
+		return nil, err
+	}
+
+	rows, err := w.db.QueryContext(ctx, "SELECT body FROM pages")
+	if err != nil {
+		return nil, err
+	}
+	defer rows.Close()
+
+	b := newCloudBuilder(stopwordsExtra)
+	unigrams := make(map[string]int)
+	bigrams := make(map[string]int)
+
+	for rows.Next() {
+		// Re-check between rows so a cancelled rebuild stops promptly.
+		if err := ctx.Err(); err != nil {
+			return nil, err
+		}
+		var body string
+		if err := rows.Scan(&body); err != nil {
+			// Surface the failure instead of silently skipping the
+			// page: a cloud built from a partial scan would look
+			// valid while quietly missing content.
+			return nil, err
+		}
+		b.addPage(body, unigrams, bigrams)
+	}
+	if err := rows.Err(); err != nil {
+		return nil, err
+	}
+
+	// Merge the two count maps before selecting top-K. This lets a
+	// strong bigram outrank a weak unigram, and vice versa, on a
+	// single global scale. Bigram keys contain a space and unigram
+	// keys never do, so the two key sets cannot collide.
+	merged := make(map[string]int, len(unigrams)+len(bigrams))
+	for t, n := range unigrams {
+		merged[t] = n
+	}
+	for t, n := range bigrams {
+		merged[t] = n
+	}
+	return topK(merged, k), nil
+}
+
+// cloudCache is a single-slot cache for the rebuilt cloud. 
The +// 5-minute ticker (Step 6) calls Set; readers (digest renderer) call +// Get. A read returns whatever was last set even if the ticker is +// behind — the digest's job is "good orientation," not "perfectly +// fresh stats." +type cloudCache struct { + mu sync.RWMutex + terms []CloudTerm + // set is true once Set has been called at least once. Readers + // distinguish "no cloud yet" (cold start) from "cloud is empty" + // (truly empty wiki) by checking set. + set bool +} + +func (c *cloudCache) Set(terms []CloudTerm) { + c.mu.Lock() + defer c.mu.Unlock() + // Defensive copy: caller may continue to mutate the slice. + cp := make([]CloudTerm, len(terms)) + copy(cp, terms) + c.terms = cp + c.set = true +} + +// Get returns a copy of the current cloud and whether one has been +// computed yet. +func (c *cloudCache) Get() ([]CloudTerm, bool) { + c.mu.RLock() + defer c.mu.RUnlock() + if !c.set { + return nil, false + } + cp := make([]CloudTerm, len(c.terms)) + copy(cp, c.terms) + return cp, true +} diff --git a/internal/wiki/cloud_test.go b/internal/wiki/cloud_test.go new file mode 100644 index 0000000..208ccae --- /dev/null +++ b/internal/wiki/cloud_test.go @@ -0,0 +1,212 @@ +package wiki + +import ( + "context" + "reflect" + "strings" + "testing" +) + +func TestTokenize_Basic(t *testing.T) { + b := newCloudBuilder(nil) + got := b.tokenize("Hello, world! 
This is mind-map.") + want := []string{"hello", "world", "this", "is", "mind-map"} + if !reflect.DeepEqual(got, want) { + t.Fatalf("tokenize: got %v, want %v", got, want) + } +} + +func TestTokenize_KeepsHyphensAndUnderscores(t *testing.T) { + b := newCloudBuilder(nil) + got := b.tokenize("page_count and mind-map are tokens") + if !contains(got, "page_count") { + t.Fatalf("expected page_count intact: %v", got) + } + if !contains(got, "mind-map") { + t.Fatalf("expected mind-map intact: %v", got) + } +} + +func TestTokenize_StripsWikilinkBrackets(t *testing.T) { + b := newCloudBuilder(nil) + got := b.tokenize("see [[projects/mind-map]] for details") + // '/' is a separator, so we get the segments individually. + if !contains(got, "projects") || !contains(got, "mind-map") { + t.Fatalf("wikilink target words missing: %v", got) + } + for _, tok := range got { + if strings.ContainsAny(tok, "[]") { + t.Fatalf("bracket leaked into token %q", tok) + } + } +} + +func TestTokenize_LowercasesUnicode(t *testing.T) { + b := newCloudBuilder(nil) + got := b.tokenize("Привет МИР") + want := []string{"привет", "мир"} + if !reflect.DeepEqual(got, want) { + t.Fatalf("tokenize unicode: got %v, want %v", got, want) + } +} + +func TestIsStopword(t *testing.T) { + b := newCloudBuilder([]string{"TODO"}) + cases := map[string]bool{ + "the": true, // default + "todo": true, // user-added, case-folded + "wiki": false, // domain term, not filtered + "a": true, // length<2 short-circuit (and in defaults) + "x": true, // length<2 + "42": true, // all-digit + "v1": false, // alnum mix, keep + "mind": false, + } + for tok, want := range cases { + if got := b.isStopword(tok); got != want { + t.Errorf("isStopword(%q) = %v, want %v", tok, got, want) + } + } +} + +func TestAddPage_UnigramAndBigramCounts(t *testing.T) { + b := newCloudBuilder(nil) + uni := map[string]int{} + bi := map[string]int{} + b.addPage("wiki engine. 
wiki engine.", uni, bi) + + if uni["wiki"] != 2 || uni["engine"] != 2 { + t.Fatalf("unigram counts wrong: %v", uni) + } + if bi["wiki engine"] != 2 { + t.Fatalf("bigram count wrong: %v", bi) + } + // "engine wiki" crosses a sentence boundary but our tokenizer + // treats '.' as a separator, not a sentence-aware split. The + // resulting bigram across "." is intentional — we don't have + // sentence info and a bigram across punctuation is still a + // real adjacent-token pair in the text. + if bi["engine wiki"] != 1 { + t.Fatalf("expected one engine->wiki bigram: %v", bi) + } +} + +func TestAddPage_StopwordsFilterBothBigramEnds(t *testing.T) { + b := newCloudBuilder(nil) + uni := map[string]int{} + bi := map[string]int{} + // "the wiki" → unigram "wiki" counts (the is stopword), + // but bigram "the wiki" must NOT be recorded. + b.addPage("the wiki is here. the wiki is here.", uni, bi) + + if _, ok := bi["the wiki"]; ok { + t.Fatalf("stopword-led bigram leaked: %v", bi) + } + if _, ok := bi["wiki is"]; ok { + t.Fatalf("stopword-tailed bigram leaked: %v", bi) + } + if uni["wiki"] != 2 { + t.Fatalf("unigram counts off: %v", uni) + } +} + +func TestTopK_OrderingAndTieBreak(t *testing.T) { + counts := map[string]int{ + "banana": 5, + "apple": 5, + "cherry": 3, + "date": 1, + } + got := topK(counts, 3) + want := []CloudTerm{ + {Term: "apple", Count: 5}, + {Term: "banana", Count: 5}, + {Term: "cherry", Count: 3}, + } + if !reflect.DeepEqual(got, want) { + t.Fatalf("topK: got %v, want %v", got, want) + } +} + +func TestTopK_Empty(t *testing.T) { + if got := topK(nil, 5); got != nil { + t.Fatalf("nil input should return nil, got %v", got) + } + if got := topK(map[string]int{"a": 1}, 0); got != nil { + t.Fatalf("k=0 should return nil, got %v", got) + } +} + +func TestBuildCloud_EndToEnd(t *testing.T) { + w, _ := testWiki(t) + defer w.Close() + ctx := context.Background() + + // Seed extra content that should dominate the cloud. 
+ if err := w.CreatePage(ctx, "topics/sqlite", + "# SQLite\n\nSQLite is a database. SQLite is fast. SQLite is small.\n"); err != nil { + t.Fatalf("seed: %v", err) + } + + terms, err := w.buildCloud(ctx, 10, nil) + if err != nil { + t.Fatalf("buildCloud: %v", err) + } + if len(terms) == 0 { + t.Fatalf("expected non-empty cloud") + } + + // "sqlite" should be the top unigram now (4+ occurrences across pages). + found := false + for _, term := range terms { + if term.Term == "sqlite" { + found = true + if term.Count < 3 { + t.Errorf("sqlite count surprisingly low: %d", term.Count) + } + } + } + if !found { + t.Fatalf("sqlite missing from top-10: %v", terms) + } + + // No stopwords leaked. + for _, term := range terms { + for _, sw := range []string{"the", "is", "a", "and"} { + if term.Term == sw { + t.Errorf("stopword %q in cloud", sw) + } + } + } +} + +func TestCloudCache_RoundTrip(t *testing.T) { + c := &cloudCache{} + if got, ok := c.Get(); ok { + t.Fatalf("uninitialized cache should report not-set; got %v", got) + } + c.Set([]CloudTerm{{Term: "x", Count: 1}}) + got, ok := c.Get() + if !ok { + t.Fatalf("after Set, Get should report set") + } + if !reflect.DeepEqual(got, []CloudTerm{{Term: "x", Count: 1}}) { + t.Fatalf("roundtrip mismatch: %v", got) + } + + // Mutating the returned slice must not affect the cache. 
+ got[0].Term = "MUTATED" + again, _ := c.Get() + if again[0].Term != "x" { + t.Fatalf("cache leaked internal state: %v", again) + } +} + +func contains(ss []string, s string) bool { + for _, x := range ss { + if x == s { + return true + } + } + return false +} From 2480dd68f872e28b0bf27c1fb47d0f578f51f725 Mon Sep 17 00:00:00 2001 From: aniongithub Date: Sat, 23 May 2026 20:28:06 -0700 Subject: [PATCH 3/8] feat(wiki): assemble and render the digest MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Digest() returns a structured Digest{PageCount, Cloud, Recents, Areas, Markdown} — the typed fields drive the WebUI / HTTP JSON, the markdown is what an LLM consumes in a per-conversation orientation prompt. Shape matches the example in the plan: This wiki contains N pages across M areas. About: term1, term2, term3, … ## Areas - foo (45) — foo/index: "Foo Area" - bar (12) - … ## Recently active - path/one - … Full skill: SKILL.md. Use `get_wiki_digest` for the live version. Trim discipline when over the soft cap (default 4096 bytes): drop recents from the tail first, then cloud, never areas. Areas are the smallest section and the most structurally important — losing them means losing the map of the wiki. Footer hint is also preserved. Caching is version-keyed: cloudCache and recentsLRU each expose a monotonic counter; digestCache stores (cloudVer, recentsSeq, pageCount) alongside the cached *Digest and rebuilds on any mismatch. pageCount is part of the key because pure content edits that don't touch the LRU still change the header sentence. CRUD operations automatically bust the cache through their existing LRU touches, so callers don't need to invalidate explicitly. Area summaries are driven by the indexed `pages` table, not by filesystem listing — the source of truth for the digest is what's queryable, not what's on disk. Flat-rooted pages (no slash) are ignored: a top-level page is not an area. 
Also: hook Reindex Phase 4 into recents.remove() so pages that vanish via raw-filesystem delete + reindex (common after `git pull` in sync) don't linger in the LRU as 404 candidates. With this hook, the renderer can trust the LRU as-is — no filter, no purge — and the LRU stays consistent with `pages` at all times. Step 3 of the digest plan (mind-map/plans/digest). --- internal/wiki/cloud.go | 13 ++ internal/wiki/digest.go | 296 ++++++++++++++++++++++++++++++++++ internal/wiki/digest_test.go | 297 +++++++++++++++++++++++++++++++++++ internal/wiki/index.go | 7 + internal/wiki/recents.go | 22 +++ internal/wiki/wiki.go | 8 + 6 files changed, 643 insertions(+) create mode 100644 internal/wiki/digest.go create mode 100644 internal/wiki/digest_test.go diff --git a/internal/wiki/cloud.go b/internal/wiki/cloud.go index a6f362e..3075ade 100644 --- a/internal/wiki/cloud.go +++ b/internal/wiki/cloud.go @@ -255,6 +255,10 @@ type cloudCache struct { // distinguish "no cloud yet" (cold start) from "cloud is empty" // (truly empty wiki) by checking set. set bool + // version is bumped on each Set. The digest cache uses it as a + // change signal so it can invalidate rendered output without + // re-comparing slices. + version uint64 } func (c *cloudCache) Set(terms []CloudTerm) { @@ -265,6 +269,7 @@ func (c *cloudCache) Set(terms []CloudTerm) { copy(cp, terms) c.terms = cp c.set = true + c.version++ } // Get returns a copy of the current cloud and whether one has been @@ -279,3 +284,11 @@ func (c *cloudCache) Get() ([]CloudTerm, bool) { copy(cp, c.terms) return cp, true } + +// Version returns the monotonic change counter. Pairs with +// recentsLRU.version() for digest cache invalidation. 
+func (c *cloudCache) Version() uint64 { + c.mu.RLock() + defer c.mu.RUnlock() + return c.version +} diff --git a/internal/wiki/digest.go b/internal/wiki/digest.go new file mode 100644 index 0000000..e61da64 --- /dev/null +++ b/internal/wiki/digest.go @@ -0,0 +1,296 @@ +package wiki + +import ( + "context" + "fmt" + "sort" + "strings" + "sync" +) + +// AreaSummary is one entry under `## Areas` in the rendered digest: +// a top-level directory, how many pages live under it, and (if the +// directory has an `index.md`) that index page's title as a one-line +// description. +type AreaSummary struct { + Path string `json:"path"` + PageCount int `json:"page_count"` + IndexTitle string `json:"index_title,omitempty"` +} + +// Digest is the structured form of the per-conversation orientation +// blob. The MCP `get_wiki_digest` tool and HTTP `/api/digest` endpoint +// return this — the markdown is what an LLM consumes; the typed fields +// let the WebUI render its own views (e.g. a word-cloud widget) without +// re-parsing the markdown. +type Digest struct { + PageCount int `json:"page_count"` + Cloud []CloudTerm `json:"cloud_terms"` + Recents []string `json:"recents"` + Areas []AreaSummary `json:"areas"` + Markdown string `json:"markdown"` +} + +// defaultMaxRenderBytes is the soft cap on the rendered markdown. +// Trim order when over: recents -> cloud -> areas (never). Matches +// the plan's ~4 KB target. Tunable via config (Step 7). +const defaultMaxRenderBytes = 4096 + +// digestCache is a single-slot cache for the rendered Digest, keyed +// by the (cloud version, recents seq) tuple at render time. The +// digest itself is a few-hundred-byte structure; what we're saving +// is the SQL roundtrip for area counts and the render loop, not the +// allocation. 
+type digestCache struct {
+	mu         sync.Mutex // guards every field below
+	cloudVer   uint64     // cloud version passed to the last set
+	recentsSeq uint64     // recents sequence passed to the last set
+	pageCount  int        // page count passed to the last set
+	cached     *Digest    // nil until set is called, and again after invalidate
+}
+
+// get looks up the single cached digest. It is a hit only when a
+// digest has been stored and all three key components — cloudVer,
+// recentsSeq and pageCount — equal the values recorded by the last
+// set. pageCount belongs to the key because a page added or removed
+// without touching the LRU (rare, but happens on reindex for
+// pure-content-change pages) still changes the header sentence
+// ("This wiki contains N pages...").
+//
+// A miss is reported as (nil, false). A hit returns the stored
+// pointer itself, not a copy.
+func (c *digestCache) get(cloudVer, recentsSeq uint64, pageCount int) (*Digest, bool) {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+
+	fresh := c.cached != nil &&
+		c.cloudVer == cloudVer &&
+		c.recentsSeq == recentsSeq &&
+		c.pageCount == pageCount
+	if !fresh {
+		return nil, false
+	}
+	return c.cached, true
+}
+
+// set stores d as the cached digest together with the key it was
+// built from, replacing whatever was cached before.
+func (c *digestCache) set(cloudVer, recentsSeq uint64, pageCount int, d *Digest) {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+
+	c.cached = d
+	c.cloudVer, c.recentsSeq, c.pageCount = cloudVer, recentsSeq, pageCount
+}
+
+// invalidate drops the cached digest so the next get misses. Used in
+// tests and on schema rebuilds (Step 4); CRUD doesn't need to call
+// this because version bumps already cover the cache invalidation
+// contract.
+func (c *digestCache) invalidate() {
+	c.mu.Lock()
+	c.cached = nil
+	c.mu.Unlock()
+}
+
+// Digest returns the current orientation digest. Cheap on cache hit;
+// on miss, builds in O(pages) for the area counts and O(K) for the
+// render. Safe for concurrent callers.
+//
+// This is the function HTTP `/api/digest` and the MCP `get_wiki_digest`
+// tool call. It is also called transitively from the existing
+// `get_wiki_context` (see Step 5) so old clients see the new data
+// shape without breakage.
+func (w *Wiki) Digest(ctx context.Context) (*Digest, error) {
+	// Refuse to do any work for a context that is already cancelled
+	// or past its deadline.
+	if err := ctx.Err(); err != nil {
+		return nil, err
+	}
+
+	// The page count is queried on every call, including cache hits,
+	// because it is one of the three components of the cache key.
+	pageCount, err := w.pageCount(ctx)
+	if err != nil {
+		return nil, fmt.Errorf("digest page count: %w", err)
+	}
+
+	// Both change counters are sampled before the cloud terms and the
+	// recents snapshot are read further down.
+	// NOTE(review): presumably deliberate — if either source changes
+	// between the two reads, the digest is stored under the older
+	// counters and the next call simply misses and rebuilds. Confirm
+	// before reordering.
+	cloudVer := w.cloud.Version()
+	recentsSeq := w.recents.version()
+
+	// On a hit the cached *Digest is handed back as-is (same pointer).
+	if d, ok := w.digest.get(cloudVer, recentsSeq, pageCount); ok {
+		return d, nil
+	}
+
+	areas, err := w.areaSummaries(ctx)
+	if err != nil {
+		return nil, fmt.Errorf("digest areas: %w", err)
+	}
+
+	cloudTerms, _ := w.cloud.Get() // ok == false → empty, render copes
+	recents := w.recents.snapshot()
+
+	d := &Digest{
+		PageCount: pageCount,
+		Cloud:     cloudTerms,
+		Recents:   recents,
+		Areas:     areas,
+	}
+	// Markdown is rendered from the typed fields just assembled, with
+	// the default soft byte cap applied.
+	d.Markdown = renderDigestMarkdown(d, defaultMaxRenderBytes)
+
+	w.digest.set(cloudVer, recentsSeq, pageCount, d)
+	return d, nil
+}
+
+// pageCount runs the same SELECT COUNT(*) the Context handler uses.
+// Lifted into a helper so Digest can share it. It returns the number
+// of rows in the `pages` table, or 0 and the query/scan error.
+func (w *Wiki) pageCount(ctx context.Context) (int, error) {
+	var n int
+	if err := w.db.QueryRowContext(ctx, "SELECT COUNT(*) FROM pages").Scan(&n); err != nil {
+		return 0, err
+	}
+	return n, nil
+}
+
+// areaSummaries returns one entry per top-level directory in the wiki,
+// with the page count and (if the directory has `<area>/index.md`)
+// the title of that index page. Sorted by descending page count, then
+// by name — same shape as the rendered markdown.
+//
+// An area with no pages under it cannot exist (the source data is the
+// `pages` table; empty dirs aren't tracked). A flat-rooted page like
+// "readme" with no slash is not an area; only paths containing `/`
+// contribute. This matches what topLevelDirs() exposes via filesystem
+// listing — the two should agree, but areaSummaries is the source of
+// truth for the digest because it's driven by indexed content, not
+// filesystem state.
+func (w *Wiki) areaSummaries(ctx context.Context) ([]AreaSummary, error) {
+	rows, err := w.db.QueryContext(ctx, "SELECT path, title FROM pages")
+	if err != nil {
+		return nil, err
+	}
+	defer rows.Close()
+
+	// acc accumulates the per-area totals while rows are streamed.
+	type acc struct {
+		count      int
+		indexTitle string
+	}
+	bucket := map[string]*acc{}
+
+	for rows.Next() {
+		var path, title string
+		if err := rows.Scan(&path, &title); err != nil {
+			// Skipping the row would silently under-count its area
+			// while the digest header still counts the page, so
+			// surface the failure instead of papering over it.
+			return nil, fmt.Errorf("scan page row: %w", err)
+		}
+		// The area is everything before the first '/'.
+		area, _, hasSlash := strings.Cut(path, "/")
+		if !hasSlash {
+			continue // flat-rooted, not an area
+		}
+		a, seen := bucket[area]
+		if !seen {
+			a = &acc{}
+			bucket[area] = a
+		}
+		a.count++
+		// The area's index page is `<area>/index`. Record its title
+		// once; if for some reason there are multiple (shouldn't be,
+		// PRIMARY KEY on path prevents it), the last one wins.
+		if path == area+"/index" {
+			a.indexTitle = title
+		}
+	}
+	// rows.Next returning false can also mean the iteration failed.
+	if err := rows.Err(); err != nil {
+		return nil, err
+	}
+
+	out := make([]AreaSummary, 0, len(bucket))
+	for name, a := range bucket {
+		out = append(out, AreaSummary{
+			Path:       name,
+			PageCount:  a.count,
+			IndexTitle: a.indexTitle,
+		})
+	}
+	// Largest areas first; ties broken by name so the output is
+	// deterministic despite random map iteration order.
+	sort.Slice(out, func(i, j int) bool {
+		if out[i].PageCount != out[j].PageCount {
+			return out[i].PageCount > out[j].PageCount
+		}
+		return out[i].Path < out[j].Path
+	})
+	return out, nil
+}
+
+// renderDigestMarkdown produces the markdown blob shown to LLMs. The
+// shape mirrors the example in the plan; ordering of sections is
+// header -> cloud line -> Areas -> Recently active -> footer.
+//
+// When the assembled body exceeds maxBytes the renderer trims:
+//  1. drop recents from the tail until under cap, then
+//  2. drop cloud terms from the tail until under cap.
+//
+// Areas are never trimmed — they are the smallest section and the
+// most structurally important: an agent that loses the area list
+// loses the map of the wiki. The footer hint is also preserved.
+//
+// If maxBytes <= 0 no trimming is applied. Useful for tests that want
+// to verify full content.
+func renderDigestMarkdown(d *Digest, maxBytes int) string { + cloud := d.Cloud + recents := d.Recents + + for { + var sb strings.Builder + writeDigestBody(&sb, d.PageCount, cloud, d.Areas, recents) + out := sb.String() + if maxBytes <= 0 || len(out) <= maxBytes { + return out + } + // Trim recents first. + if len(recents) > 0 { + recents = recents[:len(recents)-1] + continue + } + // Then trim cloud. + if len(cloud) > 0 { + cloud = cloud[:len(cloud)-1] + continue + } + // Already minimal; return what we have, even if over cap. + // Areas + header alone exceeding 4 KB would require a + // wiki with hundreds of top-level dirs — unlikely, but + // truncating areas would be a worse failure mode. + return out + } +} + +func writeDigestBody(sb *strings.Builder, pageCount int, cloud []CloudTerm, areas []AreaSummary, recents []string) { + areaCount := len(areas) + if areaCount == 1 { + fmt.Fprintf(sb, "This wiki contains %d pages across 1 area.", pageCount) + } else { + fmt.Fprintf(sb, "This wiki contains %d pages across %d areas.", pageCount, areaCount) + } + + if len(cloud) > 0 { + sb.WriteString(" About:\n") + for i, t := range cloud { + if i > 0 { + sb.WriteString(", ") + } + sb.WriteString(t.Term) + } + sb.WriteString("\n") + } else { + sb.WriteString("\n") + } + + if len(areas) > 0 { + sb.WriteString("\n## Areas\n") + for _, a := range areas { + fmt.Fprintf(sb, "- %s (%d)", a.Path, a.PageCount) + if a.IndexTitle != "" { + fmt.Fprintf(sb, " — %s/index: %q", a.Path, a.IndexTitle) + } + sb.WriteString("\n") + } + } + + if len(recents) > 0 { + sb.WriteString("\n## Recently active\n") + for _, p := range recents { + fmt.Fprintf(sb, "- %s\n", p) + } + } + + sb.WriteString("\nFull skill: SKILL.md. 
Use `get_wiki_digest` for the live version.\n") +} diff --git a/internal/wiki/digest_test.go b/internal/wiki/digest_test.go new file mode 100644 index 0000000..1f2ab37 --- /dev/null +++ b/internal/wiki/digest_test.go @@ -0,0 +1,297 @@ +package wiki + +import ( + "context" + "os" + "path/filepath" + "strings" + "testing" +) + +func TestDigest_StructuralFields(t *testing.T) { + w, _ := testWiki(t) + defer w.Close() + ctx := context.Background() + + d, err := w.Digest(ctx) + if err != nil { + t.Fatalf("Digest: %v", err) + } + + if d.PageCount == 0 { + t.Fatal("page count should be > 0") + } + if d.Markdown == "" { + t.Fatal("markdown should not be empty") + } + // testWiki creates pages under projects/ and people/ — at least + // two areas should surface. + if len(d.Areas) < 2 { + t.Fatalf("expected >= 2 areas, got %d: %v", len(d.Areas), d.Areas) + } + // Cloud is empty because the ticker hasn't run yet (cold start). + // That's the expected behavior; the digest should still render. + if d.Cloud != nil && len(d.Cloud) != 0 { + t.Fatalf("cloud should be empty on cold start, got %v", d.Cloud) + } +} + +func TestDigest_MarkdownShape(t *testing.T) { + w, _ := testWiki(t) + defer w.Close() + ctx := context.Background() + + // Seed cloud so we exercise the "About:" line too. + w.cloud.Set([]CloudTerm{ + {Term: "wiki", Count: 10}, + {Term: "mind-map", Count: 7}, + }) + // Seed recents. + w.recents.touch("projects/mind-map") + w.recents.touch("index") + // Bust the digest cache because we mutated state directly. 
+ w.digest.invalidate() + + d, err := w.Digest(ctx) + if err != nil { + t.Fatalf("Digest: %v", err) + } + + md := d.Markdown + t.Logf("rendered:\n%s", md) + + mustContain := []string{ + "This wiki contains", + "About:", + "wiki, mind-map", + "## Areas", + "## Recently active", + "- index", + "- projects/mind-map", + "Full skill: SKILL.md", + "get_wiki_digest", + } + for _, s := range mustContain { + if !strings.Contains(md, s) { + t.Errorf("markdown missing %q\n---\n%s\n---", s, md) + } + } +} + +func TestDigest_AreaCountsAndIndexTitle(t *testing.T) { + w, _ := testWiki(t) + defer w.Close() + ctx := context.Background() + + // Add an index page under "projects" with a known title. + if err := w.CreatePage(ctx, "projects/index", `--- +title: Active Projects +--- +# Active Projects +`); err != nil { + t.Fatalf("create projects/index: %v", err) + } + + d, err := w.Digest(ctx) + if err != nil { + t.Fatalf("Digest: %v", err) + } + + var found *AreaSummary + for i := range d.Areas { + if d.Areas[i].Path == "projects" { + found = &d.Areas[i] + break + } + } + if found == nil { + t.Fatalf("projects area missing: %+v", d.Areas) + } + if found.IndexTitle != "Active Projects" { + t.Errorf("expected index title 'Active Projects', got %q", found.IndexTitle) + } + if found.PageCount < 2 { + t.Errorf("projects should have >=2 pages (mind-map + index), got %d", found.PageCount) + } + + // The rendered area line should include the index title quoted. + if !strings.Contains(d.Markdown, `projects/index: "Active Projects"`) { + t.Errorf("markdown missing index title:\n%s", d.Markdown) + } +} + +func TestDigest_CacheHit(t *testing.T) { + w, _ := testWiki(t) + defer w.Close() + ctx := context.Background() + + // First call populates the cache. + first, err := w.Digest(ctx) + if err != nil { + t.Fatalf("first Digest: %v", err) + } + + // Second call with no state change returns the *same* pointer + // (the cache stores the *Digest; a hit returns it as-is). 
+ second, err := w.Digest(ctx) + if err != nil { + t.Fatalf("second Digest: %v", err) + } + if first != second { + t.Errorf("expected cache hit to return same *Digest pointer") + } +} + +func TestDigest_CacheInvalidatedByLRUChange(t *testing.T) { + w, _ := testWiki(t) + defer w.Close() + ctx := context.Background() + + first, err := w.Digest(ctx) + if err != nil { + t.Fatalf("first: %v", err) + } + + // Touching the LRU bumps recents seq → cache miss next read. + w.recents.touch("index") + + second, err := w.Digest(ctx) + if err != nil { + t.Fatalf("second: %v", err) + } + if first == second { + t.Errorf("expected fresh *Digest after recents change") + } + if !strings.Contains(second.Markdown, "- index") { + t.Errorf("new recents not reflected in markdown:\n%s", second.Markdown) + } +} + +func TestDigest_CacheInvalidatedByCloudChange(t *testing.T) { + w, _ := testWiki(t) + defer w.Close() + ctx := context.Background() + + first, err := w.Digest(ctx) + if err != nil { + t.Fatalf("first: %v", err) + } + + w.cloud.Set([]CloudTerm{{Term: "wiki", Count: 1}}) + + second, err := w.Digest(ctx) + if err != nil { + t.Fatalf("second: %v", err) + } + if first == second { + t.Errorf("expected fresh *Digest after cloud set") + } + if !strings.Contains(second.Markdown, "About:") { + t.Errorf("cloud not reflected in markdown:\n%s", second.Markdown) + } +} + +func TestRenderDigest_TrimToMaxBytes(t *testing.T) { + // Build a digest that's deliberately over-cap. 
+ cloud := make([]CloudTerm, 50) + for i := range cloud { + cloud[i] = CloudTerm{Term: strings.Repeat("x", 20), Count: 1} + } + recents := make([]string, 50) + for i := range recents { + recents[i] = strings.Repeat("path", 20) + } + d := &Digest{ + PageCount: 100, + Cloud: cloud, + Recents: recents, + Areas: []AreaSummary{{Path: "a", PageCount: 5}}, + } + + const cap = 512 + md := renderDigestMarkdown(d, cap) + if len(md) > cap { + // The trimmer is best-effort: if the unavoidable parts + // (areas + header + footer) already exceed cap we accept + // being over. But in this test those are tiny, so we + // should be under. + t.Errorf("rendered len=%d > cap=%d", len(md), cap) + } + // Areas + header + footer must still be intact. + mustContain := []string{"## Areas", "- a (5)", "Full skill"} + for _, s := range mustContain { + if !strings.Contains(md, s) { + t.Errorf("trim dropped required section %q:\n%s", s, md) + } + } +} + +func TestRenderDigest_NoCloudNoRecents(t *testing.T) { + d := &Digest{ + PageCount: 3, + Areas: []AreaSummary{ + {Path: "notes", PageCount: 3}, + }, + } + md := renderDigestMarkdown(d, 0) + if strings.Contains(md, "About:") { + t.Errorf("empty cloud should not produce About: line:\n%s", md) + } + if strings.Contains(md, "## Recently active") { + t.Errorf("empty recents should not produce section:\n%s", md) + } + if !strings.Contains(md, "## Areas") || !strings.Contains(md, "- notes (3)") { + t.Errorf("areas missing:\n%s", md) + } +} + +func TestAreaSummaries_FlatRootedPagesIgnored(t *testing.T) { + w, _ := testWiki(t) + defer w.Close() + ctx := context.Background() + + // `index` is flat-rooted; should not produce an "index" area. 
+ areas, err := w.areaSummaries(ctx) + if err != nil { + t.Fatalf("areaSummaries: %v", err) + } + for _, a := range areas { + if a.Path == "index" { + t.Fatalf("flat-rooted page leaked into areas: %+v", areas) + } + } +} + +func TestReindex_RemovesFromLRU(t *testing.T) { + w, dir := testWiki(t) + defer w.Close() + ctx := context.Background() + + // Touch and verify presence. + if _, err := w.GetPage(ctx, "index"); err != nil { + t.Fatalf("GetPage: %v", err) + } + found := false + for _, p := range w.recents.snapshot() { + if p == "index" { + found = true + } + } + if !found { + t.Fatalf("index should be in LRU after GetPage") + } + + // Raw-filesystem delete + reindex (simulating sync removing a file). + if err := os.Remove(filepath.Join(dir, "index.md")); err != nil { + t.Fatalf("remove file: %v", err) + } + if _, err := w.Reindex(ctx); err != nil { + t.Fatalf("reindex: %v", err) + } + + for _, p := range w.recents.snapshot() { + if p == "index" { + t.Fatalf("reindex should have purged stale LRU entry: %v", w.recents.snapshot()) + } + } +} diff --git a/internal/wiki/index.go b/internal/wiki/index.go index e847f92..0379dc6 100644 --- a/internal/wiki/index.go +++ b/internal/wiki/index.go @@ -159,6 +159,13 @@ func (w *Wiki) Reindex(ctx context.Context) (ReindexStats, error) { slog.Warn("reindex remove error", slog.String("page", pagePath), slog.Any("error", err)) continue } + // Keep the recents LRU consistent with `pages`: a page + // that vanishes via raw-filesystem delete + reindex + // (common after `git pull` in sync) must drop from the + // LRU here, since DeletePage() was never called. Without + // this hook the digest's "recently active" can point at + // a 404. 
+ w.recents.remove(pagePath) removed++ } } diff --git a/internal/wiki/recents.go b/internal/wiki/recents.go index 2e70226..0707e26 100644 --- a/internal/wiki/recents.go +++ b/internal/wiki/recents.go @@ -29,6 +29,12 @@ type recentsLRU struct { // dirty is true when the in-memory state has diverged from the last // persisted snapshot. The persistence ticker reads + clears it. dirty bool + // seq is a monotonic counter bumped on every state-changing + // operation (touch / remove / rename / load). The digest cache + // uses it as a cheap "did anything change?" signal so it can + // invalidate rendered output without re-comparing snapshots. + // Wraps at uint64 max — irrelevant in practice. + seq uint64 } // newRecentsLRU constructs an empty LRU with the given capacity. @@ -60,6 +66,7 @@ func (r *recentsLRU) touch(path string) { if elem, ok := r.idx[path]; ok { r.ll.MoveToFront(elem) r.dirty = true + r.seq++ return } elem := r.ll.PushFront(path) @@ -72,6 +79,7 @@ func (r *recentsLRU) touch(path string) { } } r.dirty = true + r.seq++ } // remove drops a path from the ring. Called when a page is deleted; @@ -91,6 +99,7 @@ func (r *recentsLRU) remove(path string) { r.ll.Remove(elem) delete(r.idx, path) r.dirty = true + r.seq++ } // rename relabels an entry in place, preserving its position in the @@ -138,6 +147,7 @@ func (r *recentsLRU) rename(from, to string) { } } r.dirty = true + r.seq++ } // snapshot returns the tracked paths, most recent first. The returned @@ -177,6 +187,18 @@ func (r *recentsLRU) load(paths []string) { } } r.dirty = false + r.seq++ +} + +// version returns the monotonic change counter. The digest cache uses +// this as an invalidation signal: cache the rendered output keyed by +// (cloudVersion, recentsVersion), and rebuild when either advances. +// +// Cheap (one lock + load) so callers can invoke it on every read. 
+func (r *recentsLRU) version() uint64 { + r.mu.Lock() + defer r.mu.Unlock() + return r.seq } // takeDirty returns whether the ring has unsaved changes and clears diff --git a/internal/wiki/wiki.go b/internal/wiki/wiki.go index 967e8f4..0806fec 100644 --- a/internal/wiki/wiki.go +++ b/internal/wiki/wiki.go @@ -59,6 +59,12 @@ type Wiki struct { // recents.go for the rationale (intent vs. disk mtime). Persistence // to SQLite is layered on in state.go; here it just lives in memory. recents *recentsLRU + // cloud holds the most recent word/phrase cloud rebuild. Populated + // by the 5-minute ticker (Step 6); cold start renders without it. + cloud *cloudCache + // digest caches the rendered markdown blob, invalidated by cloud + // version + recents seq changes. See digest.go. + digest *digestCache } // Open opens (or creates) a wiki rooted at the given directory. @@ -91,6 +97,8 @@ func Open(root string) (*Wiki, error) { // for a config-driven value (digest.recents_size); the default // keeps existing callers unaffected. recents: newRecentsLRU(20), + cloud: &cloudCache{}, + digest: &digestCache{}, } if err := w.initSchema(); err != nil { db.Close() From 5712fe11ffdd5479596a972ccd656fbad99e9c07 Mon Sep 17 00:00:00 2001 From: aniongithub Date: Sat, 23 May 2026 22:55:58 -0700 Subject: [PATCH 4/8] feat(wiki): persist recents and cloud across restarts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A wiki_state table (key/value/updated) stores the LRU snapshot and the word/phrase cloud in JSON so a freshly-restarted server has a useful digest immediately, not after the first 5-minute ticker fires. The rendered digest markdown is NOT persisted — it's sub-ms to re-assemble from cloud + LRU, and the in-memory digestCache already handles 'don't re-format on every hit'. Adding a third write path buys nothing measurable. Load happens at the tail of Open(), after Reindex. 
Persisted recents are filtered against the current `pages` table so paths that vanished while the server was off (deleted on disk, or sync-pulled away) don't reappear in the LRU as 404 candidates. The cloud loads as-is — global frequency counts remain a reasonable approximation across small content changes, and the next rebuild ticker (Step 6) will refresh it within minutes. Save points: - persistRecents() — called by Close() for a clean shutdown flush and (in Step 6) on a 30s dirty-gated ticker. - persistCloud() — called by Step 6's 5m rebuild ticker. No-ops when the cloud has never been populated so we don't clobber a previously-good copy with an empty placeholder. Failure modes are deliberately lenient: a corrupt JSON row, a missing table, or an unreachable column logs at WARN and falls back to fresh-wiki state rather than panicking. The digest is an orientation signal, not a correctness boundary; losing it shouldn't take down the server. Also: made Close() idempotent via sync.Once. testWiki's t.Cleanup plus explicit defer Close in state tests would otherwise run the persistRecents flush against an already-closed DB. Step 4 of the digest plan (mind-map/plans/digest). --- internal/wiki/state.go | 198 ++++++++++++++++++++++++++++++++++++ internal/wiki/state_test.go | 171 +++++++++++++++++++++++++++++++ internal/wiki/wiki.go | 37 ++++++- 3 files changed, 403 insertions(+), 3 deletions(-) create mode 100644 internal/wiki/state.go create mode 100644 internal/wiki/state_test.go diff --git a/internal/wiki/state.go b/internal/wiki/state.go new file mode 100644 index 0000000..a621415 --- /dev/null +++ b/internal/wiki/state.go @@ -0,0 +1,198 @@ +package wiki + +import ( + "context" + "database/sql" + "encoding/json" + "errors" + "fmt" + "log/slog" + "time" +) + +// wiki_state schema: a small key/value table for cross-restart persistence +// of derived structures (recents LRU, word/phrase cloud). 
Distinct from +// the `pages` index, which is rebuildable from disk — wiki_state holds +// signals that *can't* be recovered from the markdown files alone: +// +// - "recent_lru" — the active-use ring (intent, not mtime). Lost on +// restart without persistence; that's exactly the case the digest +// plan is designed to avoid. +// - "cloud" — the word/phrase cloud is rebuildable but expensive +// (one full table scan + tokenization). Persisting it means a +// freshly-restarted server has a digest immediately, not after +// the first ticker tick (up to 5 minutes later). +// +// We intentionally do NOT persist the rendered digest markdown: it's +// sub-ms to re-assemble from cloud + LRU, and the in-memory +// digestCache already covers "don't re-format on every hit". + +const ( + stateKeyRecentLRU = "recent_lru" + stateKeyCloud = "cloud" +) + +// initStateSchema creates the wiki_state table. Called from initSchema. +// Idempotent. +func (w *Wiki) initStateSchema() error { + _, err := w.db.Exec(` + CREATE TABLE IF NOT EXISTS wiki_state ( + key TEXT PRIMARY KEY, + value TEXT NOT NULL, + updated TEXT NOT NULL + );`) + return err +} + +// recentsState is the on-disk shape of the persisted LRU. Stored as a +// JSON document under wiki_state["recent_lru"].value. Items are listed +// most-recent-first, matching recentsLRU.snapshot(). +type recentsState struct { + Items []string `json:"items"` +} + +// cloudState is the on-disk shape of the persisted cloud. +type cloudState struct { + Terms []CloudTerm `json:"terms"` +} + +// loadState pulls the persisted LRU + cloud out of wiki_state into +// memory. Called once at the end of Open(), after Reindex. Failures +// are logged but non-fatal — a missing or corrupt row just means the +// process starts with an empty signal, which is the same state a +// brand-new wiki ships with. 
+func (w *Wiki) loadState(ctx context.Context) {
+	// Recents: decode the persisted ring, then keep only paths that
+	// are still indexed.
+	if raw, found := w.readStateKey(ctx, stateKeyRecentLRU); found {
+		var persisted recentsState
+		if err := json.Unmarshal([]byte(raw), &persisted); err == nil {
+			// Paths that vanished while the server was off
+			// (deleted, renamed via raw filesystem, or sync-pulled
+			// away) must not come back as 404 candidates. Reindex
+			// has already run by this point, so `pages` is the
+			// authoritative set to filter against.
+			kept := w.filterAgainstIndex(ctx, persisted.Items)
+			w.recents.load(kept)
+			slog.Info("recents loaded from wiki_state",
+				slog.Int("persisted", len(persisted.Items)),
+				slog.Int("kept", len(kept)),
+			)
+		} else {
+			slog.Warn("wiki_state recent_lru parse failed", slog.Any("error", err))
+		}
+	}
+
+	// Cloud: decode and install unfiltered.
+	if raw, found := w.readStateKey(ctx, stateKeyCloud); found {
+		var persisted cloudState
+		if err := json.Unmarshal([]byte(raw), &persisted); err == nil {
+			// The cloud is global frequency counts, not per-page
+			// references — even if some pages have vanished, the
+			// previous distribution is still a reasonable
+			// approximation until the next rebuild ticker fires
+			// (default: within 5 minutes of startup).
+			w.cloud.Set(persisted.Terms)
+			slog.Info("cloud loaded from wiki_state", slog.Int("terms", len(persisted.Terms)))
+		} else {
+			slog.Warn("wiki_state cloud parse failed", slog.Any("error", err))
+		}
+	}
+}
+
+// filterAgainstIndex returns only those paths that currently exist in
+// the `pages` table, preserving input order. Used on Open() to drop
+// stale persisted recents whose underlying pages vanished while the
+// server was off.
+//
+// One query: SELECT path FROM pages, followed by an in-memory map
+// probe. We do it this way rather than with a SQL IN-clause because
+// (a) the input slice is small (~20 entries by default) and (b)
+// building a variable-length IN-clause with placeholders for SQLite
+// is awkward.
+//
+// Any failure while reading the index — the query itself, a row scan,
+// or the iteration — fails open: the input is returned unchanged
+// rather than filtered against a partial set.
+func (w *Wiki) filterAgainstIndex(ctx context.Context, paths []string) []string {
+	if len(paths) == 0 {
+		return nil
+	}
+	rows, err := w.db.QueryContext(ctx, "SELECT path FROM pages")
+	if err != nil {
+		slog.Warn("filterAgainstIndex query failed", slog.Any("error", err))
+		return paths // fail open: keep all, let the next CRUD reconcile
+	}
+	defer rows.Close()
+
+	present := make(map[string]struct{})
+	for rows.Next() {
+		var p string
+		if err := rows.Scan(&p); err != nil {
+			// A row we could not read might be one of the persisted
+			// recents; dropping it would discard a valid entry.
+			slog.Warn("filterAgainstIndex scan failed", slog.Any("error", err))
+			return paths
+		}
+		present[p] = struct{}{}
+	}
+	// rows.Next returning false can mean the iteration aborted, in
+	// which case `present` is incomplete and must not be trusted.
+	if err := rows.Err(); err != nil {
+		slog.Warn("filterAgainstIndex iteration failed", slog.Any("error", err))
+		return paths
+	}
+
+	out := make([]string, 0, len(paths))
+	for _, p := range paths {
+		if _, ok := present[p]; ok {
+			out = append(out, p)
+		}
+	}
+	return out
+}
+
+// readStateKey returns the value for a wiki_state key, or "", false if
+// not present or the read failed. Read errors other than "no row" are
+// logged so a real DB problem doesn't silently degrade the digest.
+func (w *Wiki) readStateKey(ctx context.Context, key string) (string, bool) {
+	var value string
+	err := w.db.QueryRowContext(ctx, "SELECT value FROM wiki_state WHERE key = ?", key).Scan(&value)
+	if err == nil {
+		return value, true
+	}
+	// sql.ErrNoRows is the common case (first run on a wiki) — silent.
+	if errors.Is(err, sql.ErrNoRows) {
+		return "", false
+	}
+	slog.Warn("wiki_state read failed", slog.String("key", key), slog.Any("error", err))
+	return "", false
+}
+
+// writeStateKey upserts a wiki_state row. The (key, value, updated)
+// triple is atomic via INSERT OR REPLACE — readers either see the old
+// or the new value, never a torn write.
+func (w *Wiki) writeStateKey(ctx context.Context, key, value string) error {
+	now := time.Now().UTC().Format(time.RFC3339Nano)
+	_, err := w.db.ExecContext(ctx,
+		"INSERT OR REPLACE INTO wiki_state (key, value, updated) VALUES (?, ?, ?)",
+		key, value, now,
+	)
+	return err
+}
+
+// persistRecents writes the current LRU snapshot to wiki_state. Called
+// by the 30s persistence ticker (Step 6) and from Close() for a clean
+// shutdown.
Safe to call concurrently with reads — the LRU snapshot is +// taken under its own lock and the SQLite write is atomic. +// +// If the LRU's dirty flag is unset, this is still safe to call (we'll +// rewrite the same bytes); callers wanting to skip a redundant write +// should gate on takeDirty() before calling. +func (w *Wiki) persistRecents(ctx context.Context) error { + state := recentsState{Items: w.recents.snapshot()} + data, err := json.Marshal(state) + if err != nil { + return fmt.Errorf("marshal recents: %w", err) + } + return w.writeStateKey(ctx, stateKeyRecentLRU, string(data)) +} + +// persistCloud writes the current cloud cache to wiki_state. Called +// after a successful rebuild (Step 6). No-ops if the cloud has never +// been populated — there's nothing meaningful to write yet, and we +// don't want to clobber a previously-good persisted copy with an +// empty placeholder. +func (w *Wiki) persistCloud(ctx context.Context) error { + terms, ok := w.cloud.Get() + if !ok { + return nil + } + state := cloudState{Terms: terms} + data, err := json.Marshal(state) + if err != nil { + return fmt.Errorf("marshal cloud: %w", err) + } + return w.writeStateKey(ctx, stateKeyCloud, string(data)) +} diff --git a/internal/wiki/state_test.go b/internal/wiki/state_test.go new file mode 100644 index 0000000..59e809c --- /dev/null +++ b/internal/wiki/state_test.go @@ -0,0 +1,171 @@ +package wiki + +import ( + "context" + "reflect" + "testing" +) + +func TestState_PersistAndLoadRecents(t *testing.T) { + w, dir := testWiki(t) + ctx := context.Background() + + // Touch a few pages, then close the wiki — Close() flushes the LRU. 
+ if _, err := w.GetPage(ctx, "projects/mind-map"); err != nil { + t.Fatalf("GetPage: %v", err) + } + if _, err := w.GetPage(ctx, "people/alice"); err != nil { + t.Fatalf("GetPage: %v", err) + } + beforeClose := w.recents.snapshot() + if err := w.Close(); err != nil { + t.Fatalf("Close: %v", err) + } + + // Reopen the same wiki directory; the LRU should rehydrate. + w2, err := Open(dir) + if err != nil { + t.Fatalf("reopen: %v", err) + } + defer w2.Close() + + got := w2.recents.snapshot() + if !reflect.DeepEqual(got, beforeClose) { + t.Fatalf("LRU not restored:\n before: %v\n after: %v", beforeClose, got) + } +} + +func TestState_PersistAndLoadCloud(t *testing.T) { + w, dir := testWiki(t) + ctx := context.Background() + + // Seed and persist the cloud directly (the ticker isn't running + // in tests; Step 6 owns that wiring). + terms := []CloudTerm{ + {Term: "wiki", Count: 5}, + {Term: "mind-map", Count: 3}, + } + w.cloud.Set(terms) + if err := w.persistCloud(ctx); err != nil { + t.Fatalf("persistCloud: %v", err) + } + if err := w.Close(); err != nil { + t.Fatalf("Close: %v", err) + } + + w2, err := Open(dir) + if err != nil { + t.Fatalf("reopen: %v", err) + } + defer w2.Close() + + loaded, ok := w2.cloud.Get() + if !ok { + t.Fatalf("cloud not restored (ok=false)") + } + if !reflect.DeepEqual(loaded, terms) { + t.Fatalf("cloud roundtrip mismatch:\n before: %v\n after: %v", terms, loaded) + } +} + +func TestState_LoadFiltersStalePaths(t *testing.T) { + w, dir := testWiki(t) + ctx := context.Background() + + // Touch a real page and a fake one. We can't get a fake into the + // LRU via Wiki methods (they validate), so use the LRU directly. 
+ if _, err := w.GetPage(ctx, "index"); err != nil { + t.Fatalf("GetPage: %v", err) + } + w.recents.touch("ghost/page/that/does/not/exist") + + if err := w.persistRecents(ctx); err != nil { + t.Fatalf("persistRecents: %v", err) + } + if err := w.Close(); err != nil { + t.Fatalf("Close: %v", err) + } + + // Reopen; the ghost path should be dropped on load because it + // isn't in `pages`. + w2, err := Open(dir) + if err != nil { + t.Fatalf("reopen: %v", err) + } + defer w2.Close() + + for _, p := range w2.recents.snapshot() { + if p == "ghost/page/that/does/not/exist" { + t.Fatalf("stale path leaked through filter: %v", w2.recents.snapshot()) + } + } + // The real one survives. + found := false + for _, p := range w2.recents.snapshot() { + if p == "index" { + found = true + } + } + if !found { + t.Fatalf("real path dropped by filter: %v", w2.recents.snapshot()) + } +} + +func TestState_EmptyWikiNoErrors(t *testing.T) { + // A fresh wiki has no wiki_state rows. Open() must not error, + // and the LRU / cloud must be empty. + dir := t.TempDir() + w, err := Open(dir) + if err != nil { + t.Fatalf("Open empty wiki: %v", err) + } + defer w.Close() + + if w.recents.len() != 0 { + t.Fatalf("expected empty LRU on fresh wiki, got %v", w.recents.snapshot()) + } + if _, ok := w.cloud.Get(); ok { + t.Fatalf("expected unpopulated cloud on fresh wiki") + } +} + +func TestState_CorruptRecentsRowFallsBack(t *testing.T) { + w, dir := testWiki(t) + ctx := context.Background() + + // Inject a malformed JSON row directly. + if err := w.writeStateKey(ctx, stateKeyRecentLRU, "{not valid json"); err != nil { + t.Fatalf("writeStateKey: %v", err) + } + if err := w.Close(); err != nil { + t.Fatalf("Close: %v", err) + } + + // Reopen must not error; LRU should be empty (load failed silently). + // Close flushes the (just-emptied) LRU, so the corrupt row gets + // overwritten by a valid one on shutdown — that's also fine. 
+ w2, err := Open(dir) + if err != nil { + t.Fatalf("reopen with corrupt row: %v", err) + } + defer w2.Close() + + if w2.recents.len() != 0 { + t.Fatalf("expected empty LRU after corrupt row; got %v", w2.recents.snapshot()) + } +} + +func TestState_PersistCloudNoOpWhenUnset(t *testing.T) { + w, _ := testWiki(t) + defer w.Close() + ctx := context.Background() + + // cloud has never been Set on this wiki; persisting must not + // write a placeholder (would clobber a previously-good copy). + if err := w.persistCloud(ctx); err != nil { + t.Fatalf("persistCloud unset: %v", err) + } + if _, ok := w.readStateKey(ctx, stateKeyCloud); ok { + t.Fatalf("expected no wiki_state[cloud] row when cloud is unset") + } +} diff --git a/internal/wiki/wiki.go b/internal/wiki/wiki.go index 0806fec..98f35de 100644 --- a/internal/wiki/wiki.go +++ b/internal/wiki/wiki.go @@ -13,6 +13,7 @@ import ( "log/slog" "os" "path/filepath" + "sync" "time" _ "modernc.org/sqlite" // pure-Go SQLite driver (no CGO required) @@ -65,6 +66,12 @@ type Wiki struct { // digest caches the rendered markdown blob, invalidated by cloud // version + recents seq changes. See digest.go. digest *digestCache + // closeOnce guards Close() against double-invocation: testWiki and + // other callers commonly stack defer Close on top of t.Cleanup. + // Without it, the second Close() would run persistRecents against + // an already-closed DB; closeErr replays the first Close's result. + closeOnce sync.Once + closeErr error } // Open opens (or creates) a wiki rooted at the given directory. @@ -117,15 +124,35 @@ func Open(root string) (*Wiki, error) { return nil, fmt.Errorf("initial index: %w", err) } + // Load persisted derived state (recents LRU, word cloud) after + // reindex so any stale entries pointing at pages that vanished + // while the server was off get filtered against the fresh index. + // Failures are logged but non-fatal — a corrupt state row just + // degrades to "fresh-wiki" behavior, not a crash.
+ w.loadState(context.Background()) + slog.Info("wiki opened", slog.String("root", absRoot)) return w, nil } // Close releases page locks held by this session and closes the database. +// Idempotent — safe to call multiple times (e.g. when a test stacks +// defer Close on top of testWiki's t.Cleanup). func (w *Wiki) Close() error { - slog.Info("wiki closing", slog.String("root", w.root)) - w.db.Exec("DELETE FROM page_locks WHERE holder = ?", w.sessionID) - return w.db.Close() + w.closeOnce.Do(func() { + slog.Info("wiki closing", slog.String("root", w.root)) + // Flush the LRU one last time so a clean shutdown doesn't + // lose the last ~30 seconds of touches between ticker fires. + // Errors are logged, not propagated — we'd rather close + // cleanly with a slightly stale snapshot than leak the DB + // handle. + if err := w.persistRecents(context.Background()); err != nil { + slog.Warn("recents flush on close failed", slog.Any("error", err)) + } + w.db.Exec("DELETE FROM page_locks WHERE holder = ?", w.sessionID) + w.closeErr = w.db.Close() + }) + return w.closeErr } // Root returns the wiki's root directory. @@ -185,6 +212,10 @@ func (w *Wiki) initSchema() error { return err } + if err := w.initStateSchema(); err != nil { + return fmt.Errorf("wiki_state schema: %w", err) + } + // Clean up stale locks (older than 5 minutes) from crashed processes _, err := w.db.Exec("DELETE FROM page_locks WHERE acquired < ?", time.Now().Add(-5*time.Minute).UTC().Format(time.RFC3339)) From 8ac8ad05a62a82e828f7112f378f1b3a8dfd98fe Mon Sep 17 00:00:00 2001 From: aniongithub Date: Sat, 23 May 2026 23:12:55 -0700 Subject: [PATCH 5/8] feat: expose the digest via MCP and HTTP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three surfaces, one signal: - MCP get_wiki_digest — new tool. Returns the structured Digest (page count, cloud terms, recents LRU, per-area summaries, rendered markdown). 
Tool description nudges agents to call it at the start of every conversation. - MCP get_wiki_context — the legacy {page_count, recent_pages, top_level_dirs} shape is preserved verbatim so existing clients (opencode, Claude Code in the wild, per plan open question #4) keep working. New fields (cloud_terms, recents, areas, markdown) are layered on the same response — old clients ignore them; new clients get the orientation upgrade without a tool-name change. - HTTP GET /api/digest — returns the full Digest as JSON. Intended for the WebUI (so it can render its own word-cloud or recents widgets off the structured fields rather than parsing the markdown) and for non-MCP scripts/tests. Implementation: WikiContext gets new optional fields (omitempty so the JSON shape is additive). Wiki.Context() delegates to Digest() to populate them; a digest failure logs at WARN but doesn't fail the Context call — the legacy fields are still valuable on their own, and the digest is an enhancement, not a contract. Step 5 of the digest plan (mind-map/plans/digest). 
--- internal/httpapi/server.go | 22 +++++++++++++++ internal/httpapi/server_test.go | 47 +++++++++++++++++++++++++++++++++ internal/mcp/server.go | 25 +++++++++++++++++- internal/mcp/server_test.go | 35 ++++++++++++++++++++++++ internal/wiki/pages.go | 26 +++++++++++++++--- internal/wiki/wiki.go | 31 +++++++++++++++++++++- 6 files changed, 181 insertions(+), 5 deletions(-) diff --git a/internal/httpapi/server.go b/internal/httpapi/server.go index cf96774..8b4874a 100644 --- a/internal/httpapi/server.go +++ b/internal/httpapi/server.go @@ -161,6 +161,7 @@ func (s *Server) shutdown() { func (s *Server) register(mux *http.ServeMux) { mux.HandleFunc("GET /api/version", s.getVersion) mux.HandleFunc("GET /api/context", s.getContext) + mux.HandleFunc("GET /api/digest", s.getDigest) mux.HandleFunc("GET /api/pages", s.listPages) mux.HandleFunc("GET /api/pages/{path...}", s.getPage) mux.HandleFunc("POST /api/pages", s.createPage) @@ -306,6 +307,27 @@ func (s *Server) getContext(rw http.ResponseWriter, r *http.Request) { writeJSON(rw, wctx) } +// getDigest handles GET /api/digest. Returns the full Digest struct +// (page count, cloud terms, recents LRU, per-area summaries, rendered +// markdown). Intended for two callers: +// +// - Agents / MCP clients that prefer the HTTP path over the MCP +// tool (e.g. tests, scripts, or alternate clients). +// - The WebUI, which can render its own widgets (e.g. a word-cloud +// visualization) off the structured fields rather than parsing +// the markdown. +// +// Cheap on cache hit, sub-millisecond on miss. Safe to call frequently +// (e.g. WebUI polling); the in-memory digestCache absorbs the load. 
+func (s *Server) getDigest(rw http.ResponseWriter, r *http.Request) { + d, err := s.deps.Wiki.Digest(r.Context()) + if err != nil { + http.Error(rw, err.Error(), http.StatusInternalServerError) + return + } + writeJSON(rw, d) +} + func (s *Server) listPages(rw http.ResponseWriter, r *http.Request) { prefix := r.URL.Query().Get("prefix") pages, err := s.deps.Wiki.ListPages(r.Context(), prefix) diff --git a/internal/httpapi/server_test.go b/internal/httpapi/server_test.go index bbee42e..c08f747 100644 --- a/internal/httpapi/server_test.go +++ b/internal/httpapi/server_test.go @@ -358,3 +358,50 @@ func TestReindexDetectsDirectFilesystemChanges(t *testing.T) { t.Errorf("page still not indexed after reindex (got %d body=%s)", rec.Code, rec.Body.String()) } } + +func TestGetDigest(t *testing.T) { + h := newTestServer(t) + + // Seed a page so the digest has something to summarize. + rec := doJSON(t, h, "POST", "/api/pages", map[string]string{ + "path": "topics/sqlite", + "content": "# SQLite\n\nSQLite is a fast embedded database.\n", + }) + if rec.Code != 201 { + t.Fatalf("seed: %d %s", rec.Code, rec.Body.String()) + } + + rec = doJSON(t, h, "GET", "/api/digest", nil) + if rec.Code != 200 { + t.Fatalf("digest: %d %s", rec.Code, rec.Body.String()) + } + + var d wiki.Digest + if err := json.Unmarshal(rec.Body.Bytes(), &d); err != nil { + t.Fatalf("unmarshal: %v\n%s", err, rec.Body.String()) + } + + if d.PageCount < 1 { + t.Errorf("page count = %d, want >= 1", d.PageCount) + } + if d.Markdown == "" { + t.Errorf("markdown empty") + } + if !strings.Contains(d.Markdown, "This wiki contains") { + t.Errorf("markdown missing header sentence:\n%s", d.Markdown) + } + if len(d.Areas) == 0 { + t.Errorf("expected at least one area, got none") + } + // Recently active should include the page we just created + // (CreatePage touches the LRU). 
+ found := false + for _, p := range d.Recents { + if p == "topics/sqlite" { + found = true + } + } + if !found { + t.Errorf("recents missing topics/sqlite: %v", d.Recents) + } +} diff --git a/internal/mcp/server.go b/internal/mcp/server.go index a58839c..86d622f 100644 --- a/internal/mcp/server.go +++ b/internal/mcp/server.go @@ -61,9 +61,14 @@ func (s *Server) registerTools() { mcp.AddTool(s.server, &mcp.Tool{ Name: "get_wiki_context", - Description: "Get wiki orientation: page count, top-level directories, and 20 most recently modified pages.", + Description: "Get wiki orientation: page count, top-level directories, and 20 most recently modified pages. Also returns the digest (cloud_terms, recents LRU, per-area counts, rendered markdown) for new clients — older clients can ignore the extra fields.", }, s.getWikiContext) + mcp.AddTool(s.server, &mcp.Tool{ + Name: "get_wiki_digest", + Description: "Get a compact, always-current per-conversation orientation of this wiki. Returns: a rendered markdown blob (suitable to paste into context), a word/phrase cloud across all page bodies (what this wiki is about), an LRU of pages the user or agent has actively touched (intent, not file-mtime), and per-area page counts. Call this at the start of every new conversation. 
Cheaper and more deterministic than searching blindly; complements search_pages once you know what to look for.", + }, s.getWikiDigest) + mcp.AddTool(s.server, &mcp.Tool{ Name: "get_page", Description: "Read a wiki page with parsed frontmatter, body, outgoing links, and backlinks.", @@ -175,6 +180,24 @@ func (s *Server) getWikiContext(ctx context.Context, _ *mcp.CallToolRequest, _ a return textResult(wctx) } +func (s *Server) getWikiDigest(ctx context.Context, _ *mcp.CallToolRequest, _ any) (*mcp.CallToolResult, any, error) { + start := time.Now() + d, err := s.wiki.Digest(ctx) + if err != nil { + slog.Error("tool.get_wiki_digest failed", slog.Any("error", err)) + return nil, nil, err + } + slog.Info("tool.get_wiki_digest", + slog.Int("page_count", d.PageCount), + slog.Int("cloud_terms", len(d.Cloud)), + slog.Int("recents", len(d.Recents)), + slog.Int("areas", len(d.Areas)), + slog.Int("bytes", len(d.Markdown)), + slog.Duration("elapsed", time.Since(start)), + ) + return textResult(d) +} + func (s *Server) getPage(ctx context.Context, _ *mcp.CallToolRequest, input pagePathInput) (*mcp.CallToolResult, any, error) { start := time.Now() page, err := s.wiki.GetPage(ctx, input.Path) diff --git a/internal/mcp/server_test.go b/internal/mcp/server_test.go index dd30bc8..8bd38b0 100644 --- a/internal/mcp/server_test.go +++ b/internal/mcp/server_test.go @@ -176,6 +176,7 @@ func TestListTools(t *testing.T) { expected := map[string]bool{ "search_pages": false, "get_wiki_context": false, + "get_wiki_digest": false, "get_page": false, "create_page": false, "update_page": false, @@ -207,6 +208,40 @@ func TestGetWikiContext(t *testing.T) { if ctx.PageCount != 4 { t.Errorf("PageCount = %d, want 4", ctx.PageCount) } + // New digest fields should be populated on the same response so + // existing get_wiki_context callers get the orientation upgrade + // for free (plan open question #4 — keep old shape, add fields). 
+ if ctx.Markdown == "" { + t.Errorf("expected digest markdown to be populated on get_wiki_context") + } + if len(ctx.Areas) == 0 { + t.Errorf("expected areas to be populated on get_wiki_context") + } +} + +func TestGetWikiDigest(t *testing.T) { + session := setupTestServer(t) + text := callTool(t, session, "get_wiki_digest", nil) + + var d wiki.Digest + if err := json.Unmarshal([]byte(text), &d); err != nil { + t.Fatalf("unmarshal: %v\n%s", err, text) + } + if d.PageCount != 4 { + t.Errorf("PageCount = %d, want 4", d.PageCount) + } + if d.Markdown == "" { + t.Errorf("Markdown empty") + } + if !strings.Contains(d.Markdown, "This wiki contains") { + t.Errorf("markdown missing header sentence:\n%s", d.Markdown) + } + if !strings.Contains(d.Markdown, "## Areas") { + t.Errorf("markdown missing Areas section:\n%s", d.Markdown) + } + if len(d.Areas) == 0 { + t.Errorf("expected at least one area in structured output") + } } func TestGetPage(t *testing.T) { diff --git a/internal/wiki/pages.go b/internal/wiki/pages.go index 9e4de30..d030cbd 100644 --- a/internal/wiki/pages.go +++ b/internal/wiki/pages.go @@ -423,7 +423,13 @@ func (w *Wiki) AllLinks(ctx context.Context) ([]Link, error) { return links, nil } -// Context returns a WikiContext overview. +// Context returns a WikiContext overview. The legacy fields +// (PageCount, RecentPages, TopLevelDirs) come from disk — recent_pages +// is mtime-sorted, top_level_dirs is read from the filesystem — and +// preserve the shape clients in the wild already depend on. The new +// fields (Cloud, Recents, Areas, Markdown) come from the digest so +// existing get_wiki_context callers get the orientation upgrade +// without switching tool names. 
func (w *Wiki) Context(ctx context.Context) (*WikiContext, error) { if err := ctx.Err(); err != nil { return nil, err @@ -460,11 +466,25 @@ func (w *Wiki) Context(ctx context.Context) (*WikiContext, error) { // Top-level dirs dirs := w.topLevelDirs() - return &WikiContext{ + wctx := &WikiContext{ PageCount: count, RecentPages: recent, TopLevelDirs: dirs, - }, nil + } + + // Layer the digest's signals on top. A failure here doesn't fail + // the whole Context() call — the legacy fields are still valuable + // on their own, and the digest is an enhancement, not a contract. + if d, err := w.Digest(ctx); err == nil { + wctx.Cloud = d.Cloud + wctx.Recents = d.Recents + wctx.Areas = d.Areas + wctx.Markdown = d.Markdown + } else { + slog.Warn("context digest enrichment failed", slog.Any("error", err)) + } + + return wctx, nil } // --- locking --- diff --git a/internal/wiki/wiki.go b/internal/wiki/wiki.go index 98f35de..e73c3fe 100644 --- a/internal/wiki/wiki.go +++ b/internal/wiki/wiki.go @@ -44,11 +44,40 @@ type SearchResult struct { Snippet string `json:"snippet"` } -// WikiContext provides an overview of the wiki for orientation. +// WikiContext provides an overview of the wiki for orientation. The +// legacy fields (PageCount, RecentPages, TopLevelDirs) reflect disk +// state — recent_pages is sorted by file mtime, top_level_dirs is read +// from the filesystem — and remain available for clients that already +// depend on that shape (opencode, Claude Code in the wild, per the +// plan's open question #4). +// +// The newer fields (Cloud, Recents, Areas, Markdown) are the digest +// signals: cloud terms across all page bodies, the active-use LRU +// (intent, not mtime), per-area page counts pulled from the index, +// and the rendered markdown an LLM can use directly. New clients +// should prefer `get_wiki_digest` for these, but `get_wiki_context` +// returns them too so existing tool wiring still benefits from the +// orientation upgrade without a client change. 
type WikiContext struct { PageCount int `json:"page_count"` RecentPages []Page `json:"recent_pages"` TopLevelDirs []string `json:"top_level_dirs"` + + // Cloud is the top-K word/phrase cloud across all page bodies. + // Empty until the first ticker fires on a freshly-opened wiki. + Cloud []CloudTerm `json:"cloud_terms,omitempty"` + // Recents is the active-use LRU — paths the user/agent actually + // touched (Create/Update/Get/Move/GetBacklinks). Distinct from + // RecentPages which is mtime-based. + Recents []string `json:"recents,omitempty"` + // Areas is the per-top-level-directory page count + index title. + // Driven by the indexed `pages` table, not filesystem listing. + Areas []AreaSummary `json:"areas,omitempty"` + // Markdown is the rendered digest blob — the same string an LLM + // would consume from `get_wiki_digest`. Included here so the + // existing get_wiki_context call gives clients an upgrade path + // without a tool-name change. + Markdown string `json:"markdown,omitempty"` } // Wiki is the core engine. Create one with Open(). From 389e1142cd00127d13dfd573f48cff4dca7c82d9 Mon Sep 17 00:00:00 2001 From: aniongithub Date: Sat, 23 May 2026 23:49:49 -0700 Subject: [PATCH 6/8] feat(digest): background tickers for cloud rebuild and recents flush MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A new internal/digest.Manager mirrors internal/sync.Manager's shape: NewManager(*wiki.Wiki, Options) → Start(ctx) / Stop() lifecycle, with the embedder (cmd/mind-map) supervising. Sync's separation between storage engine and goroutine-owning supervisor is a good pattern; the digest follows it so cmd/mind-map sees a uniform 'subsystems are supervised, not implicit' model. Two tickers in one goroutine: - cloud_refresh (5m default): full cloud rebuild via Wiki.BuildCloud, SetCloud, PersistCloud. 
Synchronous first build on Start() so the very first post-open digest read has cloud terms — cold start over a 1k-page wiki is < 100ms. - recents_refresh (30s default): gated PersistRecents call. Skips SQLite writes on idle servers via a non-mutating peekDirty probe; only takeDirty (which clears the flag) runs after a successful write. Shutdown contract: Stop() cancels the loop's context, the loop runs one final detached-context flushRecents so the last ~30s of touches land on disk, then closes done. Idempotent via sync.Once on both Start and Stop. The pairing 'defer dm.Stop(); defer w.Close()' in cmd/mind-map ensures the ticker quiesces before the DB closes (prevents 'sql: database is closed' races during shutdown). Exposed helpers on *Wiki: - BuildCloud / SetCloud / PersistCloud — public entry points for the supervisor; the lowercase internals stay for tests. - PersistRecents — clears dirty only on a successful write so a failed persist retries on the next tick rather than dropping the diff silently. - RecentsDirty — read-only peek, used by the manager's gate. Wiring: both runStdio and runHTTPServer in cmd/mind-map start a manager after wiki.Open and Stop it before w.Close. The HTTP path derives the manager's context from stopCh so /api/restart and ctrl+C take down the tickers cleanly. The service-mode launcher delegates to runHTTPServer so it picks up the wiring for free. Step 6 of the digest plan (mind-map/plans/digest). 
--- cmd/mind-map/main.go | 29 +++++ internal/digest/manager.go | 201 ++++++++++++++++++++++++++++++++ internal/digest/manager_test.go | 139 ++++++++++++++++++++++ internal/wiki/cloud.go | 24 ++++ internal/wiki/recents.go | 10 ++ internal/wiki/state.go | 27 +++++ 6 files changed, 430 insertions(+) create mode 100644 internal/digest/manager.go create mode 100644 internal/digest/manager_test.go diff --git a/cmd/mind-map/main.go b/cmd/mind-map/main.go index 0090799..f141c7d 100644 --- a/cmd/mind-map/main.go +++ b/cmd/mind-map/main.go @@ -12,6 +12,7 @@ import ( "time" "github.com/aniongithub/mind-map/internal/config" + "github.com/aniongithub/mind-map/internal/digest" "github.com/aniongithub/mind-map/internal/httpapi" "github.com/aniongithub/mind-map/internal/logging" mindmcp "github.com/aniongithub/mind-map/internal/mcp" @@ -93,6 +94,14 @@ func runStdio(cmd *cobra.Command, args []string) error { } defer w.Close() + // Spin up the digest's background maintenance (cloud rebuild + + // recents flush) for the duration of the stdio session. Stop + // before Close so a mid-rebuild ticker doesn't race the DB + // shutdown. + dm := digest.NewManager(w, digest.Options{}) + dm.Start(cmd.Context()) + defer dm.Stop() + s := mindmcp.NewServer(w, nil, getVersion()) slog.Info("mind-map MCP server starting", slog.String("mode", "stdio"), slog.String("wiki", w.Root())) return s.MCPServer().Run(cmd.Context(), &mcpsdk.StdioTransport{}) @@ -151,6 +160,26 @@ func runHTTPServer(addr, dir, webuiDir string, idleTimeout time.Duration, stopCh } defer w.Close() + // Background digest maintenance runs for the lifetime of the + // HTTP server. We use a context derived from stopCh so that the + // graceful /api/restart path (which closes stopCh) also stops + // the tickers cleanly. Stopping before Close ensures the LRU + // flush in the manager's final tick doesn't race with db.Close. 
+ dctx, dcancel := context.WithCancel(context.Background()) + defer dcancel() + go func() { + select { + case <-stopCh: + dcancel() + case <-dctx.Done(): + // Normal function return; the defer above cancelled us. + return + } + }() + dm := digest.NewManager(w, digest.Options{}) + dm.Start(dctx) + defer dm.Stop() + cfgPath := config.DefaultPath() cfg, err := config.Load(cfgPath) if err != nil { diff --git a/internal/digest/manager.go b/internal/digest/manager.go new file mode 100644 index 0000000..c6c0c10 --- /dev/null +++ b/internal/digest/manager.go @@ -0,0 +1,201 @@ +// Package digest runs the background maintenance for a wiki's per- +// conversation orientation digest: a periodic rebuild of the word/ +// phrase cloud and a periodic flush of the active-use recents LRU +// to SQLite. +// +// The package mirrors internal/sync in shape: a Manager constructed +// over a *wiki.Wiki, with Start(ctx) / Stop() lifecycle that the +// embedder (cmd/mind-map, internal/httpapi) supervises. Keeping the +// tickers out of the Wiki itself preserves the same separation sync +// already established — the storage engine has no goroutines of its +// own; lifecycle is the embedder's concern. +package digest + +import ( + "context" + "log/slog" + "sync" + "time" + + "github.com/aniongithub/mind-map/internal/wiki" +) + +// Default tick intervals match the plan. Config-driven overrides +// land in Step 7; until then these are the only knobs and they're +// reasonable for any wiki size below the millions of pages. +const ( + defaultCloudRefresh = 5 * time.Minute + defaultRecentsRefresh = 30 * time.Second + + // defaultCloudSize matches the plan's cloud_size default. The + // top-K selection is the only knob that materially affects the + // rendered digest's word density; everything else is plumbing. + defaultCloudSize = 50 +) + +// Manager runs the two background tickers (cloud rebuild + recents +// flush) for a single wiki. 
Construct one with NewManager, hand its +// Start a context tied to the process lifetime, and call Stop before +// closing the wiki — closing the wiki out from under a mid-rebuild +// ticker is a `sql: database is closed` race waiting to happen. +// +// Safe for concurrent Start/Stop (idempotent via sync.Once); a single +// Manager is one-shot — once Stop has been called, the Manager cannot +// be Started again. Construct a fresh one if you need a restart. +type Manager struct { + w *wiki.Wiki + + cloudRefresh time.Duration + recentsRefresh time.Duration + cloudSize int + + startOnce sync.Once + stopOnce sync.Once + cancel context.CancelFunc + done chan struct{} +} + +// Options tunes Manager behavior. Zero-value Options uses the +// package defaults (5m cloud rebuild, 30s recents flush, top-50 +// cloud terms). Step 7 will wire these through config.json. +type Options struct { + CloudRefresh time.Duration + RecentsRefresh time.Duration + CloudSize int + // StopwordsExtra appends to the built-in English stopword list. + // Mirrors plan's digest.stopwords_extra config knob. + StopwordsExtra []string +} + +// NewManager constructs an unstarted Manager. Pass zero Options for +// defaults. +func NewManager(w *wiki.Wiki, opts Options) *Manager { + if opts.CloudRefresh <= 0 { + opts.CloudRefresh = defaultCloudRefresh + } + if opts.RecentsRefresh <= 0 { + opts.RecentsRefresh = defaultRecentsRefresh + } + if opts.CloudSize <= 0 { + opts.CloudSize = defaultCloudSize + } + return &Manager{ + w: w, + cloudRefresh: opts.CloudRefresh, + recentsRefresh: opts.RecentsRefresh, + cloudSize: opts.CloudSize, + } +} + +// Start kicks off the two tickers. Idempotent: a second call is a +// no-op. Returns immediately after spawning goroutines; use Stop to +// wait for clean shutdown. +// +// The cloud is rebuilt synchronously once before the goroutine loop +// starts so a freshly-opened wiki has cloud terms in its digest +// without a 5-minute warm-up. 
On cold start over a 1k-page wiki this +// takes < 100ms; we accept that latency on Start so the first +// post-open digest read is useful. +func (m *Manager) Start(ctx context.Context) { + m.startOnce.Do(func() { + ctx, m.cancel = context.WithCancel(ctx) + m.done = make(chan struct{}) + + // Synchronous first build so cold-start digests have an + // About: line. We deliberately don't gate on whether a + // persisted cloud was loaded: even if it was, the on-disk + // content may have shifted while the server was off, and + // the cost is small. A failure here logs and continues — + // the tickers below will retry. + m.rebuildCloud(ctx) + + go m.run(ctx) + slog.Info("digest manager started", + slog.Duration("cloud_refresh", m.cloudRefresh), + slog.Duration("recents_refresh", m.recentsRefresh), + slog.Int("cloud_size", m.cloudSize), + ) + }) +} + +// Stop cancels the tickers and blocks until the loop goroutine has +// exited. Idempotent. Safe to call after Start, after another Stop, +// or even without ever calling Start (in which case it returns +// immediately). +// +// A final recents flush runs as the loop exits so the last few touches +// between ticker fires aren't lost on shutdown. The Wiki's own Close() +// also calls persistRecents — both paths converge on the same row, +// and the SQLite write is atomic, so the redundancy is harmless. +func (m *Manager) Stop() { + m.stopOnce.Do(func() { + if m.cancel == nil { + return // Stop without Start: nothing to do. + } + m.cancel() + <-m.done + slog.Info("digest manager stopped") + }) +} + +// run is the goroutine that drives both tickers. The cloud rebuild +// is much heavier than the recents flush, but both are well below the +// 30s recents tick on any reasonable wiki size, so a shared goroutine +// with two tickers is simpler than two goroutines and adequately +// non-blocking for the workload. 
+func (m *Manager) run(ctx context.Context) { + defer close(m.done) + + cloudTick := time.NewTicker(m.cloudRefresh) + defer cloudTick.Stop() + recentsTick := time.NewTicker(m.recentsRefresh) + defer recentsTick.Stop() + + for { + select { + case <-ctx.Done(): + // Final flush so we don't lose the last ~30s of + // touches. Use a detached background context: the + // loop's ctx is already cancelled, but the DB write + // itself should still get a chance to complete. + m.flushRecents(context.Background()) + return + case <-cloudTick.C: + m.rebuildCloud(ctx) + case <-recentsTick.C: + m.flushRecents(ctx) + } + } +} + +// rebuildCloud runs one cloud rebuild + persistence cycle. Failures +// are logged and swallowed — the digest must degrade gracefully on +// transient errors rather than crashing a long-running service. +func (m *Manager) rebuildCloud(ctx context.Context) { + start := time.Now() + terms, err := m.w.BuildCloud(ctx, m.cloudSize, nil) + if err != nil { + slog.Warn("digest cloud rebuild failed", slog.Any("error", err)) + return + } + m.w.SetCloud(terms) + if err := m.w.PersistCloud(ctx); err != nil { + slog.Warn("digest cloud persist failed", slog.Any("error", err)) + } + slog.Info("digest cloud rebuilt", + slog.Int("terms", len(terms)), + slog.Duration("elapsed", time.Since(start)), + ) +} + +// flushRecents writes the LRU to wiki_state if it's been touched +// since the last write. The dirty gate avoids gratuitous SQLite writes +// on an idle server. 
+func (m *Manager) flushRecents(ctx context.Context) { + if !m.w.RecentsDirty() { + return + } + if err := m.w.PersistRecents(ctx); err != nil { + slog.Warn("digest recents persist failed", slog.Any("error", err)) + } +} diff --git a/internal/digest/manager_test.go b/internal/digest/manager_test.go new file mode 100644 index 0000000..d0f2759 --- /dev/null +++ b/internal/digest/manager_test.go @@ -0,0 +1,139 @@ +package digest + +import ( + "context" + "os" + "path/filepath" + "strings" + "testing" + "time" + + "github.com/aniongithub/mind-map/internal/wiki" +) + +// testWiki creates a temporary wiki with a few seed pages so the +// cloud rebuild has something to count. Kept private to this test +// file — the public wiki package has its own testWiki, but we can't +// import test helpers across packages. +func testWiki(t *testing.T) *wiki.Wiki { + t.Helper() + dir := t.TempDir() + + pages := map[string]string{ + "index.md": "# Home\n\nThis wiki is about mind-map, digest, and SQLite.\n", + "projects/mind-map.md": "# mind-map\n\nA wiki engine. SQLite-backed. Digest support.\n", + "notes/sqlite.md": "# SQLite\n\nSQLite is fast and embedded. mind-map uses SQLite.\n", + } + for name, content := range pages { + full := filepath.Join(dir, name) + if err := os.MkdirAll(filepath.Dir(full), 0o755); err != nil { + t.Fatalf("mkdir %s: %v", name, err) + } + if err := os.WriteFile(full, []byte(content), 0o644); err != nil { + t.Fatalf("seed %s: %v", name, err) + } + } + + w, err := wiki.Open(dir) + if err != nil { + t.Fatalf("Open: %v", err) + } + t.Cleanup(func() { w.Close() }) + return w +} + +func TestManager_StartTriggersImmediateCloudRebuild(t *testing.T) { + w := testWiki(t) + + m := NewManager(w, Options{ + // Long tick so the ticker doesn't fire during the test — + // we want to assert the *synchronous* initial build only. 
+ CloudRefresh: time.Hour, + RecentsRefresh: time.Hour, + }) + m.Start(context.Background()) + defer m.Stop() + + // After Start, the cloud cache should be populated and the digest + // markdown should contain an About: line. + d, err := w.Digest(context.Background()) + if err != nil { + t.Fatalf("Digest: %v", err) + } + if len(d.Cloud) == 0 { + t.Fatalf("cloud should be populated after Start, got empty") + } + if !strings.Contains(d.Markdown, "About:") { + t.Fatalf("digest missing About: line:\n%s", d.Markdown) + } +} + +func TestManager_StopIsIdempotent(t *testing.T) { + w := testWiki(t) + m := NewManager(w, Options{CloudRefresh: time.Hour, RecentsRefresh: time.Hour}) + m.Start(context.Background()) + + m.Stop() + m.Stop() // second Stop must not panic or block +} + +func TestManager_StopWithoutStartIsNoOp(t *testing.T) { + w := testWiki(t) + m := NewManager(w, Options{}) + m.Stop() // must not panic, must not hang +} + +func TestManager_RecentsFlushOnTick(t *testing.T) { + w := testWiki(t) + ctx := context.Background() + + m := NewManager(w, Options{ + CloudRefresh: time.Hour, + RecentsRefresh: 50 * time.Millisecond, + }) + m.Start(ctx) + defer m.Stop() + + // Touch via a real Wiki op so dirty flips on. + if _, err := w.GetPage(ctx, "index"); err != nil { + t.Fatalf("GetPage: %v", err) + } + if !w.RecentsDirty() { + t.Fatalf("LRU should be dirty after GetPage") + } + + // Wait for the ticker to fire and flush. + deadline := time.Now().Add(time.Second) + for time.Now().Before(deadline) { + if !w.RecentsDirty() { + return // success: ticker flushed and cleared dirty + } + time.Sleep(20 * time.Millisecond) + } + t.Fatalf("LRU still dirty after 1s; ticker did not flush") +} + +func TestManager_StopFlushesRecents(t *testing.T) { + w := testWiki(t) + ctx := context.Background() + + m := NewManager(w, Options{ + // Long ticks so only the Stop-time flush can save us. 
+ CloudRefresh: time.Hour, + RecentsRefresh: time.Hour, + }) + m.Start(ctx) + + if _, err := w.GetPage(ctx, "index"); err != nil { + t.Fatalf("GetPage: %v", err) + } + if !w.RecentsDirty() { + t.Fatalf("LRU should be dirty after touch") + } + + m.Stop() + + if w.RecentsDirty() { + t.Fatalf("Stop should have flushed dirty LRU; still dirty") + } +} diff --git a/internal/wiki/cloud.go b/internal/wiki/cloud.go index 3075ade..5fab2ba 100644 --- a/internal/wiki/cloud.go +++ b/internal/wiki/cloud.go @@ -201,6 +201,30 @@ func topK(counts map[string]int, k int) []CloudTerm { // // Caller owns the goroutine and the slot it's stored in; this function // just does the work. Step 6 wires it to the 5-minute ticker. +// BuildCloud computes the top-K most frequent terms across all page +// bodies. Exposed for the digest.Manager ticker — the implementation +// lives on the Wiki because it reads `pages` directly; the supervisor +// owns the scheduling. +// +// The result mixes unigrams and bigrams: bigrams are scored by their +// own frequency (no boost), so a phrase only beats a single word when +// it genuinely occurs more often. +func (w *Wiki) BuildCloud(ctx context.Context, k int, stopwordsExtra []string) ([]CloudTerm, error) { + return w.buildCloud(ctx, k, stopwordsExtra) +} + +// SetCloud installs a freshly-built cloud into the in-memory cache. +// Pairs with BuildCloud; the supervisor calls Build → Set → Persist. +func (w *Wiki) SetCloud(terms []CloudTerm) { + w.cloud.Set(terms) +} + +// PersistCloud writes the current cloud cache to wiki_state. Called +// by the digest.Manager after a successful rebuild. 
+func (w *Wiki) PersistCloud(ctx context.Context) error { + return w.persistCloud(ctx) +} + func (w *Wiki) buildCloud(ctx context.Context, k int, stopwordsExtra []string) ([]CloudTerm, error) { if err := ctx.Err(); err != nil { return nil, err diff --git a/internal/wiki/recents.go b/internal/wiki/recents.go index 0707e26..2a45055 100644 --- a/internal/wiki/recents.go +++ b/internal/wiki/recents.go @@ -212,6 +212,16 @@ func (r *recentsLRU) takeDirty() bool { return was } +// peekDirty returns whether the ring has unsaved changes without +// clearing the flag. Used by the digest.Manager's tick gate so the +// "did anything change?" probe doesn't race with the write that +// follows. +func (r *recentsLRU) peekDirty() bool { + r.mu.Lock() + defer r.mu.Unlock() + return r.dirty +} + // len returns the number of tracked paths. Test helper. func (r *recentsLRU) len() int { r.mu.Lock() diff --git a/internal/wiki/state.go b/internal/wiki/state.go index a621415..79bb030 100644 --- a/internal/wiki/state.go +++ b/internal/wiki/state.go @@ -196,3 +196,30 @@ func (w *Wiki) persistCloud(ctx context.Context) error { } return w.writeStateKey(ctx, stateKeyCloud, string(data)) } + +// PersistRecents is the exported entry point for the digest.Manager's +// 30-second flush ticker. The internal persistRecents helper is also +// called by Close() for a clean shutdown flush. +// +// PersistRecents clears the LRU's dirty flag on success: a follow-up +// RecentsDirty() will report false until the next touch. Callers that +// want to skip a redundant write should peek with RecentsDirty before +// calling this; PersistRecents itself always writes. +func (w *Wiki) PersistRecents(ctx context.Context) error { + if err := w.persistRecents(ctx); err != nil { + return err + } + // Clear dirty only after a successful write — if the write failed, + // the in-memory state is still ahead of disk and the next tick + // should retry. 
+ w.recents.takeDirty() + return nil +} + +// RecentsDirty reports whether the LRU has unsaved changes since the +// last successful PersistRecents. Read-only — does not clear the flag. +// The digest.Manager uses this to skip redundant writes on an idle +// server. +func (w *Wiki) RecentsDirty() bool { + return w.recents.peekDirty() +} From 97aa7f9546c28a14baf885e19b30995bb275c1ed Mon Sep 17 00:00:00 2001 From: aniongithub Date: Sun, 24 May 2026 00:03:29 -0700 Subject: [PATCH 7/8] feat: digest config knobs and documentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds the digest section to config.json with the five knobs called out in the plan: { "digest": { "cloud_size": 50, // top-K terms in cloud "recents_size": 20, // active-use LRU capacity "cloud_refresh": "5m", // rebuild interval (>=30s) "stopwords_extra": ["TODO"], // appends to built-in EN list "max_render_bytes": 4096 // soft cap on rendered markdown } } All fields are optional. A legacy config without a digest section loads cleanly and yields zero-valued fields that consumers interpret as 'use built-in defaults' — covered by an explicit backwards-compat test. ParseCloudRefresh floors at 30 seconds: any faster is wasted CPU for a signal nobody reads that often. Wiring: - wiki.Open(dir, opts ...OpenOption) — added variadic options so Open(dir) callers (10 in the tree, mostly tests) keep compiling unchanged. WithOptions(wiki.Options{...}) sets RecentsSize, MaxRenderBytes, and StopwordsExtra in one call. MaxRenderBytes semantics: > 0 trims, == 0 uses default, < 0 disables trimming. - cmd/mind-map: both runStdio and runHTTPServer now load config before opening the wiki, pass digest tunables through helpers wikiOptionsFromConfig / digestOptionsFromConfig. Stdio mode previously bypassed config entirely; now both modes are consistent and a single config.json controls both. 
- digest.Manager: StopwordsExtra is now forwarded into BuildCloud on every tick rebuild, not just the synchronous first build. The plumbing existed but was dropped on the floor — fixed. Docs: - SKILL.md: rewritten Getting Oriented section to feature get_wiki_digest as the canonical 'start of conversation' call, with get_wiki_context retained for backwards compatibility. Tool list updated. - README.md: tool count 10 → 11, new get_wiki_digest row, the legacy get_wiki_context row mentions it now returns digest fields too, and Wiki Features gets a digest bullet. Step 7 of the digest plan (mind-map/plans/digest). Plan now fully implemented end-to-end. --- README.md | 6 ++- SKILL.md | 22 ++++++++- cmd/mind-map/main.go | 54 ++++++++++++++++++----- internal/config/config.go | 48 +++++++++++++++++++- internal/config/config_test.go | 78 ++++++++++++++++++++++++++++++++ internal/digest/manager.go | 4 +- internal/wiki/digest.go | 15 ++++++- internal/wiki/wiki.go | 81 +++++++++++++++++++++++++++++----- 8 files changed, 280 insertions(+), 28 deletions(-) diff --git a/README.md b/README.md index 22b9ba1..0ceafdb 100644 --- a/README.md +++ b/README.md @@ -84,12 +84,13 @@ The web UI is a static Preact app served by `mind-map serve` over HTTP. It uses Both modes use the same wiki engine and the same wiki directory (`~/.mind-map/wiki` by default). Multiple stdio processes can safely share the same wiki via SQLite page locking. -## MCP Tools (10 total) +## MCP Tools (11 total) | Tool | Description | |------|-------------| | `search_pages` | Full-text search across page titles and content (SQLite FTS5) | -| `get_wiki_context` | Wiki overview: page count, top-level directories, recent pages | +| `get_wiki_digest` | Per-conversation orientation: page count, word/phrase cloud, active-use recents LRU, per-area counts, ~4 KB rendered markdown. Call this at the start of every new conversation. 
| +| `get_wiki_context` | Wiki overview: page count, top-level directories, recent pages (mtime-sorted). Also returns the digest fields for new clients. | | `get_page` | Read a page with parsed frontmatter, body, outgoing links, and backlinks | | `create_page` | Create a new page (markdown with optional YAML frontmatter) | | `update_page` | Update an existing page's content | @@ -102,6 +103,7 @@ Both modes use the same wiki engine and the same wiki directory (`~/.mind-map/wi ## Wiki Features +- **Per-conversation digest**: a compact orientation blob (cloud of top terms, recents LRU, area counts, rendered markdown) for LLMs to consume at conversation start. Always-current; background job rebuilds every 5 minutes; persisted to SQLite across restarts. - **YAML frontmatter**: structured metadata on every page (`title`, `type`, `status`, custom fields) - **Wikilinks**: `[[target]]` and `[[display|target]]` syntax, resolved to clickable links - **Backlink index**: every page knows what links to it diff --git a/SKILL.md b/SKILL.md index 97ac592..5845ab2 100644 --- a/SKILL.md +++ b/SKILL.md @@ -4,6 +4,7 @@ description: A wiki for AI agents and humans -- search, read, and write markdown tools: - search_pages - get_wiki_context + - get_wiki_digest - get_page - create_page - update_page @@ -47,12 +48,29 @@ Use mind-map as your **persistent memory**: ## Getting Oriented -**Always start by understanding what's already in the wiki:** +**Always start a new conversation with the digest:** +``` +get_wiki_digest() +→ returns a compact markdown blob: page count, top word/phrase cloud + (what this wiki is about), pages you or other agents recently + touched (intent, not file-mtime), and per-area page counts. + ~4 KB cap, ~1K tokens — designed to fit any context budget. +``` + +The digest is always-current: a background job rebuilds the cloud +every few minutes and the recents LRU updates on every page op. +Persisted to SQLite so a fresh server restart already has signal. 
+ +If you need the legacy mtime-sorted "recently modified pages" list +or the filesystem-derived top-level directory list, call: ``` get_wiki_context() -→ returns page count, top-level directories, and 20 most recently modified pages +→ same shape as before, plus the digest fields layered on for free. ``` +New clients should prefer `get_wiki_digest`; `get_wiki_context` +remains for backwards compatibility. + ## Searching ``` diff --git a/cmd/mind-map/main.go b/cmd/mind-map/main.go index f141c7d..a0234c6 100644 --- a/cmd/mind-map/main.go +++ b/cmd/mind-map/main.go @@ -88,7 +88,14 @@ func init() { func runStdio(cmd *cobra.Command, args []string) error { dir, _ := cmd.Flags().GetString("dir") - w, err := wiki.Open(dir) + cfgPath := config.DefaultPath() + cfg, err := config.Load(cfgPath) + if err != nil { + slog.Warn("failed to load config, using defaults", slog.Any("error", err)) + cfg = config.DefaultConfig() + } + + w, err := wiki.Open(dir, wiki.WithOptions(wikiOptionsFromConfig(cfg))) if err != nil { return fmt.Errorf("open wiki: %w", err) } @@ -98,7 +105,7 @@ func runStdio(cmd *cobra.Command, args []string) error { // recents flush) for the duration of the stdio session. Stop // before Close so a mid-rebuild ticker doesn't race the DB // shutdown. - dm := digest.NewManager(w, digest.Options{}) + dm := digest.NewManager(w, digestOptionsFromConfig(cfg)) dm.Start(cmd.Context()) defer dm.Stop() @@ -107,6 +114,31 @@ func runStdio(cmd *cobra.Command, args []string) error { return s.MCPServer().Run(cmd.Context(), &mcpsdk.StdioTransport{}) } +// wikiOptionsFromConfig maps the digest section of config.Config to +// the construction-time knobs the Wiki cares about (recents capacity, +// render cap, stopword extras). Zero/missing values keep the Wiki's +// own defaults — DigestConfig is documented as fully optional. 
+func wikiOptionsFromConfig(cfg *config.Config) wiki.Options { + d := cfg.Digest + return wiki.Options{ + RecentsSize: d.RecentsSize, + MaxRenderBytes: d.MaxRenderBytes, + StopwordsExtra: d.StopwordsExtra, + } +} + +// digestOptionsFromConfig maps the digest section to the runtime +// (ticker / rebuild) knobs the digest.Manager cares about. Same +// "zero means default" contract. +func digestOptionsFromConfig(cfg *config.Config) digest.Options { + d := cfg.Digest + return digest.Options{ + CloudRefresh: d.ParseCloudRefresh(), + CloudSize: d.CloudSize, + StopwordsExtra: d.StopwordsExtra, + } +} + func runServe(cmd *cobra.Command, args []string) error { dir, _ := cmd.Flags().GetString("dir") logFile, _ := cmd.Flags().GetString("log-file") @@ -154,7 +186,14 @@ func runServe(cmd *cobra.Command, args []string) error { // runHTTPServer wires the HTTP handler from internal/httpapi and serves it. // Shared by the interactive `serve` command and the system service. func runHTTPServer(addr, dir, webuiDir string, idleTimeout time.Duration, stopCh chan struct{}) error { - w, err := wiki.Open(dir) + cfgPath := config.DefaultPath() + cfg, err := config.Load(cfgPath) + if err != nil { + slog.Warn("failed to load config, using defaults", slog.Any("error", err)) + cfg = config.DefaultConfig() + } + + w, err := wiki.Open(dir, wiki.WithOptions(wikiOptionsFromConfig(cfg))) if err != nil { return fmt.Errorf("open wiki: %w", err) } @@ -176,17 +215,10 @@ func runHTTPServer(addr, dir, webuiDir string, idleTimeout time.Duration, stopCh return } }() - dm := digest.NewManager(w, digest.Options{}) + dm := digest.NewManager(w, digestOptionsFromConfig(cfg)) dm.Start(dctx) defer dm.Stop() - cfgPath := config.DefaultPath() - cfg, err := config.Load(cfgPath) - if err != nil { - slog.Warn("failed to load config, using defaults", slog.Any("error", err)) - cfg = config.DefaultConfig() - } - handler := httpapi.New(httpapi.Deps{ Wiki: w, CfgPath: cfgPath, diff --git a/internal/config/config.go 
b/internal/config/config.go index 0e447a4..84dc10f 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -120,9 +120,55 @@ func (s *SyncConfig) Remotes() []string { return remotes } +// DigestConfig holds tunables for the per-conversation orientation +// digest (cloud rebuild, recents LRU, render cap, stopword extras). +// All fields are optional; zero or invalid values fall back to the +// built-in defaults. Documented in detail in mind-map/plans/digest. +type DigestConfig struct { + // CloudSize caps the top-K terms surfaced in the word cloud. + // Default 50. Tunable up if your wiki is large enough that 50 + // terms feels too sparse; down if context budget is tight. + CloudSize int `json:"cloud_size,omitempty"` + + // RecentsSize caps the active-use LRU ring. Default 20. Applied + // at wiki Open; live changes via /api/settings take effect after + // the next server restart. + RecentsSize int `json:"recents_size,omitempty"` + + // CloudRefresh controls how often the cloud rebuilds. Default 5m. + // Accepts any time.ParseDuration value; values below 30 seconds + // fall back to the 5m default so a busy wiki doesn't burn CPU. + CloudRefresh string `json:"cloud_refresh,omitempty"` + + // StopwordsExtra extends the built-in English stopword list. + // Words are case-folded on load. Useful for domain-specific + // noise like "TODO" or "FIXME". + StopwordsExtra []string `json:"stopwords_extra,omitempty"` + + // MaxRenderBytes caps the rendered markdown blob. Default 4096 + // (~1K tokens for most LLMs). Trim discipline when over: drop + // recents, then cloud, never areas/header/footer. + MaxRenderBytes int `json:"max_render_bytes,omitempty"` +} + +// ParseCloudRefresh returns the cloud rebuild interval. Returns the +// default (5m) if empty, invalid, or below 30 seconds — anything +// faster is wasted CPU for a signal nobody reads that often.
+func (d *DigestConfig) ParseCloudRefresh() time.Duration { + if d.CloudRefresh == "" { + return 5 * time.Minute + } + v, err := time.ParseDuration(d.CloudRefresh) + if err != nil || v < 30*time.Second { + return 5 * time.Minute + } + return v +} + // Config holds all runtime settings. type Config struct { - Sync SyncConfig `json:"sync"` + Sync SyncConfig `json:"sync"` + Digest DigestConfig `json:"digest,omitempty"` } // DefaultConfig returns a Config with sensible defaults. diff --git a/internal/config/config_test.go b/internal/config/config_test.go index 0af2e6a..bf7acb6 100644 --- a/internal/config/config_test.go +++ b/internal/config/config_test.go @@ -172,3 +172,81 @@ func TestSaveAndLoad(t *testing.T) { t.Errorf("loaded mapping prefix = %q", loaded.Sync.Mappings[0].Prefix) } } + +func TestParseCloudRefresh(t *testing.T) { + tests := []struct { + input string + want time.Duration + }{ + {"5m", 5 * time.Minute}, + {"10m", 10 * time.Minute}, + {"1h", 1 * time.Hour}, + // Floor: anything < 30s clamps to the default to protect a + // busy wiki from CPU churn. 
+ {"1s", 5 * time.Minute}, + {"", 5 * time.Minute}, // empty → default + {"junk", 5 * time.Minute}, // invalid → default + } + for _, tc := range tests { + d := DigestConfig{CloudRefresh: tc.input} + if got := d.ParseCloudRefresh(); got != tc.want { + t.Errorf("ParseCloudRefresh(%q) = %v, want %v", tc.input, got, tc.want) + } + } +} + +func TestDigestConfig_RoundtripJSON(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "config.json") + + cfg := DefaultConfig() + cfg.Digest.CloudSize = 75 + cfg.Digest.RecentsSize = 30 + cfg.Digest.CloudRefresh = "10m" + cfg.Digest.StopwordsExtra = []string{"TODO", "FIXME"} + cfg.Digest.MaxRenderBytes = 8192 + + if err := Save(path, cfg); err != nil { + t.Fatalf("Save: %v", err) + } + loaded, err := Load(path) + if err != nil { + t.Fatalf("Load: %v", err) + } + if loaded.Digest.CloudSize != 75 { + t.Errorf("CloudSize = %d, want 75", loaded.Digest.CloudSize) + } + if loaded.Digest.RecentsSize != 30 { + t.Errorf("RecentsSize = %d, want 30", loaded.Digest.RecentsSize) + } + if loaded.Digest.ParseCloudRefresh() != 10*time.Minute { + t.Errorf("CloudRefresh = %v, want 10m", loaded.Digest.ParseCloudRefresh()) + } + if len(loaded.Digest.StopwordsExtra) != 2 || loaded.Digest.StopwordsExtra[0] != "TODO" { + t.Errorf("StopwordsExtra = %v", loaded.Digest.StopwordsExtra) + } + if loaded.Digest.MaxRenderBytes != 8192 { + t.Errorf("MaxRenderBytes = %d, want 8192", loaded.Digest.MaxRenderBytes) + } +} + +func TestDigestConfig_BackwardsCompatible(t *testing.T) { + // A config file written before the digest section existed must + // still load without errors and yield zero-valued digest fields + // (which the consumers treat as "use defaults"). 
+ dir := t.TempDir() + path := filepath.Join(dir, "config.json") + if err := os.WriteFile(path, []byte(`{"sync":{"enabled":false,"interval":"30s"}}`), 0o600); err != nil { + t.Fatalf("write legacy config: %v", err) + } + loaded, err := Load(path) + if err != nil { + t.Fatalf("Load legacy config: %v", err) + } + if loaded.Digest.CloudSize != 0 { + t.Errorf("expected zero CloudSize on legacy config, got %d", loaded.Digest.CloudSize) + } + if loaded.Digest.ParseCloudRefresh() != 5*time.Minute { + t.Errorf("expected default 5m on legacy config, got %v", loaded.Digest.ParseCloudRefresh()) + } +} diff --git a/internal/digest/manager.go b/internal/digest/manager.go index c6c0c10..9282464 100644 --- a/internal/digest/manager.go +++ b/internal/digest/manager.go @@ -48,6 +48,7 @@ type Manager struct { cloudRefresh time.Duration recentsRefresh time.Duration cloudSize int + stopwordsExtra []string startOnce sync.Once stopOnce sync.Once @@ -84,6 +85,7 @@ func NewManager(w *wiki.Wiki, opts Options) *Manager { cloudRefresh: opts.CloudRefresh, recentsRefresh: opts.RecentsRefresh, cloudSize: opts.CloudSize, + stopwordsExtra: opts.StopwordsExtra, } } @@ -173,7 +175,7 @@ func (m *Manager) run(ctx context.Context) { // transient errors rather than crashing a long-running service. 
func (m *Manager) rebuildCloud(ctx context.Context) { start := time.Now() - terms, err := m.w.BuildCloud(ctx, m.cloudSize, nil) + terms, err := m.w.BuildCloud(ctx, m.cloudSize, m.stopwordsExtra) if err != nil { slog.Warn("digest cloud rebuild failed", slog.Any("error", err)) return diff --git a/internal/wiki/digest.go b/internal/wiki/digest.go index e61da64..468a98c 100644 --- a/internal/wiki/digest.go +++ b/internal/wiki/digest.go @@ -125,12 +125,25 @@ func (w *Wiki) Digest(ctx context.Context) (*Digest, error) { Recents: recents, Areas: areas, } - d.Markdown = renderDigestMarkdown(d, defaultMaxRenderBytes) + d.Markdown = renderDigestMarkdown(d, w.renderCap()) w.digest.set(cloudVer, recentsSeq, pageCount, d) return d, nil } +// renderCap returns the effective byte cap to pass into the markdown +// renderer. Normalized in Open() to: +// +// > 0 → trim to that size +// == 0 → defaulted, never observed here +// < 0 → no trimming +// +// The renderer treats <= 0 uniformly as "no trim," so we forward +// negative values straight through. +func (w *Wiki) renderCap() int { + return w.maxRenderBytes +} + // pageCount runs the same SELECT COUNT(*) the Context handler uses. // Lifted into a helper so Digest can share it. func (w *Wiki) pageCount(ctx context.Context) (int, error) { diff --git a/internal/wiki/wiki.go b/internal/wiki/wiki.go index e73c3fe..2c668ac 100644 --- a/internal/wiki/wiki.go +++ b/internal/wiki/wiki.go @@ -80,6 +80,38 @@ type WikiContext struct { Markdown string `json:"markdown,omitempty"` } +// Options tunes Wiki construction. All fields are optional; the zero +// value gives the built-in defaults (recents capacity 20, render cap +// 4 KB, no extra stopwords). Pass with WithOptions to Open(): +// +// w, err := wiki.Open(dir, wiki.WithOptions(wiki.Options{ +// RecentsSize: 50, +// MaxRenderBytes: 8192, +// })) +// +// Options is value-passed; mutating it after Open has no effect. +type Options struct { + // RecentsSize is the active-use LRU capacity. 
Default 20. + RecentsSize int + // MaxRenderBytes caps the rendered digest markdown. Default 4096 (0 = default, negative = no trim). + MaxRenderBytes int + // StopwordsExtra is forwarded to the cloud builder when invoked + // directly via BuildCloud. The digest.Manager passes its own + // copy through Options on its Manager; this field is here so + // non-Manager callers (tests, ad-hoc tools) get the same set. + StopwordsExtra []string +} + +// OpenOption configures wiki.Open. Use WithOptions or future targeted +// helpers; the variadic form keeps Open(dir) source-compatible. +type OpenOption func(*Options) + +// WithOptions sets the entire Options struct in one call. The most +// common embedder path: read config, build Options, pass to Open. +func WithOptions(opts Options) OpenOption { + return func(o *Options) { *o = opts } +} + // Wiki is the core engine. Create one with Open(). type Wiki struct { root string // absolute path to wiki directory @@ -95,6 +127,12 @@ type Wiki struct { // digest caches the rendered markdown blob, invalidated by cloud // version + recents seq changes. See digest.go. digest *digestCache + // maxRenderBytes is the soft cap applied by Digest(); negative means + // no trim (Open normalizes 0 to the default). + maxRenderBytes int + // stopwordsExtra is forwarded to buildCloud when called directly + // without an explicit extras list. + stopwordsExtra []string // closed guards Close() against double-invocation: testWiki and // other callers commonly stack defer Close on top of t.Cleanup. // Without this guard, the second Close() runs persistRecents @@ -105,7 +143,31 @@ type Wiki struct { // Open opens (or creates) a wiki rooted at the given directory. // It initializes the SQLite index and performs an initial scan. -func Open(root string) (*Wiki, error) { +// Pass OpenOption values (typically a single WithOptions) to tune the +// digest signals; the default options match the digest plan's +// recommended values (LRU=20, render cap=4096, no extra stopwords).
+func Open(root string, opts ...OpenOption) (*Wiki, error) { + o := Options{ + RecentsSize: 20, + MaxRenderBytes: defaultMaxRenderBytes, + } + for _, fn := range opts { + fn(&o) + } + if o.RecentsSize <= 0 { + o.RecentsSize = 20 + } + // MaxRenderBytes semantics: + // > 0 → trim to that many bytes + // == 0 → fall back to default (4096) — most likely an + // uninitialized Options struct + // < 0 → no trimming (tests / power users) + // The field is normalized to those three states here so digest + // rendering can just check the sign without re-deriving intent. + if o.MaxRenderBytes == 0 { + o.MaxRenderBytes = defaultMaxRenderBytes + } + absRoot, err := filepath.Abs(root) if err != nil { return nil, fmt.Errorf("resolve wiki root: %w", err) @@ -126,15 +188,14 @@ func Open(root string) (*Wiki, error) { sessionID := fmt.Sprintf("pid-%d-%d", os.Getpid(), time.Now().UnixNano()) w := &Wiki{ - root: absRoot, - db: db, - sessionID: sessionID, - // Capacity 20 matches the plan default. Step 4 will swap this - // for a config-driven value (digest.recents_size); the default - // keeps existing callers unaffected. 
+ recents: newRecentsLRU(20), - cloud: &cloudCache{}, - digest: &digestCache{}, + root: absRoot, + db: db, + sessionID: sessionID, + recents: newRecentsLRU(o.RecentsSize), + cloud: &cloudCache{}, + digest: &digestCache{}, + maxRenderBytes: o.MaxRenderBytes, + stopwordsExtra: o.StopwordsExtra, } if err := w.initSchema(); err != nil { db.Close() From 380df0eafdec9b41bad3af5f9ebe508e889bda7a Mon Sep 17 00:00:00 2001 From: aniongithub Date: Sun, 24 May 2026 00:18:52 -0700 Subject: [PATCH 8/8] feat(webui): digest settings section with tag-input for stopwords MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Five new controls in the settings panel, between Sync and Index: - Extra Stopwords → tag-input (comma / space / Enter to commit a chip; Backspace on empty input pops the last) - Cloud Size → number input, blank = server default (50) - Recents Size → number input, blank = server default (20) - Cloud Refresh → text input (5m, 10m, etc.), blank = 5m - Max Render Bytes → number input, 0 disables trim, blank = 4096 A new TagInput component (webui/src/TagInput.tsx) implements the chips UX: type → commit on separator → click × or Backspace to remove. Pasted strings with commas or whitespace fan out into multiple chips in one shot, so an operator can paste 'TODO, FIXME, see also' and get four tags. Duplicate detection is case-insensitive but display preserves what the user typed; the case-folding for matching happens server-side in the cloud builder. CSS uses the existing --accent / --border palette so chips match the rest of the settings UI in both light and dark mode. No backend changes: putSettings already unmarshals the full config.Config (which gained the Digest section in step 7 of the digest plan), so the new fields round-trip transparently. Changes take effect on next restart — same contract as Sync.Interval; the existing 'Settings saved. Restart to apply.' banner already says so.
Closes the loop on the digest plan's stopword tuning observation: operators can now add domain-specific noise words from the UI without editing config.json by hand. --- webui/src/App.tsx | 105 +++++++++++++++++++++++++++++++++ webui/src/TagInput.tsx | 130 +++++++++++++++++++++++++++++++++++++++++ webui/src/styles.css | 71 ++++++++++++++++++++++ 3 files changed, 306 insertions(+) create mode 100644 webui/src/TagInput.tsx diff --git a/webui/src/App.tsx b/webui/src/App.tsx index 4d6c126..e293d34 100644 --- a/webui/src/App.tsx +++ b/webui/src/App.tsx @@ -4,6 +4,7 @@ import { Logo } from './Logo'; import { PageBrowser } from './PageBrowser'; import { GraphView } from './GraphView'; import { searchTokens, searchRegex, Highlighted } from './search'; +import { TagInput } from './TagInput'; import { marked } from 'marked'; import mermaid from 'mermaid'; @@ -16,8 +17,22 @@ interface SyncSettings { mappings?: { prefix: string; remote: string }[]; } +// DigestSettings mirrors internal/config.DigestConfig. All fields are +// optional on the wire — a config file without a digest section +// loads with zero values, which the consumers interpret as "use the +// built-in defaults". The UI surfaces the same contract: empty +// numeric inputs and an empty tag list keep server defaults intact. +interface DigestSettings { + cloud_size?: number; + recents_size?: number; + cloud_refresh?: string; + stopwords_extra?: string[]; + max_render_bytes?: number; +} + interface Settings { sync: SyncSettings; + digest?: DigestSettings; } async function loadSettings(): Promise { @@ -339,6 +354,21 @@ export function App() { setSettingsSaved(false); }; + // updateDigest is the per-field mutator for the Digest section. + // It accepts the field's actual value type (number for numeric + // knobs, string for cloud_refresh, string[] for stopwords). The + // server omits the digest section entirely when it's never been + // set, so we lazily materialize an empty object on first touch. 
+ const updateDigest = (field: K, value: DigestSettings[K]) => { + if (!settings) return; + setSettings({ + ...settings, + digest: { ...(settings.digest ?? {}), [field]: value }, + }); + setSettingsDirty(true); + setSettingsSaved(false); + }; + const renderMarkdown = (body: string): string => { // Convert [[wikilinks]] to clickable links before rendering const withLinks = body.replace(/\[\[([^\]|]+)(?:\|([^\]]+))?\]\]/g, (_, target, display) => { @@ -562,6 +592,81 @@ export function App() { )} +
+
Digest
+
+ The per-conversation orientation digest summarizes what this wiki is about. A background job rebuilds the word/phrase cloud on a schedule; the recents LRU updates on every page op. All fields are optional — leave blank to use defaults. +
+ +
+ +
+ Domain-specific noise to exclude from the cloud (e.g. TODO, FIXME, see, also). Comma, space, or Enter to add a tag; Backspace on empty input removes the last one. +
+ updateDigest('stopwords_extra', next)} + placeholder="Type a word and press space, comma, or Enter" + /> +
+ +
+ +
Top-K terms in the word/phrase cloud. Default 50.
+ { + const v = (e.target as HTMLInputElement).value; + updateDigest('cloud_size', v === '' ? undefined : parseInt(v, 10)); + }} + placeholder="50" + /> +
+ +
+ +
Active-use LRU capacity. Default 20. Applied on next restart.
+ { + const v = (e.target as HTMLInputElement).value; + updateDigest('recents_size', v === '' ? undefined : parseInt(v, 10)); + }} + placeholder="20" + /> +
+ +
+ +
How often the cloud rebuilds (e.g. 5m, 10m). Floor: 30s. Default 5m.
+ updateDigest('cloud_refresh', (e.target as HTMLInputElement).value || undefined)} + placeholder="5m" + /> +
+ +
+ +
Soft cap on the rendered markdown blob. Default 4096 (~1K tokens). Set to 0 to disable trimming.
+ { + const v = (e.target as HTMLInputElement).value; + updateDigest('max_render_bytes', v === '' ? undefined : parseInt(v, 10)); + }} + placeholder="4096" + /> +
+
+
Index
diff --git a/webui/src/TagInput.tsx b/webui/src/TagInput.tsx new file mode 100644 index 0000000..96048e0 --- /dev/null +++ b/webui/src/TagInput.tsx @@ -0,0 +1,130 @@ +import { useState, useRef } from 'preact/hooks'; + +interface TagInputProps { + value: string[]; + onChange: (next: string[]) => void; + placeholder?: string; + // Maximum number of tags. Once reached, new entries are discarded + // until the user removes a tag. Omitted = no limit. + maxTags?: number; +} + +// TagInput is a controlled "chips + textbox" control: type a word, +// hit comma, space, or Enter, and it becomes a tag. Backspace on an +// empty input deletes the previous tag (standard chip-input UX — +// matches Gmail's To: line, GitHub's labels, etc.). Pasting a +// comma- or whitespace-separated string creates multiple tags in +// one shot. +// +// Values are de-duplicated case-insensitively but preserved in the +// case the user typed — we don't want to fold "JWT" into "jwt" on +// the way back to the server. The consumer of the values (the cloud +// builder) is the one that case-folds for matching; storing the +// user's intent verbatim respects what they typed. +export function TagInput({ value, onChange, placeholder, maxTags }: TagInputProps) { + const [draft, setDraft] = useState(''); + const inputRef = useRef(null); + + const commit = (raw: string) => { + // Split on commas and whitespace so pasting a list works + // even if the user pasted "TODO, FIXME see" (mixed + // separators). Empty fragments are dropped by filter(Boolean). 
+ const fragments = raw + .split(/[\s,]+/) + .map(s => s.trim()) + .filter(Boolean); + if (fragments.length === 0) return; + + const lowerExisting = new Set(value.map(v => v.toLowerCase())); + const additions: string[] = []; + for (const f of fragments) { + if (lowerExisting.has(f.toLowerCase())) continue; + if (maxTags && value.length + additions.length >= maxTags) break; + lowerExisting.add(f.toLowerCase()); + additions.push(f); + } + if (additions.length > 0) onChange([...value, ...additions]); + setDraft(''); + }; + + const removeAt = (idx: number) => { + const next = value.slice(); + next.splice(idx, 1); + onChange(next); + // Keep focus on the input so the user can keep editing. + inputRef.current?.focus(); + }; + + const onKeyDown = (e: KeyboardEvent) => { + // Commit triggers: Enter, comma, space. Comma and space need + // to be intercepted so they don't actually land in the input. + if (e.key === 'Enter' || e.key === ',' || e.key === ' ') { + // Don't commit on a leading space inside an in-progress + // word — user might be pasting and the paste handler + // will fire separately. Specifically: only commit when + // there's something to commit. + if (draft.trim() !== '') { + e.preventDefault(); + commit(draft); + } else if (e.key === ',' || e.key === ' ') { + // Swallow stray separators on an empty input so the + // box doesn't fill with whitespace. + e.preventDefault(); + } + return; + } + if (e.key === 'Backspace' && draft === '' && value.length > 0) { + e.preventDefault(); + removeAt(value.length - 1); + } + }; + + const onPaste = (e: ClipboardEvent) => { + const pasted = e.clipboardData?.getData('text') ?? ''; + if (/[\s,]/.test(pasted)) { + // The paste contains separators — handle the whole + // string as tags rather than letting it land in the + // input field where the user would have to manually + // split it. + e.preventDefault(); + commit(pasted); + } + }; + + return ( +
inputRef.current?.focus()}> + {value.map((tag, idx) => ( + + {tag} + + + ))} + setDraft((e.target as HTMLInputElement).value)} + onKeyDown={onKeyDown} + onPaste={onPaste} + onBlur={() => { + // Commit any in-progress draft on blur so the user + // doesn't have to remember the keyboard ritual when + // they tab away or click Save. + if (draft.trim() !== '') commit(draft); + }} + /> +
+ ); +} diff --git a/webui/src/styles.css b/webui/src/styles.css index 97e54ce..a9abb7e 100644 --- a/webui/src/styles.css +++ b/webui/src/styles.css @@ -817,3 +817,74 @@ mark { @media (prefers-color-scheme: dark) { .settings-reindex-error { color: #ff8080; } } + +/* Tag input (Digest > Extra Stopwords). + * Looks and behaves like a single .settings-field input: same border, + * same focus accent, same width cap. The internal chips wrap and the + * input field stretches to fill the remaining row. */ +.tag-input { + width: 100%; + max-width: 480px; + min-height: 36px; + padding: 4px 6px; + border: 1px solid var(--border); + background: var(--bg); + display: flex; + flex-wrap: wrap; + align-items: center; + gap: 4px; + cursor: text; +} + +.tag-input:focus-within { + border-color: var(--accent); +} + +.tag { + display: inline-flex; + align-items: center; + gap: 4px; + padding: 2px 4px 2px 8px; + background: var(--accent); + color: white; + font-family: var(--font-mono); + font-size: 12px; + line-height: 1.4; + border-radius: 2px; + user-select: none; +} + +.tag-label { + /* Allow long tags to wrap or truncate gracefully if someone + * pastes a paragraph by mistake. Word-break here keeps the tag + * pill compact in the row. */ + overflow-wrap: anywhere; + max-width: 200px; +} + +.tag-remove { + background: transparent; + border: none; + color: white; + cursor: pointer; + padding: 0 4px; + font-size: 14px; + line-height: 1; + opacity: 0.8; +} + +.tag-remove:hover { + opacity: 1; +} + +.tag-input-field { + flex: 1; + min-width: 120px; + border: none; + outline: none; + background: transparent; + color: var(--fg); + font-family: var(--font-mono); + font-size: 13px; + padding: 4px 2px; +}