From ecaf247168b54f634412e1bb1203fc413b902771 Mon Sep 17 00:00:00 2001 From: phernandez Date: Wed, 15 Apr 2026 18:59:19 -0500 Subject: [PATCH 1/4] fix(sync): preserve canonical markdown in single-file sync Signed-off-by: phernandez --- .claude/settings.json | 1 - src/basic_memory/indexing/__init__.py | 2 + src/basic_memory/indexing/batch_indexer.py | 170 +++++++++++++--- src/basic_memory/indexing/models.py | 18 +- src/basic_memory/services/file_service.py | 6 + src/basic_memory/sync/sync_service.py | 143 +++++++------- tests/indexing/test_batch_indexer.py | 63 ++++++ tests/sync/test_sync_one_markdown_file.py | 218 +++++++++++++++++++++ 8 files changed, 518 insertions(+), 103 deletions(-) create mode 100644 tests/sync/test_sync_one_markdown_file.py diff --git a/.claude/settings.json b/.claude/settings.json index 54c66f76..d55f953f 100644 --- a/.claude/settings.json +++ b/.claude/settings.json @@ -3,7 +3,6 @@ "env": { "CLAUDE_BASH_MAINTAIN_PROJECT_WORKING_DIR": "1", "CLAUDE_CODE_DISABLE_FEEDBACK_SURVEY": "1", - "DISABLE_TELEMETRY": "1", "CLAUDE_CODE_NO_FLICKER": "1", "CLAUDE_CODE_DISABLE_ADAPTIVE_THINKING": "1" }, diff --git a/src/basic_memory/indexing/__init__.py b/src/basic_memory/indexing/__init__.py index 79742d6f..28966eec 100644 --- a/src/basic_memory/indexing/__init__.py +++ b/src/basic_memory/indexing/__init__.py @@ -12,6 +12,7 @@ IndexingBatchResult, IndexInputFile, IndexProgress, + SyncedMarkdownFile, ) __all__ = [ @@ -25,5 +26,6 @@ "IndexingBatchResult", "IndexInputFile", "IndexProgress", + "SyncedMarkdownFile", "build_index_batches", ] diff --git a/src/basic_memory/indexing/batch_indexer.py b/src/basic_memory/indexing/batch_indexer.py index 63792512..1a9c796c 100644 --- a/src/basic_memory/indexing/batch_indexer.py +++ b/src/basic_memory/indexing/batch_indexer.py @@ -43,12 +43,19 @@ class _PreparedMarkdownFile: class _PreparedEntity: path: str entity_id: int + permalink: str | None checksum: str content_type: str | None search_content: str | None markdown_content: str | None = None +@dataclass(slots=True) +class _PersistedMarkdownFile: + prepared: _PreparedMarkdownFile + entity: Entity + + class BatchIndexer: """Index already-loaded files without assuming where they came from.""" @@ -118,6 +125,9 @@ async def index_files( ) error_by_path.update(markdown_errors) prepared_entities.update(markdown_upserts) + if existing_permalink_by_path is not None: + for path, prepared_entity in markdown_upserts.items(): + existing_permalink_by_path[path] = prepared_entity.permalink regular_upserts, regular_errors = await self._run_bounded( regular_paths, @@ -168,6 +178,57 @@ async def index_files( search_indexed=search_indexed, ) + async def index_markdown_file( + self, + file: IndexInputFile, + *, + new: bool | None = None, + existing_permalink_by_path: dict[str, str | None] | None = None, + index_search: bool = True, + ) -> IndexedEntity: + """Index one markdown file using the same normalization and upsert path as batches.""" + if not self._is_markdown(file): + raise ValueError(f"index_markdown_file requires markdown input: {file.path}") + + prepared = await self._prepare_markdown_file(file) + if existing_permalink_by_path is None: + existing_permalink_by_path = { + path: permalink + for path, permalink in ( + await self.entity_repository.get_file_path_to_permalink_map() + ).items() + } + + reserved_permalinks = { + permalink + for path, permalink in existing_permalink_by_path.items() + if path != file.path and permalink + } + prepared = await self._normalize_markdown_file(prepared, reserved_permalinks) + existing_permalink_by_path[file.path] = prepared.markdown.frontmatter.permalink + + persisted = await self._persist_markdown_file(prepared, is_new=new) + existing_permalink_by_path[file.path] = persisted.entity.permalink + await self._resolve_batch_relations([persisted.entity.id], max_concurrent=1) + + refreshed = await self.entity_repository.find_by_ids([persisted.entity.id]) + if len(refreshed) != 1: # pragma: no cover + raise ValueError(f"Failed to reload indexed entity for {file.path}") + entity = refreshed[0] + prepared_entity = self._build_prepared_entity(persisted.prepared, entity) + + if index_search: + return await self._refresh_search_index(prepared_entity, entity) + + return IndexedEntity( + path=prepared_entity.path, + entity_id=entity.id, + permalink=entity.permalink, + checksum=prepared_entity.checksum, + content_type=prepared_entity.content_type, + markdown_content=prepared_entity.markdown_content, + ) + # --- Preparation --- async def _prepare_markdown_file(self, file: IndexInputFile) -> _PreparedMarkdownFile: @@ -320,34 +381,8 @@ def _reserve_batch_permalink( # --- Persistence --- async def _upsert_markdown_file(self, prepared: _PreparedMarkdownFile) -> _PreparedEntity: - existing = await self.entity_repository.get_by_file_path( - prepared.file.path, - load_relations=False, - ) - entity = await self.entity_service.upsert_entity_from_markdown( - Path(prepared.file.path), - prepared.markdown, - is_new=existing is None, - ) - updated = await self.entity_repository.update( - entity.id, - self._entity_metadata_updates(prepared.file, prepared.final_checksum), - ) - if updated is None: - raise ValueError(f"Failed to update markdown entity metadata for {prepared.file.path}") - - return _PreparedEntity( - path=prepared.file.path, - entity_id=updated.id, - checksum=prepared.final_checksum, - content_type=prepared.file.content_type, - search_content=( - prepared.markdown.content - if prepared.markdown.content is not None - else prepared.content - ), - markdown_content=prepared.content, - ) + persisted = await self._persist_markdown_file(prepared) + return self._build_prepared_entity(persisted.prepared, persisted.entity) async def _upsert_regular_file(self, file: IndexInputFile) -> _PreparedEntity: checksum = await self._resolve_checksum(file) @@ -405,6 +440,7 @@ async def _upsert_regular_file(self, file: IndexInputFile) -> _PreparedEntity: return _PreparedEntity( path=file.path, entity_id=updated.id, + permalink=updated.permalink, checksum=checksum, content_type=file.content_type, search_content=None, @@ -495,6 +531,84 @@ async def _refresh_search_index( # --- Helpers --- + async def _persist_markdown_file( + self, + prepared: _PreparedMarkdownFile, + *, + is_new: bool | None = None, + ) -> _PersistedMarkdownFile: + existing = await self.entity_repository.get_by_file_path( + prepared.file.path, + load_relations=False, + ) + if is_new is None: + is_new = existing is None + entity = await self.entity_service.upsert_entity_from_markdown( + Path(prepared.file.path), + prepared.markdown, + is_new=is_new, + ) + prepared = await self._reconcile_persisted_permalink(prepared, entity) + updated = await self.entity_repository.update( + entity.id, + self._entity_metadata_updates(prepared.file, prepared.final_checksum), + ) + if updated is None: + raise ValueError(f"Failed to update markdown entity metadata for {prepared.file.path}") + return _PersistedMarkdownFile(prepared=prepared, entity=updated) + + async def _reconcile_persisted_permalink( + self, + prepared: _PreparedMarkdownFile, + entity: Entity, + ) -> _PreparedMarkdownFile: + if ( + self.app_config.disable_permalinks + or entity.permalink is None + or entity.permalink == prepared.markdown.frontmatter.permalink + ): + return prepared + + logger.debug( + "Updating permalink after upsert conflict resolution", + path=prepared.file.path, + old_permalink=prepared.markdown.frontmatter.permalink, + new_permalink=entity.permalink, + ) + prepared.markdown.frontmatter.metadata["permalink"] = entity.permalink + write_result = await self.file_writer.write_frontmatter( + IndexFrontmatterUpdate( + path=prepared.file.path, + metadata={"permalink": entity.permalink}, + ) + ) + return _PreparedMarkdownFile( + file=prepared.file, + content=write_result.content, + final_checksum=write_result.checksum, + markdown=prepared.markdown, + file_contains_frontmatter=prepared.file_contains_frontmatter, + ) + + def _build_prepared_entity( + self, + prepared: _PreparedMarkdownFile, + entity: Entity, + ) -> _PreparedEntity: + return _PreparedEntity( + path=prepared.file.path, + entity_id=entity.id, + permalink=entity.permalink, + checksum=prepared.final_checksum, + content_type=prepared.file.content_type, + search_content=( + prepared.markdown.content + if prepared.markdown.content is not None + else prepared.content + ), + markdown_content=prepared.content, + ) + async def _resolve_checksum(self, file: IndexInputFile) -> str: if file.checksum is not None: return file.checksum diff --git a/src/basic_memory/indexing/models.py b/src/basic_memory/indexing/models.py index aaa5b869..aab16203 100644 --- a/src/basic_memory/indexing/models.py +++ b/src/basic_memory/indexing/models.py @@ -4,7 +4,10 @@ from dataclasses import dataclass, field from datetime import datetime -from typing import Any, Protocol +from typing import Any, Protocol, TYPE_CHECKING + +if TYPE_CHECKING: # pragma: no cover + from basic_memory.models import Entity @dataclass(slots=True) @@ -75,6 +78,19 @@ class IndexedEntity: markdown_content: str | None = None +@dataclass(slots=True) +class SyncedMarkdownFile: + """Canonical result for syncing one markdown file end-to-end.""" + + entity: Entity + checksum: str + markdown_content: str + file_path: str + content_type: str + updated_at: datetime + size: int + + @dataclass(slots=True) class IndexingBatchResult: """Outcome for one batch execution.""" diff --git a/src/basic_memory/services/file_service.py b/src/basic_memory/services/file_service.py index d73e5e1a..9b12c240 100644 --- a/src/basic_memory/services/file_service.py +++ b/src/basic_memory/services/file_service.py @@ -273,6 +273,9 @@ async def read_file_content(self, path: FilePath) -> str: logger.warning("File not found", operation="read_file_content", path=str(full_path)) raise except Exception as e: + if isinstance(e, FileNotFoundError): + logger.warning("File not found", operation="read_file", path=str(full_path)) + raise logger.exception("File read error", path=str(full_path), error=str(e)) raise FileOperationError(f"Failed to read file: {e}") @@ -366,6 +369,9 @@ async def read_file(self, path: FilePath) -> Tuple[str, str]: ) return content, checksum + except FileNotFoundError as e: + logger.warning("File not found", operation="read_file", path=str(full_path)) + raise FileOperationError(f"Failed to read file: {e}") from e except Exception as e: logger.exception("File read error", path=str(full_path), error=str(e)) raise FileOperationError(f"Failed to read file: {e}") diff --git a/src/basic_memory/sync/sync_service.py b/src/basic_memory/sync/sync_service.py index 0cfa7302..c7f2f083 100644 --- a/src/basic_memory/sync/sync_service.py +++ b/src/basic_memory/sync/sync_service.py @@ -18,8 +18,14 @@ from basic_memory import telemetry from basic_memory import db from basic_memory.config import BasicMemoryConfig, ConfigManager -from basic_memory.file_utils import compute_checksum, has_frontmatter -from basic_memory.indexing import BatchIndexer, IndexFileMetadata, IndexInputFile, IndexProgress +from basic_memory.file_utils import compute_checksum, remove_frontmatter +from basic_memory.indexing import ( + BatchIndexer, + IndexFileMetadata, + IndexInputFile, + IndexProgress, + SyncedMarkdownFile, +) from basic_memory.indexing.batching import build_index_batches from basic_memory.indexing.models import ( IndexedEntity, @@ -1052,91 +1058,82 @@ async def sync_markdown_file(self, path: str, new: bool = True) -> Tuple[Optiona Returns: Tuple of (entity, checksum) """ - # Parse markdown first to get any existing permalink - logger.debug(f"Parsing markdown file, path: {path}, new: {new}") + synced = await self.sync_one_markdown_file(path, new=new, index_search=False) + return synced.entity, synced.checksum + + async def sync_one_markdown_file( + self, + path: str, + *, + new: bool = True, + index_search: bool = True, + ) -> SyncedMarkdownFile: + """Sync one markdown file and return the final canonical file state. - file_content = await self.file_service.read_file_content(path) - file_contains_frontmatter = has_frontmatter(file_content) + This method is the fail-fast single-file primitive for callers such as + cloud workers. It does not swallow unexpected exceptions. + """ + logger.debug(f"Parsing markdown file, path: {path}, new: {new}") - # Get file timestamps for tracking modification times + initial_markdown_content = await self.file_service.read_file_content(path) file_metadata = await self.file_service.get_file_metadata(path) - created = file_metadata.created_at - modified = file_metadata.modified_at - - # Parse markdown content with file metadata (avoids redundant file read/stat) - # This enables cloud implementations (S3FileService) to provide metadata from head_object - abs_path = self.file_service.base_path / path - entity_markdown = await self.entity_parser.parse_markdown_content( - file_path=abs_path, - content=file_content, - mtime=file_metadata.modified_at.timestamp(), - ctime=file_metadata.created_at.timestamp(), + initial_checksum = await compute_checksum(initial_markdown_content) + indexed = await self.batch_indexer.index_markdown_file( + IndexInputFile( + path=path, + size=file_metadata.size, + checksum=initial_checksum, + content_type=self.file_service.content_type(path), + last_modified=file_metadata.modified_at, + created_at=file_metadata.created_at, + content=initial_markdown_content.encode("utf-8"), + ), + new=new, + index_search=False, ) - - # Trigger: markdown file has no frontmatter and frontmatter enforcement is enabled - # Why: watch/sync consumers rely on normalized metadata and stable permalinks - # Outcome: file is updated in-place with derived title/type/permalink metadata - if not file_contains_frontmatter and self.app_config.ensure_frontmatter_on_sync: - permalink = await self.entity_service.resolve_permalink( - path, markdown=entity_markdown, skip_conflict_check=True - ) - frontmatter_updates = { - "title": entity_markdown.frontmatter.title, - "type": entity_markdown.frontmatter.type, - "permalink": permalink, - } - await self.file_service.update_frontmatter(path, frontmatter_updates) - entity_markdown.frontmatter.metadata.update(frontmatter_updates) - - # if the file contains frontmatter, resolve a permalink (unless disabled) - if file_contains_frontmatter and not self.app_config.disable_permalinks: - # Resolve permalink - skip conflict checks during bulk sync for performance - permalink = await self.entity_service.resolve_permalink( - path, markdown=entity_markdown, skip_conflict_check=True - ) - - # If permalink changed, update the file - if permalink != entity_markdown.frontmatter.permalink: - logger.debug( - f"Updating permalink for path: {path}, old_permalink: {entity_markdown.frontmatter.permalink}, new_permalink: {permalink}" - ) - - entity_markdown.frontmatter.metadata["permalink"] = permalink - await self.file_service.update_frontmatter(path, {"permalink": permalink}) - - # Create/update entity and relations in one path - logger.debug(f"{'Creating' if new else 'Updating'} entity from markdown, path={path}") - entity = await self.entity_service.upsert_entity_from_markdown( - Path(path), entity_markdown, is_new=new + final_markdown_content = ( + indexed.markdown_content + if indexed.markdown_content is not None + else initial_markdown_content ) - - # After updating relations, we need to compute the checksum again - # This is necessary for files with wikilinks to ensure consistent checksums - # after relation processing is complete - final_checksum = await self.file_service.compute_checksum(path) - - # Update checksum, timestamps, and file metadata from file system - # Store mtime/size for efficient change detection in future scans - # This ensures temporal ordering in search and recent activity uses actual file modification times - await self.entity_repository.update( - entity.id, + file_metadata = await self.file_service.get_file_metadata(path) + refreshed_entities = await self.entity_repository.find_by_ids([indexed.entity_id]) + if len(refreshed_entities) != 1: # pragma: no cover + raise ValueError(f"Failed to reload synced markdown entity for {path}") + updated_entity = await self.entity_repository.update( + refreshed_entities[0].id, { - "checksum": final_checksum, - "created_at": created, - "updated_at": modified, + "checksum": indexed.checksum, + "created_at": file_metadata.created_at, + "updated_at": file_metadata.modified_at, "mtime": file_metadata.modified_at.timestamp(), "size": file_metadata.size, }, ) + if updated_entity is None: # pragma: no cover + raise ValueError(f"Failed to update markdown entity metadata for {path}") + + if index_search: + await self.search_service.index_entity_data( + updated_entity, + content=remove_frontmatter(final_markdown_content), + ) logger.debug( - f"Markdown sync completed: path={path}, entity_id={entity.id}, " - f"observation_count={len(entity.observations)}, relation_count={len(entity.relations)}, " - f"checksum={final_checksum[:8]}" + f"Markdown sync completed: path={path}, entity_id={updated_entity.id}, " + f"observation_count={len(updated_entity.observations)}, " + f"relation_count={len(updated_entity.relations)}, checksum={indexed.checksum[:8]}" ) - # Return the final checksum to ensure everything is consistent - return entity, final_checksum + return SyncedMarkdownFile( + entity=updated_entity, + checksum=indexed.checksum, + markdown_content=final_markdown_content, + file_path=path, + content_type=self.file_service.content_type(path), + updated_at=file_metadata.modified_at, + size=file_metadata.size, + ) async def sync_regular_file(self, path: str, new: bool = True) -> Tuple[Optional[Entity], str]: """Sync a non-markdown file with basic tracking. diff --git a/tests/indexing/test_batch_indexer.py b/tests/indexing/test_batch_indexer.py index e1371d92..0579c217 100644 --- a/tests/indexing/test_batch_indexer.py +++ b/tests/indexing/test_batch_indexer.py @@ -15,6 +15,7 @@ IndexFrontmatterWriteResult, IndexInputFile, ) +from basic_memory.schemas import Entity as EntitySchema from basic_memory.services.exceptions import SyncFatalError @@ -561,3 +562,65 @@ async def fatal_worker(path: str) -> str: limit=1, worker=fatal_worker, ) + + +@pytest.mark.asyncio +async def test_batch_indexer_index_markdown_file_rewrites_permalink_after_repository_conflict( + app_config, + entity_service, + entity_repository, + relation_repository, + search_service, + file_service, + project_config, + monkeypatch, +): + existing = await entity_service.create_entity_with_content( + EntitySchema( + title="Existing Note", + directory="notes", + content="# Existing Note\n\nOriginal content.\n", + ) + ) + conflicting_permalink = existing.entity.permalink + assert conflicting_permalink is not None + + path = "notes/race.md" + await _create_file( + project_config.home / path, + dedent( + f"""\ + --- + title: Race Note + type: note + permalink: {conflicting_permalink} + --- + + # Race Note + + Body content. + """ + ), + ) + + async def stale_permalink(*args, **kwargs) -> str: + return conflicting_permalink + + batch_indexer = _make_batch_indexer( + app_config, + entity_service, + entity_repository, + relation_repository, + search_service, + file_service, + ) + + monkeypatch.setattr(entity_service, "resolve_permalink", stale_permalink) + indexed = await batch_indexer.index_markdown_file( + await _load_input(file_service, path), + index_search=False, + ) + + persisted_content = await file_service.read_file_content(path) + assert indexed.permalink == f"{conflicting_permalink}-1" + assert indexed.markdown_content == persisted_content diff --git a/tests/sync/test_sync_one_markdown_file.py b/tests/sync/test_sync_one_markdown_file.py new file mode 100644 index 00000000..9a76bfb2 --- /dev/null +++ b/tests/sync/test_sync_one_markdown_file.py @@ -0,0 +1,218 @@ +"""Focused tests for the one-file markdown sync primitive.""" + +from pathlib import Path +from textwrap import dedent +from unittest.mock import AsyncMock + +import pytest + +from basic_memory.file_utils import compute_checksum, remove_frontmatter +from basic_memory.schemas import Entity as EntitySchema + + +def _write_markdown(project_root: Path, relative_path: str, content: str) -> Path: + """Create one markdown file under the test project.""" + file_path = project_root / relative_path + file_path.parent.mkdir(parents=True, exist_ok=True) + file_path.write_text(content, encoding="utf-8") + return file_path + + +@pytest.mark.asyncio +async def test_sync_one_markdown_file_writes_missing_frontmatter_and_returns_canonical_content( + sync_service, + test_project, + app_config, + monkeypatch, +): + """Missing frontmatter is written once and returned exactly as stored on disk.""" + app_config.ensure_frontmatter_on_sync = True + file_path = _write_markdown( + Path(test_project.path), + "notes/frontmatterless.md", + "# Frontmatterless\n\nBody content.\n", + ) + + index_entity_data = AsyncMock() + monkeypatch.setattr(sync_service.search_service, "index_entity_data", index_entity_data) + + result = await sync_service.sync_one_markdown_file("notes/frontmatterless.md") + + final_content = file_path.read_text(encoding="utf-8") + assert result.markdown_content == final_content + assert result.entity.permalink == f"{test_project.name}/notes/frontmatterless" + assert f"permalink: {result.entity.permalink}" in final_content + assert result.checksum == await sync_service.file_service.compute_checksum( + "notes/frontmatterless.md" + ) + assert result.size == file_path.stat().st_size + index_entity_data.assert_awaited_once_with( + result.entity, + content=remove_frontmatter(final_content), + ) + + +@pytest.mark.asyncio +async def test_sync_one_markdown_file_rewrites_permalink_once_after_repository_conflict( + sync_service, + entity_service, + test_project, + monkeypatch, +): + """Late DB conflict resolution updates the file exactly once with the accepted permalink.""" + existing = await entity_service.create_entity_with_content( + EntitySchema( + title="Existing Note", + directory="notes", + content="# Existing Note\n\nOriginal content.\n", + ) + ) + conflicting_permalink = existing.entity.permalink + assert conflicting_permalink is not None + + file_path = _write_markdown( + Path(test_project.path), + "notes/race.md", + dedent( + f"""\ + --- + title: Race Note + type: note + permalink: {conflicting_permalink} + --- + + # Race Note + + Body content. + """ + ), + ) + + async def stale_permalink(*args, **kwargs) -> str: + return conflicting_permalink + + original_writer = sync_service.file_service.update_frontmatter_with_result + frontmatter_writer = AsyncMock(side_effect=original_writer) + monkeypatch.setattr(sync_service.entity_service, "resolve_permalink", stale_permalink) + monkeypatch.setattr( + sync_service.file_service, + "update_frontmatter_with_result", + frontmatter_writer, + ) + + result = await sync_service.sync_one_markdown_file("notes/race.md", index_search=False) + + final_content = file_path.read_text(encoding="utf-8") + assert frontmatter_writer.await_count == 1 + assert result.entity.permalink == f"{conflicting_permalink}-1" + assert result.markdown_content == final_content + assert f"permalink: {result.entity.permalink}" in final_content + assert result.checksum == await sync_service.file_service.compute_checksum("notes/race.md") + + +@pytest.mark.asyncio +async def test_sync_one_markdown_file_returns_original_content_when_no_rewrite_needed( + sync_service, + test_project, + monkeypatch, +): + """Canonical markdown is returned as-read when frontmatter already matches.""" + original_content = dedent( + f"""\ + --- + title: No Rewrite + type: note + permalink: {test_project.name}/notes/no-rewrite + --- + + # No Rewrite + + Body content. + """ + ) + file_path = _write_markdown( + Path(test_project.path), + "notes/no-rewrite.md", + original_content, + ) + + original_writer = sync_service.file_service.update_frontmatter_with_result + frontmatter_writer = AsyncMock(side_effect=original_writer) + monkeypatch.setattr( + sync_service.file_service, + "update_frontmatter_with_result", + frontmatter_writer, + ) + + result = await sync_service.sync_one_markdown_file("notes/no-rewrite.md", index_search=False) + + assert frontmatter_writer.await_count == 0 + assert result.markdown_content == original_content + assert file_path.read_text(encoding="utf-8") == original_content + assert result.checksum == await sync_service.file_service.compute_checksum( + "notes/no-rewrite.md" + ) + + +@pytest.mark.asyncio +async def test_sync_one_markdown_file_does_not_reread_for_initial_checksum_when_no_rewrite( + sync_service, + test_project, + monkeypatch, +): + """Initial checksum comes from the loaded content, not a second storage read.""" + original_content = dedent( + f"""\ + --- + title: No Rewrite + type: note + permalink: {test_project.name}/notes/no-rewrite + --- + + # No Rewrite + + Body content. + """ + ) + _write_markdown( + Path(test_project.path), + "notes/no-rewrite.md", + original_content, + ) + + checksum_spy = AsyncMock() + monkeypatch.setattr(sync_service.file_service, "compute_checksum", checksum_spy) + + result = await sync_service.sync_one_markdown_file("notes/no-rewrite.md", index_search=False) + + checksum_spy.assert_not_awaited() + assert result.checksum == await compute_checksum(original_content) + + +@pytest.mark.asyncio +async def test_sync_markdown_file_remains_tuple_compatible(sync_service, test_project): + """The legacy tuple-returning API still works for existing callers.""" + _write_markdown( + Path(test_project.path), + "notes/compat.md", + dedent( + f"""\ + --- + title: Compat Note + type: note + permalink: {test_project.name}/notes/compat + --- + + # Compat Note + + Body content. + """ + ), + ) + + entity, checksum = await sync_service.sync_markdown_file("notes/compat.md") + + assert entity is not None + assert entity.file_path == "notes/compat.md" + assert entity.permalink == f"{test_project.name}/notes/compat" + assert checksum == await sync_service.file_service.compute_checksum("notes/compat.md") From ee9281209f826d422f45af96e1748236c7da56cc Mon Sep 17 00:00:00 2001 From: phernandez Date: Wed, 15 Apr 2026 19:07:25 -0500 Subject: [PATCH 2/4] fix(sync): strip frontmatter-only search content Signed-off-by: phernandez --- src/basic_memory/indexing/batch_indexer.py | 4 +- src/basic_memory/sync/sync_service.py | 3 ++ tests/indexing/test_batch_indexer.py | 53 ++++++++++++++++++++++ 3 files changed, 58 insertions(+), 2 deletions(-) diff --git a/src/basic_memory/indexing/batch_indexer.py b/src/basic_memory/indexing/batch_indexer.py index 1a9c796c..bd948e68 100644 --- a/src/basic_memory/indexing/batch_indexer.py +++ b/src/basic_memory/indexing/batch_indexer.py @@ -12,7 +12,7 @@ from sqlalchemy.exc import IntegrityError from basic_memory.config import BasicMemoryConfig -from basic_memory.file_utils import compute_checksum, has_frontmatter +from basic_memory.file_utils import compute_checksum, has_frontmatter, remove_frontmatter from basic_memory.markdown.schemas import EntityMarkdown from basic_memory.indexing.models import ( IndexedEntity, @@ -604,7 +604,7 @@ def _build_prepared_entity( search_content=( prepared.markdown.content if prepared.markdown.content is not None - else prepared.content + else remove_frontmatter(prepared.content) ), markdown_content=prepared.content, ) diff --git a/src/basic_memory/sync/sync_service.py b/src/basic_memory/sync/sync_service.py index c7f2f083..8bd2601b 100644 --- a/src/basic_memory/sync/sync_service.py +++ b/src/basic_memory/sync/sync_service.py @@ -1100,6 +1100,9 @@ async def sync_one_markdown_file( refreshed_entities = await self.entity_repository.find_by_ids([indexed.entity_id]) if len(refreshed_entities) != 1: # pragma: no cover raise ValueError(f"Failed to reload synced markdown entity for {path}") + # Trigger: markdown sync may have rewritten frontmatter after the initial file metadata load. + # Why: the batch indexer persisted checksum/path data from the pre-rewrite IndexInputFile. + # Outcome: refresh size and mtime from the file as it actually exists on disk now. updated_entity = await self.entity_repository.update( refreshed_entities[0].id, { diff --git a/tests/indexing/test_batch_indexer.py b/tests/indexing/test_batch_indexer.py index 0579c217..6bd8cf98 100644 --- a/tests/indexing/test_batch_indexer.py +++ b/tests/indexing/test_batch_indexer.py @@ -5,10 +5,12 @@ import asyncio from pathlib import Path from textwrap import dedent +from unittest.mock import AsyncMock import pytest from sqlalchemy import text +from basic_memory.file_utils import remove_frontmatter from basic_memory.indexing import ( BatchIndexer, IndexFrontmatterUpdate, @@ -624,3 +626,54 @@ async def stale_permalink(*args, **kwargs) -> str: persisted_content = await file_service.read_file_content(path) assert indexed.permalink == f"{conflicting_permalink}-1" assert indexed.markdown_content == persisted_content + + +@pytest.mark.asyncio +async def test_batch_indexer_strips_frontmatter_from_search_content_when_body_is_empty( + app_config, + entity_service, + entity_repository, + relation_repository, + search_service, + file_service, + project_config, + monkeypatch, +): + path = "notes/frontmatter-only.md" + await _create_file( + project_config.home / path, + dedent( + """ + --- + title: Frontmatter Only + type: note + status: draft + --- + """ + ).strip(), + ) + + index_entity_data = AsyncMock() + monkeypatch.setattr(search_service, "index_entity_data", index_entity_data) + batch_indexer = _make_batch_indexer( + app_config, + entity_service, + entity_repository, + relation_repository, + search_service, + file_service, + ) + + await batch_indexer.index_markdown_file( + await _load_input(file_service, path), index_search=True + ) + + persisted_content = await file_service.read_file_content(path) + entity = await entity_repository.get_by_file_path(path) + assert entity is not None + index_entity_data.assert_awaited_once() + await_args = index_entity_data.await_args + assert await_args is not None + args, kwargs = await_args + assert args[0].id == entity.id + assert kwargs["content"] == remove_frontmatter(persisted_content) From 8ef292de2982c8964d638470f4a431b691733539 Mon Sep 17 00:00:00 2001 From: phernandez Date: Wed, 15 Apr 2026 19:30:10 -0500 Subject: [PATCH 3/4] fix(sync): hash one-file markdown from raw bytes Signed-off-by: phernandez --- src/basic_memory/sync/sync_service.py | 15 ++++++++++++--- tests/sync/test_sync_one_markdown_file.py | 6 +++--- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/src/basic_memory/sync/sync_service.py b/src/basic_memory/sync/sync_service.py index 8bd2601b..b9506dfd 100644 --- a/src/basic_memory/sync/sync_service.py +++ b/src/basic_memory/sync/sync_service.py @@ -1075,9 +1075,18 @@ async def sync_one_markdown_file( """ logger.debug(f"Parsing markdown file, path: {path}, new: {new}") - initial_markdown_content = await self.file_service.read_file_content(path) + try: + initial_markdown_bytes = await self.file_service.read_file_bytes(path) + except FileOperationError as exc: + # Trigger: FileService wraps binary read failures in FileOperationError. + # Why: sync_file() treats bare FileNotFoundError as a deletion race and cleans up the DB row. + # Outcome: preserve that contract while still hashing the exact bytes we loaded. + if isinstance(exc.__cause__, FileNotFoundError): + raise exc.__cause__ from exc + raise + initial_markdown_content = initial_markdown_bytes.decode("utf-8") file_metadata = await self.file_service.get_file_metadata(path) - initial_checksum = await compute_checksum(initial_markdown_content) + initial_checksum = await compute_checksum(initial_markdown_bytes) indexed = await self.batch_indexer.index_markdown_file( IndexInputFile( path=path, @@ -1086,7 +1095,7 @@ async def sync_one_markdown_file( content_type=self.file_service.content_type(path), last_modified=file_metadata.modified_at, created_at=file_metadata.created_at, - content=initial_markdown_content.encode("utf-8"), + content=initial_markdown_bytes, ), new=new, index_search=False, diff --git a/tests/sync/test_sync_one_markdown_file.py b/tests/sync/test_sync_one_markdown_file.py index 9a76bfb2..9e3536ec 100644 --- a/tests/sync/test_sync_one_markdown_file.py +++ b/tests/sync/test_sync_one_markdown_file.py @@ -160,7 +160,7 @@ async def test_sync_one_markdown_file_does_not_reread_for_initial_checksum_when_ test_project, monkeypatch, ): - """Initial checksum comes from the loaded content, not a second storage read.""" + """Initial checksum comes from the loaded file bytes, not a second storage read.""" original_content = dedent( f"""\ --- @@ -174,7 +174,7 @@ async def test_sync_one_markdown_file_does_not_reread_for_initial_checksum_when_ Body content. """ ) - _write_markdown( + file_path = _write_markdown( Path(test_project.path), "notes/no-rewrite.md", original_content, @@ -186,7 +186,7 @@ async def test_sync_one_markdown_file_does_not_reread_for_initial_checksum_when_ result = await sync_service.sync_one_markdown_file("notes/no-rewrite.md", index_search=False) checksum_spy.assert_not_awaited() - assert result.checksum == await compute_checksum(original_content) + assert result.checksum == await compute_checksum(file_path.read_bytes()) @pytest.mark.asyncio From 1bb42dfb1bad3b4cc1700340ffcdb129f3daa757 Mon Sep 17 00:00:00 2001 From: phernandez Date: Wed, 15 Apr 2026 19:44:45 -0500 Subject: [PATCH 4/4] fix(sync): preserve frontmatterless markdown sync Signed-off-by: phernandez --- src/basic_memory/indexing/batch_indexer.py | 8 +++ src/basic_memory/sync/sync_service.py | 13 ++++- tests/indexing/test_batch_indexer.py | 58 ++++++++++++++++++++++ tests/sync/test_sync_one_markdown_file.py | 41 ++++++++++++++- 4 files changed, 116 insertions(+), 4 deletions(-) diff --git a/src/basic_memory/indexing/batch_indexer.py b/src/basic_memory/indexing/batch_indexer.py index bd948e68..2535d6c0 100644 --- a/src/basic_memory/indexing/batch_indexer.py +++ b/src/basic_memory/indexing/batch_indexer.py @@ -562,8 +562,16 @@ async def _reconcile_persisted_permalink( prepared: _PreparedMarkdownFile, entity: Entity, ) -> _PreparedMarkdownFile: + # Trigger: the source file started without frontmatter and sync is configured + # to leave frontmatterless files alone. + # Why: upsert may still assign a DB permalink even when disk content should stay untouched. + # Outcome: skip reconciliation writes that would silently inject frontmatter. if ( self.app_config.disable_permalinks + or ( + not prepared.file_contains_frontmatter + and not self.app_config.ensure_frontmatter_on_sync + ) or entity.permalink is None or entity.permalink == prepared.markdown.frontmatter.permalink ): diff --git a/src/basic_memory/sync/sync_service.py b/src/basic_memory/sync/sync_service.py index b9506dfd..945add2a 100644 --- a/src/basic_memory/sync/sync_service.py +++ b/src/basic_memory/sync/sync_service.py @@ -18,7 +18,7 @@ from basic_memory import telemetry from basic_memory import db from basic_memory.config import BasicMemoryConfig, ConfigManager -from basic_memory.file_utils import compute_checksum, remove_frontmatter +from basic_memory.file_utils import ParseError, compute_checksum, remove_frontmatter from basic_memory.indexing import ( BatchIndexer, IndexFileMetadata, @@ -1126,9 +1126,18 @@ async def sync_one_markdown_file( raise ValueError(f"Failed to update markdown entity metadata for {path}") if index_search: + # Trigger: markdown may start with '---' as a thematic break or malformed + # frontmatter that the parser already treated as plain content. + # Why: one-file sync should not fail after the entity upsert just because + # strict frontmatter stripping rejects that exact text shape. + # Outcome: fall back to indexing the raw markdown content for these cases. + try: + search_content = remove_frontmatter(final_markdown_content) + except ParseError: + search_content = final_markdown_content await self.search_service.index_entity_data( updated_entity, - content=remove_frontmatter(final_markdown_content), + content=search_content, ) logger.debug( diff --git a/tests/indexing/test_batch_indexer.py b/tests/indexing/test_batch_indexer.py index 6bd8cf98..23a2e739 100644 --- a/tests/indexing/test_batch_indexer.py +++ b/tests/indexing/test_batch_indexer.py @@ -677,3 +677,61 @@ async def test_batch_indexer_strips_frontmatter_from_search_content_when_body_is args, kwargs = await_args assert args[0].id == entity.id assert kwargs["content"] == remove_frontmatter(persisted_content) + + +@pytest.mark.asyncio +async def test_batch_indexer_does_not_inject_frontmatter_when_sync_enforcement_is_disabled( + app_config, + entity_service, + entity_repository, + relation_repository, + search_service, + file_service, + project_config, + monkeypatch, +): + app_config.ensure_frontmatter_on_sync = False + + created = await entity_service.create_entity_with_content( + EntitySchema( + title="Frontmatterless", + directory="notes", + content="# Frontmatterless\n\nOriginal content.\n", + ) + ) + path = created.entity.file_path + assert path is not None + existing_permalink = created.entity.permalink + assert existing_permalink is not None + + original_content = "# Frontmatterless\n\nBody content.\n" + await _create_file(project_config.home / path, original_content) + + original_writer = file_service.update_frontmatter_with_result + frontmatter_writer = AsyncMock(side_effect=original_writer) + monkeypatch.setattr(file_service, "update_frontmatter_with_result", frontmatter_writer) + + batch_indexer = _make_batch_indexer( + app_config, + entity_service, + entity_repository, + relation_repository, + search_service, + file_service, + ) + + indexed = await batch_indexer.index_markdown_file( + await _load_input(file_service, path), + index_search=False, + ) + + # Trigger: Windows persists CRLF for text files even when the test literal uses LF. + # Why: this assertion cares about preserving a frontmatterless file, not about newline style. + # Outcome: compare against the exact content stored on disk after sync. + persisted_content = (project_config.home / path).read_bytes().decode("utf-8") + entity = await entity_repository.get_by_file_path(path) + assert entity is not None + assert entity.permalink == existing_permalink + assert frontmatter_writer.await_count == 0 + assert indexed.markdown_content == persisted_content + assert await file_service.read_file_content(path) == persisted_content diff --git a/tests/sync/test_sync_one_markdown_file.py b/tests/sync/test_sync_one_markdown_file.py index 9e3536ec..2fe09559 100644 --- a/tests/sync/test_sync_one_markdown_file.py +++ b/tests/sync/test_sync_one_markdown_file.py @@ -146,9 +146,14 @@ async def test_sync_one_markdown_file_returns_original_content_when_no_rewrite_n result = await sync_service.sync_one_markdown_file("notes/no-rewrite.md", index_search=False) + # Trigger: Windows persists CRLF for text files even when the test literal uses LF. + # Why: this assertion cares about "no rewrite happened", not about pinning one newline style. + # Outcome: compare against the exact markdown bytes stored on disk. + persisted_content = file_path.read_bytes().decode("utf-8") + assert frontmatter_writer.await_count == 0 - assert result.markdown_content == original_content - assert file_path.read_text(encoding="utf-8") == original_content + assert result.markdown_content == persisted_content + assert file_path.read_text(encoding="utf-8") == persisted_content assert result.checksum == await sync_service.file_service.compute_checksum( "notes/no-rewrite.md" ) @@ -216,3 +221,35 @@ async def test_sync_markdown_file_remains_tuple_compatible(sync_service, test_pr assert entity.file_path == "notes/compat.md" assert entity.permalink == f"{test_project.name}/notes/compat" assert checksum == await sync_service.file_service.compute_checksum("notes/compat.md") + + +@pytest.mark.asyncio +async def test_sync_one_markdown_file_indexes_thematic_break_content_without_frontmatter( + sync_service, + test_project, + app_config, + monkeypatch, +): + """Leading thematic-break markdown should index as raw content when frontmatter is absent.""" + app_config.ensure_frontmatter_on_sync = False + + original_content = "---\nBody content after a thematic break.\n" + file_path = _write_markdown( + Path(test_project.path), + "notes/thematic-break.md", + original_content, + ) + + index_entity_data = AsyncMock() + monkeypatch.setattr(sync_service.search_service, "index_entity_data", index_entity_data) + + result = await sync_service.sync_one_markdown_file("notes/thematic-break.md") + + persisted_content = file_path.read_bytes().decode("utf-8") + + assert result.markdown_content == persisted_content + assert file_path.read_text(encoding="utf-8") == persisted_content + index_entity_data.assert_awaited_once_with( + result.entity, + content=persisted_content, + )