-
Notifications
You must be signed in to change notification settings - Fork 186
fix(sync): preserve canonical markdown in single-file sync #746
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
ecaf247
ee92812
8ef292d
1bb42df
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -12,7 +12,7 @@ | |
| from sqlalchemy.exc import IntegrityError | ||
|
|
||
| from basic_memory.config import BasicMemoryConfig | ||
| from basic_memory.file_utils import compute_checksum, has_frontmatter | ||
| from basic_memory.file_utils import compute_checksum, has_frontmatter, remove_frontmatter | ||
| from basic_memory.markdown.schemas import EntityMarkdown | ||
| from basic_memory.indexing.models import ( | ||
| IndexedEntity, | ||
|
|
@@ -43,12 +43,19 @@ class _PreparedMarkdownFile: | |
| class _PreparedEntity: | ||
| path: str | ||
| entity_id: int | ||
| permalink: str | None | ||
| checksum: str | ||
| content_type: str | None | ||
| search_content: str | None | ||
| markdown_content: str | None = None | ||
|
|
||
|
|
||
| @dataclass(slots=True) | ||
| class _PersistedMarkdownFile: | ||
| prepared: _PreparedMarkdownFile | ||
| entity: Entity | ||
|
|
||
|
|
||
| class BatchIndexer: | ||
| """Index already-loaded files without assuming where they came from.""" | ||
|
|
||
|
|
@@ -118,6 +125,9 @@ async def index_files( | |
| ) | ||
| error_by_path.update(markdown_errors) | ||
| prepared_entities.update(markdown_upserts) | ||
| if existing_permalink_by_path is not None: | ||
| for path, prepared_entity in markdown_upserts.items(): | ||
| existing_permalink_by_path[path] = prepared_entity.permalink | ||
|
|
||
| regular_upserts, regular_errors = await self._run_bounded( | ||
| regular_paths, | ||
|
|
@@ -168,6 +178,57 @@ async def index_files( | |
| search_indexed=search_indexed, | ||
| ) | ||
|
|
||
| async def index_markdown_file( | ||
| self, | ||
| file: IndexInputFile, | ||
| *, | ||
| new: bool | None = None, | ||
| existing_permalink_by_path: dict[str, str | None] | None = None, | ||
| index_search: bool = True, | ||
| ) -> IndexedEntity: | ||
| """Index one markdown file using the same normalization and upsert path as batches.""" | ||
| if not self._is_markdown(file): | ||
| raise ValueError(f"index_markdown_file requires markdown input: {file.path}") | ||
|
|
||
| prepared = await self._prepare_markdown_file(file) | ||
| if existing_permalink_by_path is None: | ||
| existing_permalink_by_path = { | ||
| path: permalink | ||
| for path, permalink in ( | ||
| await self.entity_repository.get_file_path_to_permalink_map() | ||
| ).items() | ||
| } | ||
|
|
||
| reserved_permalinks = { | ||
| permalink | ||
| for path, permalink in existing_permalink_by_path.items() | ||
| if path != file.path and permalink | ||
| } | ||
| prepared = await self._normalize_markdown_file(prepared, reserved_permalinks) | ||
| existing_permalink_by_path[file.path] = prepared.markdown.frontmatter.permalink | ||
|
|
||
| persisted = await self._persist_markdown_file(prepared, is_new=new) | ||
| existing_permalink_by_path[file.path] = persisted.entity.permalink | ||
| await self._resolve_batch_relations([persisted.entity.id], max_concurrent=1) | ||
|
|
||
| refreshed = await self.entity_repository.find_by_ids([persisted.entity.id]) | ||
| if len(refreshed) != 1: # pragma: no cover | ||
| raise ValueError(f"Failed to reload indexed entity for {file.path}") | ||
| entity = refreshed[0] | ||
| prepared_entity = self._build_prepared_entity(persisted.prepared, entity) | ||
|
|
||
| if index_search: | ||
| return await self._refresh_search_index(prepared_entity, entity) | ||
|
|
||
| return IndexedEntity( | ||
| path=prepared_entity.path, | ||
| entity_id=entity.id, | ||
| permalink=entity.permalink, | ||
| checksum=prepared_entity.checksum, | ||
| content_type=prepared_entity.content_type, | ||
| markdown_content=prepared_entity.markdown_content, | ||
| ) | ||
|
|
||
| # --- Preparation --- | ||
|
|
||
| async def _prepare_markdown_file(self, file: IndexInputFile) -> _PreparedMarkdownFile: | ||
|
|
@@ -320,34 +381,8 @@ def _reserve_batch_permalink( | |
| # --- Persistence --- | ||
|
|
||
| async def _upsert_markdown_file(self, prepared: _PreparedMarkdownFile) -> _PreparedEntity: | ||
| existing = await self.entity_repository.get_by_file_path( | ||
| prepared.file.path, | ||
| load_relations=False, | ||
| ) | ||
| entity = await self.entity_service.upsert_entity_from_markdown( | ||
| Path(prepared.file.path), | ||
| prepared.markdown, | ||
| is_new=existing is None, | ||
| ) | ||
| updated = await self.entity_repository.update( | ||
| entity.id, | ||
| self._entity_metadata_updates(prepared.file, prepared.final_checksum), | ||
| ) | ||
| if updated is None: | ||
| raise ValueError(f"Failed to update markdown entity metadata for {prepared.file.path}") | ||
|
|
||
| return _PreparedEntity( | ||
| path=prepared.file.path, | ||
| entity_id=updated.id, | ||
| checksum=prepared.final_checksum, | ||
| content_type=prepared.file.content_type, | ||
| search_content=( | ||
| prepared.markdown.content | ||
| if prepared.markdown.content is not None | ||
| else prepared.content | ||
| ), | ||
| markdown_content=prepared.content, | ||
| ) | ||
| persisted = await self._persist_markdown_file(prepared) | ||
| return self._build_prepared_entity(persisted.prepared, persisted.entity) | ||
|
|
||
| async def _upsert_regular_file(self, file: IndexInputFile) -> _PreparedEntity: | ||
| checksum = await self._resolve_checksum(file) | ||
|
|
@@ -405,6 +440,7 @@ async def _upsert_regular_file(self, file: IndexInputFile) -> _PreparedEntity: | |
| return _PreparedEntity( | ||
| path=file.path, | ||
| entity_id=updated.id, | ||
| permalink=updated.permalink, | ||
| checksum=checksum, | ||
| content_type=file.content_type, | ||
| search_content=None, | ||
|
|
@@ -495,6 +531,92 @@ async def _refresh_search_index( | |
|
|
||
| # --- Helpers --- | ||
|
|
||
| async def _persist_markdown_file( | ||
| self, | ||
| prepared: _PreparedMarkdownFile, | ||
| *, | ||
| is_new: bool | None = None, | ||
| ) -> _PersistedMarkdownFile: | ||
| existing = await self.entity_repository.get_by_file_path( | ||
| prepared.file.path, | ||
| load_relations=False, | ||
| ) | ||
| if is_new is None: | ||
| is_new = existing is None | ||
| entity = await self.entity_service.upsert_entity_from_markdown( | ||
| Path(prepared.file.path), | ||
| prepared.markdown, | ||
| is_new=is_new, | ||
| ) | ||
| prepared = await self._reconcile_persisted_permalink(prepared, entity) | ||
| updated = await self.entity_repository.update( | ||
| entity.id, | ||
| self._entity_metadata_updates(prepared.file, prepared.final_checksum), | ||
| ) | ||
|
Comment on lines
+552
to
+555
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
When [comment text lost in extraction]. Useful? React with 👍 / 👎. |
||
| if updated is None: | ||
| raise ValueError(f"Failed to update markdown entity metadata for {prepared.file.path}") | ||
| return _PersistedMarkdownFile(prepared=prepared, entity=updated) | ||
|
|
||
| async def _reconcile_persisted_permalink( | ||
| self, | ||
| prepared: _PreparedMarkdownFile, | ||
| entity: Entity, | ||
| ) -> _PreparedMarkdownFile: | ||
| # Trigger: the source file started without frontmatter and sync is configured | ||
| # to leave frontmatterless files alone. | ||
| # Why: upsert may still assign a DB permalink even when disk content should stay untouched. | ||
| # Outcome: skip reconciliation writes that would silently inject frontmatter. | ||
| if ( | ||
| self.app_config.disable_permalinks | ||
| or ( | ||
| not prepared.file_contains_frontmatter | ||
| and not self.app_config.ensure_frontmatter_on_sync | ||
| ) | ||
| or entity.permalink is None | ||
| or entity.permalink == prepared.markdown.frontmatter.permalink | ||
| ): | ||
|
Comment on lines
+569
to
+577
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
This reconciliation branch rewrites frontmatter whenever the persisted entity permalink differs, but it does not check whether the source file had frontmatter. If [remainder of comment lost in extraction]. Useful? React with 👍 / 👎. |
||
| return prepared | ||
|
|
||
| logger.debug( | ||
| "Updating permalink after upsert conflict resolution", | ||
| path=prepared.file.path, | ||
| old_permalink=prepared.markdown.frontmatter.permalink, | ||
| new_permalink=entity.permalink, | ||
| ) | ||
| prepared.markdown.frontmatter.metadata["permalink"] = entity.permalink | ||
| write_result = await self.file_writer.write_frontmatter( | ||
| IndexFrontmatterUpdate( | ||
| path=prepared.file.path, | ||
| metadata={"permalink": entity.permalink}, | ||
| ) | ||
| ) | ||
| return _PreparedMarkdownFile( | ||
| file=prepared.file, | ||
| content=write_result.content, | ||
| final_checksum=write_result.checksum, | ||
| markdown=prepared.markdown, | ||
| file_contains_frontmatter=prepared.file_contains_frontmatter, | ||
| ) | ||
|
|
||
| def _build_prepared_entity( | ||
| self, | ||
| prepared: _PreparedMarkdownFile, | ||
| entity: Entity, | ||
| ) -> _PreparedEntity: | ||
| return _PreparedEntity( | ||
| path=prepared.file.path, | ||
| entity_id=entity.id, | ||
| permalink=entity.permalink, | ||
| checksum=prepared.final_checksum, | ||
| content_type=prepared.file.content_type, | ||
| search_content=( | ||
| prepared.markdown.content | ||
| if prepared.markdown.content is not None | ||
| else remove_frontmatter(prepared.content) | ||
| ), | ||
| markdown_content=prepared.content, | ||
| ) | ||
|
|
||
| async def _resolve_checksum(self, file: IndexInputFile) -> str: | ||
| if file.checksum is not None: | ||
| return file.checksum | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
`index_markdown_file` rebuilds `existing_permalink_by_path` from `get_file_path_to_permalink_map()` whenever the caller does not pass a map, and `sync_one_markdown_file` invokes this path for each markdown file sync. In incremental/watch workloads (which call `sync_file` per changed file), this adds a full-entity-table scan to every single-file update, creating an O(N) DB read per event and causing avoidable slowdowns on larger projects. A single-file fast path (or a shared cached map across calls) would prevent this regression. Useful? React with 👍 / 👎.