Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion .claude/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
"env": {
"CLAUDE_BASH_MAINTAIN_PROJECT_WORKING_DIR": "1",
"CLAUDE_CODE_DISABLE_FEEDBACK_SURVEY": "1",
"DISABLE_TELEMETRY": "1",
"CLAUDE_CODE_NO_FLICKER": "1",
"CLAUDE_CODE_DISABLE_ADAPTIVE_THINKING": "1"
},
Expand Down
2 changes: 2 additions & 0 deletions src/basic_memory/indexing/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
IndexingBatchResult,
IndexInputFile,
IndexProgress,
SyncedMarkdownFile,
)

__all__ = [
Expand All @@ -25,5 +26,6 @@
"IndexingBatchResult",
"IndexInputFile",
"IndexProgress",
"SyncedMarkdownFile",
"build_index_batches",
]
180 changes: 151 additions & 29 deletions src/basic_memory/indexing/batch_indexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from sqlalchemy.exc import IntegrityError

from basic_memory.config import BasicMemoryConfig
from basic_memory.file_utils import compute_checksum, has_frontmatter
from basic_memory.file_utils import compute_checksum, has_frontmatter, remove_frontmatter
from basic_memory.markdown.schemas import EntityMarkdown
from basic_memory.indexing.models import (
IndexedEntity,
Expand Down Expand Up @@ -43,12 +43,19 @@ class _PreparedMarkdownFile:
class _PreparedEntity:
path: str
entity_id: int
permalink: str | None
checksum: str
content_type: str | None
search_content: str | None
markdown_content: str | None = None


@dataclass(slots=True)
class _PersistedMarkdownFile:
    """Result of persisting one prepared markdown file: the (possibly rewritten)
    prepared input paired with the up-to-date entity row it was upserted into."""

    # Prepared input after persistence; content/checksum may have been replaced
    # if permalink reconciliation rewrote frontmatter on disk.
    prepared: _PreparedMarkdownFile
    # Entity row returned by the repository after upsert + metadata update.
    entity: Entity


class BatchIndexer:
"""Index already-loaded files without assuming where they came from."""

Expand Down Expand Up @@ -118,6 +125,9 @@ async def index_files(
)
error_by_path.update(markdown_errors)
prepared_entities.update(markdown_upserts)
if existing_permalink_by_path is not None:
for path, prepared_entity in markdown_upserts.items():
existing_permalink_by_path[path] = prepared_entity.permalink

regular_upserts, regular_errors = await self._run_bounded(
regular_paths,
Expand Down Expand Up @@ -168,6 +178,57 @@ async def index_files(
search_indexed=search_indexed,
)

async def index_markdown_file(
self,
file: IndexInputFile,
*,
new: bool | None = None,
existing_permalink_by_path: dict[str, str | None] | None = None,
index_search: bool = True,
) -> IndexedEntity:
"""Index one markdown file using the same normalization and upsert path as batches."""
if not self._is_markdown(file):
raise ValueError(f"index_markdown_file requires markdown input: {file.path}")

prepared = await self._prepare_markdown_file(file)
if existing_permalink_by_path is None:
existing_permalink_by_path = {
path: permalink
for path, permalink in (
await self.entity_repository.get_file_path_to_permalink_map()
Comment on lines +194 to +198
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Avoid full permalink scans in one-file sync

index_markdown_file rebuilds existing_permalink_by_path from get_file_path_to_permalink_map() whenever the caller does not pass a map, and sync_one_markdown_file invokes this path for each markdown file sync. In incremental/watch workloads (which call sync_file per changed file), this adds a full-entity-table scan to every single-file update, creating an O(N) DB read per event and causing avoidable slowdowns on larger projects. A single-file fast path (or a shared cached map across calls) would prevent this regression.

Useful? React with 👍 / 👎.

).items()
}

reserved_permalinks = {
permalink
for path, permalink in existing_permalink_by_path.items()
if path != file.path and permalink
}
prepared = await self._normalize_markdown_file(prepared, reserved_permalinks)
existing_permalink_by_path[file.path] = prepared.markdown.frontmatter.permalink

persisted = await self._persist_markdown_file(prepared, is_new=new)
existing_permalink_by_path[file.path] = persisted.entity.permalink
await self._resolve_batch_relations([persisted.entity.id], max_concurrent=1)

refreshed = await self.entity_repository.find_by_ids([persisted.entity.id])
if len(refreshed) != 1: # pragma: no cover
raise ValueError(f"Failed to reload indexed entity for {file.path}")
entity = refreshed[0]
prepared_entity = self._build_prepared_entity(persisted.prepared, entity)

if index_search:
return await self._refresh_search_index(prepared_entity, entity)

return IndexedEntity(
path=prepared_entity.path,
entity_id=entity.id,
permalink=entity.permalink,
checksum=prepared_entity.checksum,
content_type=prepared_entity.content_type,
markdown_content=prepared_entity.markdown_content,
)

# --- Preparation ---

async def _prepare_markdown_file(self, file: IndexInputFile) -> _PreparedMarkdownFile:
Expand Down Expand Up @@ -320,34 +381,8 @@ def _reserve_batch_permalink(
# --- Persistence ---

async def _upsert_markdown_file(self, prepared: _PreparedMarkdownFile) -> _PreparedEntity:
existing = await self.entity_repository.get_by_file_path(
prepared.file.path,
load_relations=False,
)
entity = await self.entity_service.upsert_entity_from_markdown(
Path(prepared.file.path),
prepared.markdown,
is_new=existing is None,
)
updated = await self.entity_repository.update(
entity.id,
self._entity_metadata_updates(prepared.file, prepared.final_checksum),
)
if updated is None:
raise ValueError(f"Failed to update markdown entity metadata for {prepared.file.path}")

return _PreparedEntity(
path=prepared.file.path,
entity_id=updated.id,
checksum=prepared.final_checksum,
content_type=prepared.file.content_type,
search_content=(
prepared.markdown.content
if prepared.markdown.content is not None
else prepared.content
),
markdown_content=prepared.content,
)
persisted = await self._persist_markdown_file(prepared)
return self._build_prepared_entity(persisted.prepared, persisted.entity)

async def _upsert_regular_file(self, file: IndexInputFile) -> _PreparedEntity:
checksum = await self._resolve_checksum(file)
Expand Down Expand Up @@ -405,6 +440,7 @@ async def _upsert_regular_file(self, file: IndexInputFile) -> _PreparedEntity:
return _PreparedEntity(
path=file.path,
entity_id=updated.id,
permalink=updated.permalink,
checksum=checksum,
content_type=file.content_type,
search_content=None,
Expand Down Expand Up @@ -495,6 +531,92 @@ async def _refresh_search_index(

# --- Helpers ---

async def _persist_markdown_file(
self,
prepared: _PreparedMarkdownFile,
*,
is_new: bool | None = None,
) -> _PersistedMarkdownFile:
existing = await self.entity_repository.get_by_file_path(
prepared.file.path,
load_relations=False,
)
if is_new is None:
is_new = existing is None
entity = await self.entity_service.upsert_entity_from_markdown(
Path(prepared.file.path),
prepared.markdown,
is_new=is_new,
)
prepared = await self._reconcile_persisted_permalink(prepared, entity)
updated = await self.entity_repository.update(
entity.id,
self._entity_metadata_updates(prepared.file, prepared.final_checksum),
)
Comment on lines +552 to +555
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Refresh metadata after permalink reconciliation writes

When _reconcile_persisted_permalink rewrites frontmatter, the on-disk file mtime/size can change, but _persist_markdown_file still calls _entity_metadata_updates(prepared.file, ...) using the original IndexInputFile metadata from before that write. In the index_files batch path this leaves stale updated_at/mtime/size in the entity row, which causes future scans to repeatedly treat the file as metadata-changed and re-hash it unnecessarily (and can skew recency ordering).

Useful? React with 👍 / 👎.

if updated is None:
raise ValueError(f"Failed to update markdown entity metadata for {prepared.file.path}")
return _PersistedMarkdownFile(prepared=prepared, entity=updated)

async def _reconcile_persisted_permalink(
self,
prepared: _PreparedMarkdownFile,
entity: Entity,
) -> _PreparedMarkdownFile:
# Trigger: the source file started without frontmatter and sync is configured
# to leave frontmatterless files alone.
# Why: upsert may still assign a DB permalink even when disk content should stay untouched.
# Outcome: skip reconciliation writes that would silently inject frontmatter.
if (
self.app_config.disable_permalinks
or (
not prepared.file_contains_frontmatter
and not self.app_config.ensure_frontmatter_on_sync
)
or entity.permalink is None
or entity.permalink == prepared.markdown.frontmatter.permalink
):
Comment on lines +569 to +577
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Skip permalink rewrite when frontmatter enforcement is off

This reconciliation branch rewrites frontmatter whenever the persisted entity permalink differs, but it does not check whether the source file had frontmatter. If ensure_frontmatter_on_sync is False, syncing an existing frontmatterless note still triggers write_frontmatter because the DB entity has a permalink and parsed frontmatter permalink is None, which unexpectedly mutates files and violates the no-frontmatter-enforcement behavior.

Useful? React with 👍 / 👎.

return prepared

logger.debug(
"Updating permalink after upsert conflict resolution",
path=prepared.file.path,
old_permalink=prepared.markdown.frontmatter.permalink,
new_permalink=entity.permalink,
)
prepared.markdown.frontmatter.metadata["permalink"] = entity.permalink
write_result = await self.file_writer.write_frontmatter(
IndexFrontmatterUpdate(
path=prepared.file.path,
metadata={"permalink": entity.permalink},
)
)
return _PreparedMarkdownFile(
file=prepared.file,
content=write_result.content,
final_checksum=write_result.checksum,
markdown=prepared.markdown,
file_contains_frontmatter=prepared.file_contains_frontmatter,
)

def _build_prepared_entity(
    self,
    prepared: _PreparedMarkdownFile,
    entity: Entity,
) -> _PreparedEntity:
    """Assemble the `_PreparedEntity` summary for a persisted markdown file."""
    # Prefer the parsed markdown body for search indexing; when the parser
    # yielded no body, fall back to the raw content with frontmatter stripped
    # so metadata never leaks into the search index.
    if prepared.markdown.content is not None:
        searchable = prepared.markdown.content
    else:
        searchable = remove_frontmatter(prepared.content)

    return _PreparedEntity(
        path=prepared.file.path,
        entity_id=entity.id,
        permalink=entity.permalink,
        checksum=prepared.final_checksum,
        content_type=prepared.file.content_type,
        search_content=searchable,
        markdown_content=prepared.content,
    )

async def _resolve_checksum(self, file: IndexInputFile) -> str:
if file.checksum is not None:
return file.checksum
Expand Down
18 changes: 17 additions & 1 deletion src/basic_memory/indexing/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,10 @@

from dataclasses import dataclass, field
from datetime import datetime
from typing import Any, Protocol
from typing import Any, Protocol, TYPE_CHECKING

if TYPE_CHECKING: # pragma: no cover
from basic_memory.models import Entity


@dataclass(slots=True)
Expand Down Expand Up @@ -75,6 +78,19 @@ class IndexedEntity:
markdown_content: str | None = None


@dataclass(slots=True)
class SyncedMarkdownFile:
    """Canonical result for syncing one markdown file end-to-end."""

    # Persisted entity row for the synced file.
    entity: Entity
    # Checksum of the final content (post any frontmatter rewrite).
    checksum: str
    # Full markdown text as indexed.
    markdown_content: str
    # Path of the file — presumably project-relative; confirm against callers.
    file_path: str
    # Content type of the file (e.g. a MIME-style string).
    content_type: str
    # Modification timestamp of the file.
    updated_at: datetime
    # File size — presumably bytes; confirm against the producer.
    size: int


@dataclass(slots=True)
class IndexingBatchResult:
"""Outcome for one batch execution."""
Expand Down
6 changes: 6 additions & 0 deletions src/basic_memory/services/file_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -273,6 +273,9 @@ async def read_file_content(self, path: FilePath) -> str:
logger.warning("File not found", operation="read_file_content", path=str(full_path))
raise
except Exception as e:
if isinstance(e, FileNotFoundError):
logger.warning("File not found", operation="read_file", path=str(full_path))
raise
logger.exception("File read error", path=str(full_path), error=str(e))
raise FileOperationError(f"Failed to read file: {e}")

Expand Down Expand Up @@ -366,6 +369,9 @@ async def read_file(self, path: FilePath) -> Tuple[str, str]:
)
return content, checksum

except FileNotFoundError as e:
logger.warning("File not found", operation="read_file", path=str(full_path))
raise FileOperationError(f"Failed to read file: {e}") from e
except Exception as e:
logger.exception("File read error", path=str(full_path), error=str(e))
raise FileOperationError(f"Failed to read file: {e}")
Expand Down
Loading
Loading