Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 13 additions & 5 deletions mcp_server/handlers/codebase_analyze_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,16 @@

from mcp_server.core.codebase_parser import EXT_TO_LANG, FileAnalysis
from mcp_server.handlers.seed_project_constants import IGNORE_DIRS
from mcp_server.handlers.source_walk import walk_pruned
from mcp_server.infrastructure.memory_store import MemoryStore

CODEBASE_AGENT_CONTEXT = "codebase"
FILE_TAG_PREFIX = "file:"
HASH_TAG_PREFIX = "hash:"

# Bounded-candidate multiplier: we take at most ``max_files * CANDIDATE_MULTIPLIER``
# paths from ``rglob`` before sorting. Source: ADR-0045 §R2 — bounded streaming for
# ingestion paths. The multiplier gives the sort a meaningful candidate set while
# paths from the pruned walk before sorting. Source: ADR-0045 §R2 — bounded streaming
# for ingestion paths. The multiplier gives the sort a meaningful candidate set while
# keeping peak memory O(max_files) instead of O(tree_size).
CANDIDATE_MULTIPLIER = 10

Expand Down Expand Up @@ -90,8 +91,15 @@ def _collect_unbounded(
lang_filter: set[str] | None,
max_bytes: int,
) -> list[Path]:
"""Walk the entire tree, filter, then sort. Memory O(filtered_count)."""
survivors = [p for p in root.rglob("*") if _file_matches(p, lang_filter, max_bytes)]
"""Walk the pruned tree, filter, then sort. Memory O(filtered_count).

Uses ``walk_pruned`` (not ``rglob``) so vendored subtrees in IGNORE_DIRS
are never descended into — a repo carrying a 154M ``deps/`` no longer
stalls the walk for minutes before the extension filter rejects it.
"""
survivors = [
p for p in walk_pruned(root) if _file_matches(p, lang_filter, max_bytes)
]
survivors.sort()
return survivors

Expand All @@ -106,7 +114,7 @@ def _collect_bounded(
then sort for deterministic ordering. See ADR-0045 §R2.
"""
candidate_cap = max(max_files * CANDIDATE_MULTIPLIER, max_files)
candidates = sorted(itertools.islice(root.rglob("*"), candidate_cap))
candidates = sorted(itertools.islice(walk_pruned(root), candidate_cap))

files: list[Path] = []
for path in candidates:
Expand Down
22 changes: 3 additions & 19 deletions mcp_server/handlers/seed_project_stages.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@

from __future__ import annotations

import os
from pathlib import Path

from mcp_server.handlers.seed_project_constants import (
Expand All @@ -21,6 +20,7 @@
HEAT_BY_TYPE,
IGNORE_DIRS,
)
from mcp_server.handlers.source_walk import walk_pruned


def heat_for_tags(tags: list[str]) -> float:
Expand All @@ -47,26 +47,10 @@ def _safe_read(path: Path, max_bytes: int = 65536) -> str:
return ""


def _walk_pruned(root: Path):
"""Walk ``root`` skipping IGNORE_DIRS and not following symlinks/junctions.

Uses ``os.walk(followlinks=False)`` with in-place pruning of ``dirnames``
so that ignored directories (node_modules, .venv, __pycache__, etc.) are
never descended into. This is the canonical cross-platform idiom and is
required on Windows to avoid traversing NTFS junctions and reparse points.
Yields ``Path`` objects for every file under the pruned tree.
"""
for dirpath, dirnames, filenames in os.walk(root, followlinks=False):
dirnames[:] = [d for d in dirnames if d not in IGNORE_DIRS]
dp = Path(dirpath)
for name in filenames:
yield dp / name


def _detect_languages(root: Path) -> list[str]:
"""Detect primary programming languages from file extensions."""
ext_counts: dict[str, int] = {}
for p in _walk_pruned(root):
for p in walk_pruned(root):
lang = EXT_MAP.get(p.suffix.lower())
if lang:
ext_counts[lang] = ext_counts.get(lang, 0) + 1
Expand Down Expand Up @@ -174,7 +158,7 @@ def stage_docs(root: Path, max_bytes: int) -> list[dict]:
def stage_entry_points(root: Path, max_bytes: int) -> list[dict]:
"""Find and read entry point files."""
discoveries = []
for p in _walk_pruned(root):
for p in walk_pruned(root):
# Skip anything inside dist-info/egg-info build metadata
if any(part.endswith((".dist-info", ".egg-info")) for part in p.parts):
continue
Expand Down
48 changes: 48 additions & 0 deletions mcp_server/handlers/source_walk.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
"""Canonical pruned source-tree traversal.

A single home for the ``os.walk`` idiom that skips ignored directories
(``node_modules``, ``.venv``, ``deps``, ``site-packages``, …) by pruning
``dirnames`` in place so they are **never descended into**.

Why this exists: ``Path.rglob("*")`` cannot prune mid-iteration — it
enumerates every entry under an ignored directory and leaves the caller to
reject them afterwards. On a repo carrying a vendored tree (a 154M ``deps/``
of ~8K files, a ``node_modules``), that post-filter walk stalls for minutes
on the event loop. The same asymmetry caused the wiki-drift hang
(``core/wiki_drift.py``); this module is the ingestion-side counterpart.

``os.walk(followlinks=False)`` with ``dirnames[:] = [...]`` is the canonical
cross-platform idiom and is required on Windows to avoid traversing NTFS
junctions and reparse points.
"""

from __future__ import annotations

import os
from collections.abc import Iterator
from pathlib import Path

from mcp_server.handlers.seed_project_constants import IGNORE_DIRS

__all__ = ["walk_pruned"]


def walk_pruned(root: Path) -> Iterator[Path]:
"""Yield every file under ``root`` skipping ``IGNORE_DIRS`` subtrees.

Preconditions:
- ``root`` is an existing directory (a non-directory yields nothing).

Postconditions:
- No path under any directory whose name is in ``IGNORE_DIRS`` is
yielded, and such directories are never descended into — peak work
is O(files in the kept subtree), not O(whole tree).
- Symlinked directories are not followed (``followlinks=False``).
- Yields ``Path`` objects for regular directory entries only;
existence/type of each yielded path is the caller's filter concern.
"""
for dirpath, dirnames, filenames in os.walk(root, followlinks=False):
dirnames[:] = [d for d in dirnames if d not in IGNORE_DIRS]
dp = Path(dirpath)
for name in filenames:
yield dp / name
75 changes: 75 additions & 0 deletions tests_py/handlers/test_source_walk.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
"""Pruned source-tree traversal — ``walk_pruned``.

Regression coverage for the ingestion-side counterpart of the wiki-drift
hang: a repo carrying a vendored subtree (``deps/``, ``node_modules/``,
``site-packages/``) must never be descended into. ``rglob`` enumerated those
entries before the caller could reject them; ``walk_pruned`` prunes them in
place so the work is bounded to the kept subtree.
"""

from __future__ import annotations

from pathlib import Path

from mcp_server.handlers.seed_project_constants import IGNORE_DIRS
from mcp_server.handlers.source_walk import walk_pruned


def test_yields_kept_files(tmp_path: Path) -> None:
(tmp_path / "pkg").mkdir()
(tmp_path / "pkg" / "mod.py").write_text("x = 1\n")
(tmp_path / "top.py").write_text("y = 2\n")

names = {p.name for p in walk_pruned(tmp_path)}
assert names == {"mod.py", "top.py"}


def test_does_not_descend_into_ignored_dirs(tmp_path: Path) -> None:
"""A file under any IGNORE_DIRS subtree is never yielded."""
(tmp_path / "keep.py").write_text("x = 1\n")
for d in ["deps", "node_modules", "site-packages", ".venv"]:
sub = tmp_path / d / "nested"
sub.mkdir(parents=True)
(sub / "vendored.py").write_text("import this\n")

yielded = list(walk_pruned(tmp_path))
assert [p.name for p in yielded] == ["keep.py"]
assert not any("vendored.py" == p.name for p in yielded)


def test_ignored_dir_nested_deep_is_pruned(tmp_path: Path) -> None:
"""Pruning happens at the ignored dir wherever it appears in the tree."""
deep = tmp_path / "a" / "b" / "node_modules" / "c"
deep.mkdir(parents=True)
(deep / "x.py").write_text("x = 1\n")
(tmp_path / "a" / "real.py").write_text("y = 2\n")

names = {p.name for p in walk_pruned(tmp_path)}
assert names == {"real.py"}


def test_does_not_follow_symlinked_dirs(tmp_path: Path) -> None:
"""``followlinks=False`` — symlinked directories are not traversed."""
real = tmp_path / "real"
real.mkdir()
(real / "f.py").write_text("x = 1\n")
link = tmp_path / "link"
try:
link.symlink_to(real, target_is_directory=True)
except (OSError, NotImplementedError):
return # platform without symlink support — nothing to assert

paths = list(walk_pruned(tmp_path))
# The real file is yielded once (via ``real/``); the symlink is not
# descended into, so we never get a second ``f.py`` via ``link/``.
assert [p.name for p in paths].count("f.py") == 1


def test_empty_for_missing_directory(tmp_path: Path) -> None:
assert list(walk_pruned(tmp_path / "does-not-exist")) == []


def test_constants_cover_flagged_offenders() -> None:
"""The vendored trees that caused the original stalls are ignored."""
for d in ("deps", "site-packages", "node_modules", ".venv", "vendor"):
assert d in IGNORE_DIRS
Loading