From 6e23b5b0b7863952d2e88e54eb8c95e455f00464 Mon Sep 17 00:00:00 2001 From: cdeust Date: Tue, 9 Jun 2026 03:40:29 +0200 Subject: [PATCH] refactor(ingestion): prune vendored dirs in source walk (no rglob descent) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit codebase_analyze's collect_source_files walked the whole tree via root.rglob("*") and rejected IGNORE_DIRS entries only AFTER enumeration. rglob can't prune mid-iteration, so a repo carrying a vendored subtree (a 154M deps/ of ~8K files, node_modules, site-packages) stalled the walk for minutes on the event loop — the same asymmetry that caused the wiki_drift hang (fixed in 619bf9a), here on the ingestion side. - Extract the canonical pruned-walk idiom (os.walk(followlinks=False) + in-place dirnames[:] filter on IGNORE_DIRS) into one module, handlers/source_walk.py::walk_pruned — single source of truth. - Route both _collect_unbounded and _collect_bounded through walk_pruned; ignored subtrees are now never descended into. _file_matches keeps its IGNORE_DIRS/lang/size checks as defense-in-depth. - Replace seed_project_stages' private _walk_pruned with the shared one (removes the duplicate; drops the now-unused os import). Preserves behavior: same files returned, bounded-candidate memory property (ADR-0045 §R2) intact. New tests_py/handlers/test_source_walk.py proves ignored subtrees (deps/node_modules/site-packages/.venv, nested + symlinked) are pruned. 17 passed (incl. existing collector suite). Co-Authored-By: Claude Opus 4.8 (1M context) --- .../handlers/codebase_analyze_helpers.py | 18 +++-- mcp_server/handlers/seed_project_stages.py | 22 +----- mcp_server/handlers/source_walk.py | 48 ++++++++++++ tests_py/handlers/test_source_walk.py | 75 +++++++++++++++++++ 4 files changed, 139 insertions(+), 24 deletions(-) create mode 100644 mcp_server/handlers/source_walk.py create mode 100644 tests_py/handlers/test_source_walk.py diff --git a/mcp_server/handlers/codebase_analyze_helpers.py b/mcp_server/handlers/codebase_analyze_helpers.py index ba3535e7..9f34dc38 100644 --- a/mcp_server/handlers/codebase_analyze_helpers.py +++ b/mcp_server/handlers/codebase_analyze_helpers.py @@ -8,6 +8,7 @@ from mcp_server.core.codebase_parser import EXT_TO_LANG, FileAnalysis from mcp_server.handlers.seed_project_constants import IGNORE_DIRS +from mcp_server.handlers.source_walk import walk_pruned from mcp_server.infrastructure.memory_store import MemoryStore CODEBASE_AGENT_CONTEXT = "codebase" @@ -15,8 +16,8 @@ HASH_TAG_PREFIX = "hash:" # Bounded-candidate multiplier: we take at most ``max_files * CANDIDATE_MULTIPLIER`` -# paths from ``rglob`` before sorting. Source: ADR-0045 §R2 — bounded streaming for -# ingestion paths. The multiplier gives the sort a meaningful candidate set while +# paths from the pruned walk before sorting. Source: ADR-0045 §R2 — bounded streaming +# for ingestion paths. The multiplier gives the sort a meaningful candidate set while # keeping peak memory O(max_files) instead of O(tree_size). CANDIDATE_MULTIPLIER = 10 @@ -90,8 +91,15 @@ def _collect_unbounded( lang_filter: set[str] | None, max_bytes: int, ) -> list[Path]: - """Walk the entire tree, filter, then sort. Memory O(filtered_count).""" - survivors = [p for p in root.rglob("*") if _file_matches(p, lang_filter, max_bytes)] + """Walk the pruned tree, filter, then sort. Memory O(filtered_count). + + Uses ``walk_pruned`` (not ``rglob``) so vendored subtrees in IGNORE_DIRS + are never descended into — a repo carrying a 154M ``deps/`` no longer + stalls the walk for minutes before the extension filter rejects it. + """ + survivors = [ + p for p in walk_pruned(root) if _file_matches(p, lang_filter, max_bytes) + ] survivors.sort() return survivors @@ -106,7 +114,7 @@ def _collect_bounded( then sort for deterministic ordering. See ADR-0045 §R2. """ candidate_cap = max(max_files * CANDIDATE_MULTIPLIER, max_files) - candidates = sorted(itertools.islice(root.rglob("*"), candidate_cap)) + candidates = sorted(itertools.islice(walk_pruned(root), candidate_cap)) files: list[Path] = [] for path in candidates: diff --git a/mcp_server/handlers/seed_project_stages.py b/mcp_server/handlers/seed_project_stages.py index 63154f97..6ddee700 100644 --- a/mcp_server/handlers/seed_project_stages.py +++ b/mcp_server/handlers/seed_project_stages.py @@ -8,7 +8,6 @@ from __future__ import annotations -import os from pathlib import Path from mcp_server.handlers.seed_project_constants import ( @@ -21,6 +20,7 @@ HEAT_BY_TYPE, IGNORE_DIRS, ) +from mcp_server.handlers.source_walk import walk_pruned def heat_for_tags(tags: list[str]) -> float: @@ -47,26 +47,10 @@ def _safe_read(path: Path, max_bytes: int = 65536) -> str: return "" -def _walk_pruned(root: Path): - """Walk ``root`` skipping IGNORE_DIRS and not following symlinks/junctions. - - Uses ``os.walk(followlinks=False)`` with in-place pruning of ``dirnames`` - so that ignored directories (node_modules, .venv, __pycache__, etc.) are - never descended into. This is the canonical cross-platform idiom and is - required on Windows to avoid traversing NTFS junctions and reparse points. - Yields ``Path`` objects for every file under the pruned tree. - """ - for dirpath, dirnames, filenames in os.walk(root, followlinks=False): - dirnames[:] = [d for d in dirnames if d not in IGNORE_DIRS] - dp = Path(dirpath) - for name in filenames: - yield dp / name - - def _detect_languages(root: Path) -> list[str]: """Detect primary programming languages from file extensions.""" ext_counts: dict[str, int] = {} - for p in _walk_pruned(root): + for p in walk_pruned(root): lang = EXT_MAP.get(p.suffix.lower()) if lang: ext_counts[lang] = ext_counts.get(lang, 0) + 1 @@ -174,7 +158,7 @@ def stage_docs(root: Path, max_bytes: int) -> list[dict]: def stage_entry_points(root: Path, max_bytes: int) -> list[dict]: """Find and read entry point files.""" discoveries = [] - for p in _walk_pruned(root): + for p in walk_pruned(root): # Skip anything inside dist-info/egg-info build metadata if any(part.endswith((".dist-info", ".egg-info")) for part in p.parts): continue diff --git a/mcp_server/handlers/source_walk.py b/mcp_server/handlers/source_walk.py new file mode 100644 index 00000000..3849eda7 --- /dev/null +++ b/mcp_server/handlers/source_walk.py @@ -0,0 +1,48 @@ +"""Canonical pruned source-tree traversal. + +A single home for the ``os.walk`` idiom that skips ignored directories +(``node_modules``, ``.venv``, ``deps``, ``site-packages``, …) by pruning +``dirnames`` in place so they are **never descended into**. + +Why this exists: ``Path.rglob("*")`` cannot prune mid-iteration — it +enumerates every entry under an ignored directory and leaves the caller to +reject them afterwards. On a repo carrying a vendored tree (a 154M ``deps/`` +of ~8K files, a ``node_modules``), that post-filter walk stalls for minutes +on the event loop. The same asymmetry caused the wiki-drift hang +(``core/wiki_drift.py``); this module is the ingestion-side counterpart. + +``os.walk(followlinks=False)`` with ``dirnames[:] = [...]`` is the canonical +cross-platform idiom and is required on Windows to avoid traversing NTFS +junctions and reparse points. +""" + +from __future__ import annotations + +import os +from collections.abc import Iterator +from pathlib import Path + +from mcp_server.handlers.seed_project_constants import IGNORE_DIRS + +__all__ = ["walk_pruned"] + + +def walk_pruned(root: Path) -> Iterator[Path]: + """Yield every file under ``root`` skipping ``IGNORE_DIRS`` subtrees. + + Preconditions: + - ``root`` is an existing directory (a non-directory yields nothing). + + Postconditions: + - No path under any directory whose name is in ``IGNORE_DIRS`` is + yielded, and such directories are never descended into — peak work + is O(files in the kept subtree), not O(whole tree). + - Symlinked directories are not followed (``followlinks=False``). + - Yields ``Path`` objects for regular directory entries only; + existence/type of each yielded path is the caller's filter concern. + """ + for dirpath, dirnames, filenames in os.walk(root, followlinks=False): + dirnames[:] = [d for d in dirnames if d not in IGNORE_DIRS] + dp = Path(dirpath) + for name in filenames: + yield dp / name diff --git a/tests_py/handlers/test_source_walk.py b/tests_py/handlers/test_source_walk.py new file mode 100644 index 00000000..112fe9fb --- /dev/null +++ b/tests_py/handlers/test_source_walk.py @@ -0,0 +1,75 @@ +"""Pruned source-tree traversal — ``walk_pruned``. + +Regression coverage for the ingestion-side counterpart of the wiki-drift +hang: a repo carrying a vendored subtree (``deps/``, ``node_modules/``, +``site-packages/``) must never be descended into. ``rglob`` enumerated those +entries before the caller could reject them; ``walk_pruned`` prunes them in +place so the work is bounded to the kept subtree. +""" + +from __future__ import annotations + +from pathlib import Path + +from mcp_server.handlers.seed_project_constants import IGNORE_DIRS +from mcp_server.handlers.source_walk import walk_pruned + + +def test_yields_kept_files(tmp_path: Path) -> None: + (tmp_path / "pkg").mkdir() + (tmp_path / "pkg" / "mod.py").write_text("x = 1\n") + (tmp_path / "top.py").write_text("y = 2\n") + + names = {p.name for p in walk_pruned(tmp_path)} + assert names == {"mod.py", "top.py"} + + +def test_does_not_descend_into_ignored_dirs(tmp_path: Path) -> None: + """A file under any IGNORE_DIRS subtree is never yielded.""" + (tmp_path / "keep.py").write_text("x = 1\n") + for d in ["deps", "node_modules", "site-packages", ".venv"]: + sub = tmp_path / d / "nested" + sub.mkdir(parents=True) + (sub / "vendored.py").write_text("import this\n") + + yielded = list(walk_pruned(tmp_path)) + assert [p.name for p in yielded] == ["keep.py"] + assert not any("vendored.py" == p.name for p in yielded) + + +def test_ignored_dir_nested_deep_is_pruned(tmp_path: Path) -> None: + """Pruning happens at the ignored dir wherever it appears in the tree.""" + deep = tmp_path / "a" / "b" / "node_modules" / "c" + deep.mkdir(parents=True) + (deep / "x.py").write_text("x = 1\n") + (tmp_path / "a" / "real.py").write_text("y = 2\n") + + names = {p.name for p in walk_pruned(tmp_path)} + assert names == {"real.py"} + + +def test_does_not_follow_symlinked_dirs(tmp_path: Path) -> None: + """``followlinks=False`` — symlinked directories are not traversed.""" + real = tmp_path / "real" + real.mkdir() + (real / "f.py").write_text("x = 1\n") + link = tmp_path / "link" + try: + link.symlink_to(real, target_is_directory=True) + except (OSError, NotImplementedError): + return # platform without symlink support — nothing to assert + + paths = list(walk_pruned(tmp_path)) + # The real file is yielded once (via ``real/``); the symlink is not + # descended into, so we never get a second ``f.py`` via ``link/``. + assert [p.name for p in paths].count("f.py") == 1 + + +def test_empty_for_missing_directory(tmp_path: Path) -> None: + assert list(walk_pruned(tmp_path / "does-not-exist")) == [] + + +def test_constants_cover_flagged_offenders() -> None: + """The vendored trees that caused the original stalls are ignored.""" + for d in ("deps", "site-packages", "node_modules", ".venv", "vendor"): + assert d in IGNORE_DIRS