diff --git a/Doc/library/pathlib.rst b/Doc/library/pathlib.rst index be207ca222274e8..b924f470e0be044 100644 --- a/Doc/library/pathlib.rst +++ b/Doc/library/pathlib.rst @@ -1036,6 +1036,9 @@ call fails (for example because the path doesn't exist). future Python release, patterns with this ending will match both files and directories. Add a trailing slash to match only directories. + .. versionchanged:: 3.13 + The *pattern* parameter accepts a :term:`path-like object`. + .. method:: Path.group(*, follow_symlinks=True) Return the name of the group owning the file. :exc:`KeyError` is raised @@ -1498,6 +1501,9 @@ call fails (for example because the path doesn't exist). .. versionchanged:: 3.13 The *follow_symlinks* parameter was added. + .. versionchanged:: 3.13 + The *pattern* parameter accepts a :term:`path-like object`. + .. method:: Path.rmdir() Remove this directory. The directory must be empty. diff --git a/Lib/pathlib/__init__.py b/Lib/pathlib/__init__.py index f14d35bb0038d01..b043aed12b3849e 100644 --- a/Lib/pathlib/__init__.py +++ b/Lib/pathlib/__init__.py @@ -467,6 +467,29 @@ def as_uri(self): from urllib.parse import quote_from_bytes return prefix + quote_from_bytes(os.fsencode(path)) + @property + def _pattern_stack(self): + """Stack of path components, to be used with patterns in glob().""" + parts = self._tail.copy() + pattern = self._raw_path + if self.anchor: + raise NotImplementedError("Non-relative patterns are unsupported") + elif not parts: + raise ValueError("Unacceptable pattern: {!r}".format(pattern)) + elif pattern[-1] in (self.pathmod.sep, self.pathmod.altsep): + # GH-65238: pathlib doesn't preserve trailing slash. Add it back. + parts.append('') + elif parts[-1] == '**': + # GH-70303: '**' only matches directories. Add trailing slash. + warnings.warn( + "Pattern ending '**' will match files and directories in a " + "future Python release. Add a trailing slash to match only " + "directories and remove this warning.", + FutureWarning, 4) + parts.append('') + parts.reverse() + return parts + # Subclassing os.PathLike makes isinstance() checks slower, # which in turn makes Path construction slower. Register instead! @@ -580,7 +603,7 @@ def iterdir(self): def _scandir(self): return os.scandir(self) - def _make_child_entry(self, entry, is_dir=False): + def _make_child_entry(self, entry): # Transform an entry yielded from _scandir() into a path object. path_str = entry.name if str(self) == '.' else entry.path path = self.with_segments(path_str) @@ -591,6 +614,8 @@ def _make_child_entry(self, entry, is_dir=False): return path def _make_child_relpath(self, name): + if not name: + return self path_str = str(self) tail = self._tail if tail: @@ -611,14 +636,8 @@ def glob(self, pattern, *, case_sensitive=None, follow_symlinks=None): kind, including directories) matching the given relative pattern. """ sys.audit("pathlib.Path.glob", self, pattern) - if pattern.endswith('**'): - # GH-70303: '**' only matches directories. Add trailing slash. - warnings.warn( - "Pattern ending '**' will match files and directories in a " - "future Python release. Add a trailing slash to match only " - "directories and remove this warning.", - FutureWarning, 2) - pattern = f'{pattern}/' + if not isinstance(pattern, PurePath): + pattern = self.with_segments(pattern) return _abc.PathBase.glob( self, pattern, case_sensitive=case_sensitive, follow_symlinks=follow_symlinks) @@ -628,15 +647,9 @@ def rglob(self, pattern, *, case_sensitive=None, follow_symlinks=None): this subtree. """ sys.audit("pathlib.Path.rglob", self, pattern) - if pattern.endswith('**'): - # GH-70303: '**' only matches directories. Add trailing slash. - warnings.warn( - "Pattern ending '**' will match files and directories in a " - "future Python release. Add a trailing slash to match only " - "directories and remove this warning.", - FutureWarning, 2) - pattern = f'{pattern}/' - pattern = f'**/{pattern}' + if not isinstance(pattern, PurePath): + pattern = self.with_segments(pattern) + pattern = '**' / pattern return _abc.PathBase.glob( self, pattern, case_sensitive=case_sensitive, follow_symlinks=follow_symlinks) diff --git a/Lib/pathlib/_abc.py b/Lib/pathlib/_abc.py index 48a6c218309385a..e5eeb4afce2ea9e 100644 --- a/Lib/pathlib/_abc.py +++ b/Lib/pathlib/_abc.py @@ -63,6 +63,12 @@ def _compile_pattern(pat, sep, case_sensitive): return re.compile(regex, flags=flags).match +def _select_special(paths, part): + """Yield special literal children of the given paths.""" + for path in paths: + yield path._make_child_relpath(part) + + def _select_children(parent_paths, dir_only, follow_symlinks, match): """Yield direct children of given paths, filtering by name and type.""" if follow_symlinks is None: @@ -84,7 +90,7 @@ def _select_children(parent_paths, dir_only, follow_symlinks, match): except OSError: continue if match(entry.name): - yield parent_path._make_child_entry(entry, dir_only) + yield parent_path._make_child_entry(entry) def _select_recursive(parent_paths, dir_only, follow_symlinks): @@ -107,7 +113,7 @@ def _select_recursive(parent_paths, dir_only, follow_symlinks): for entry in entries: try: if entry.is_dir(follow_symlinks=follow_symlinks): - paths.append(path._make_child_entry(entry, dir_only)) + paths.append(path._make_child_entry(entry)) continue except OSError: pass @@ -427,6 +433,14 @@ def is_absolute(self): a drive).""" return self.pathmod.isabs(self._raw_path) + @property + def _pattern_stack(self): + """Stack of path components, to be used with patterns in glob().""" + anchor, parts = self._stack + if anchor: + raise NotImplementedError("Non-relative patterns are unsupported") + return parts + def match(self, path_pattern, *, case_sensitive=None): """ Return True if this path matches the given pattern. @@ -436,11 +450,10 @@ def match(self, path_pattern, *, case_sensitive=None): if case_sensitive is None: case_sensitive = _is_case_sensitive(self.pathmod) sep = path_pattern.pathmod.sep - pattern_str = str(path_pattern) if path_pattern.anchor: - pass + pattern_str = str(path_pattern) elif path_pattern.parts: - pattern_str = f'**{sep}{pattern_str}' + pattern_str = str('**' / path_pattern) else: raise ValueError("empty pattern") match = _compile_pattern(pattern_str, sep, case_sensitive) @@ -714,10 +727,8 @@ def _scandir(self): from contextlib import nullcontext return nullcontext(self.iterdir()) - def _make_child_entry(self, entry, is_dir=False): + def _make_child_entry(self, entry): # Transform an entry yielded from _scandir() into a path object. - if is_dir: - return entry.joinpath('') return entry def _make_child_relpath(self, name): @@ -727,57 +738,35 @@ def glob(self, pattern, *, case_sensitive=None, follow_symlinks=None): """Iterate over this subtree and yield all existing files (of any kind, including directories) matching the given relative pattern. """ - path_pattern = self.with_segments(pattern) - if path_pattern.anchor: - raise NotImplementedError("Non-relative patterns are unsupported") - elif not path_pattern.parts: - raise ValueError("Unacceptable pattern: {!r}".format(pattern)) - - pattern_parts = list(path_pattern.parts) - if not self.pathmod.split(pattern)[1]: - # GH-65238: pathlib doesn't preserve trailing slash. Add it back. - pattern_parts.append('') - + if not isinstance(pattern, PurePathBase): + pattern = self.with_segments(pattern) if case_sensitive is None: # TODO: evaluate case-sensitivity of each directory in _select_children(). case_sensitive = _is_case_sensitive(self.pathmod) - # If symlinks are handled consistently, and the pattern does not - # contain '..' components, then we can use a 'walk-and-match' strategy - # when expanding '**' wildcards. When a '**' wildcard is encountered, - # all following pattern parts are immediately consumed and used to - # build a `re.Pattern` object. This pattern is used to filter the - # recursive walk. As a result, pattern parts following a '**' wildcard - # do not perform any filesystem access, which can be much faster! - filter_paths = follow_symlinks is not None and '..' not in pattern_parts + stack = pattern._pattern_stack + specials = ('', '.', '..') + filter_paths = False deduplicate_paths = False sep = self.pathmod.sep paths = iter([self.joinpath('')] if self.is_dir() else []) - part_idx = 0 - while part_idx < len(pattern_parts): - part = pattern_parts[part_idx] - part_idx += 1 - if part == '': - # Trailing slash. - pass - elif part == '..': - paths = (path._make_child_relpath('..') for path in paths) + while stack: + part = stack.pop() + if part in specials: + paths = _select_special(paths, part) elif part == '**': # Consume adjacent '**' components. - while part_idx < len(pattern_parts) and pattern_parts[part_idx] == '**': - part_idx += 1 - - if filter_paths and part_idx < len(pattern_parts) and pattern_parts[part_idx] != '': - dir_only = pattern_parts[-1] == '' - paths = _select_recursive(paths, dir_only, follow_symlinks) + while stack and stack[-1] == '**': + stack.pop() - # Filter out paths that don't match pattern. - prefix_len = len(str(self._make_child_relpath('_'))) - 1 - match = _compile_pattern(str(path_pattern), sep, case_sensitive) - paths = (path for path in paths if match(str(path), prefix_len)) - return paths + # Consume adjacent non-special components and enable post-walk + # regex filtering, provided we're treating symlinks consistently. + if follow_symlinks is not None: + while stack and stack[-1] not in specials: + filter_paths = True + stack.pop() - dir_only = part_idx < len(pattern_parts) + dir_only = bool(stack) paths = _select_recursive(paths, dir_only, follow_symlinks) if deduplicate_paths: # De-duplicate if we've already seen a '**' component. @@ -786,9 +775,14 @@ def glob(self, pattern, *, case_sensitive=None, follow_symlinks=None): elif '**' in part: raise ValueError("Invalid pattern: '**' can only be an entire path component") else: - dir_only = part_idx < len(pattern_parts) + dir_only = bool(stack) match = _compile_pattern(part, sep, case_sensitive) paths = _select_children(paths, dir_only, follow_symlinks, match) + if filter_paths: + # Filter out paths that don't match pattern. + prefix_len = len(str(self._make_child_relpath('_'))) - 1 + match = _compile_pattern(str(pattern), sep, case_sensitive) + paths = (path for path in paths if match(str(path), prefix_len)) return paths def rglob(self, pattern, *, case_sensitive=None, follow_symlinks=None): @@ -796,8 +790,10 @@ def rglob(self, pattern, *, case_sensitive=None, follow_symlinks=None): directories) matching the given relative pattern, anywhere in this subtree. """ - return self.glob( - f'**/{pattern}', case_sensitive=case_sensitive, follow_symlinks=follow_symlinks) + if not isinstance(pattern, PurePathBase): + pattern = self.with_segments(pattern) + pattern = '**' / pattern + return self.glob(pattern, case_sensitive=case_sensitive, follow_symlinks=follow_symlinks) def walk(self, top_down=True, on_error=None, follow_symlinks=False): """Walk the directory tree from this directory, similar to os.walk().""" diff --git a/Lib/test/test_pathlib/test_pathlib.py b/Lib/test/test_pathlib/test_pathlib.py index 61d7939ad140b2c..bdbe92369639ef8 100644 --- a/Lib/test/test_pathlib/test_pathlib.py +++ b/Lib/test/test_pathlib/test_pathlib.py @@ -1818,6 +1818,13 @@ def test_walk_above_recursion_limit(self): list(base.walk()) list(base.walk(top_down=False)) + def test_glob_empty_pattern(self): + p = self.cls('') + with self.assertRaisesRegex(ValueError, 'Unacceptable pattern'): + list(p.glob('')) + with self.assertRaisesRegex(ValueError, 'Unacceptable pattern'): + list(p.glob('.')) + def test_glob_many_open_files(self): depth = 30 P = self.cls @@ -1860,6 +1867,22 @@ def test_glob_recursive_no_trailing_slash(self): with self.assertWarns(FutureWarning): p.rglob('*/**') + def test_glob_pathlike(self): + P = self.cls + p = P(self.base) + pattern = "dir*/file*" + expect = {p / "dirB/fileB", p / "dirC/fileC"} + self.assertEqual(expect, set(p.glob(P(pattern)))) + self.assertEqual(expect, set(p.glob(FakePath(pattern)))) + + def test_rglob_pathlike(self): + P = self.cls + p = P(self.base, "dirC") + pattern = "**/file*" + expect = {p / "fileC", p / "dirD/fileD"} + self.assertEqual(expect, set(p.rglob(P(pattern)))) + self.assertEqual(expect, set(p.rglob(FakePath(pattern)))) + @only_posix class PosixPathTest(PathTest, PurePosixPathTest): diff --git a/Lib/test/test_pathlib/test_pathlib_abc.py b/Lib/test/test_pathlib/test_pathlib_abc.py index f877c98b7678f49..199718a8a69c5ad 100644 --- a/Lib/test/test_pathlib/test_pathlib_abc.py +++ b/Lib/test/test_pathlib/test_pathlib_abc.py @@ -1045,9 +1045,12 @@ def _check(glob, expected): _check(p.glob("*/"), ["dirA/", "dirB/", "dirC/", "dirE/", "linkB/"]) def test_glob_empty_pattern(self): - p = self.cls('') - with self.assertRaisesRegex(ValueError, 'Unacceptable pattern'): - list(p.glob('')) + def _check(glob, expected): + self.assertEqual(set(glob), { P(self.base, q) for q in expected }) + P = self.cls + p = P(self.base) + _check(p.glob(""), [""]) + _check(p.glob("."), ["."]) def test_glob_case_sensitive(self): P = self.cls diff --git a/Misc/NEWS.d/next/Library/2024-01-12-17-32-36.gh-issue-79634.uTSTRI.rst b/Misc/NEWS.d/next/Library/2024-01-12-17-32-36.gh-issue-79634.uTSTRI.rst new file mode 100644 index 000000000000000..ba19b5209e648ec --- /dev/null +++ b/Misc/NEWS.d/next/Library/2024-01-12-17-32-36.gh-issue-79634.uTSTRI.rst @@ -0,0 +1,2 @@ +Accept :term:`path-like objects ` as patterns in +:meth:`pathlib.Path.glob` and :meth:`~pathlib.Path.rglob`.