Skip to content

Commit

Permalink
ARROW-8245: [Python][Parquet] Skip hidden directories when reading partitioned parquet files
Browse files Browse the repository at this point in the history

Closes #6821 from caleboverman/master

Authored-by: caleboverman <coverman@fanthreesixty.com>
Signed-off-by: Wes McKinney <wesm+git@apache.org>
  • Loading branch information
caleboverman authored and wesm committed Apr 3, 2020
1 parent 921aa0c commit cc3a26a
Show file tree
Hide file tree
Showing 2 changed files with 4 additions and 3 deletions.
2 changes: 1 addition & 1 deletion python/pyarrow/parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -935,7 +935,7 @@ def _parse_hive_partition(value):

def _is_private_directory(x):
    """Return True if the last component of path ``x`` looks private.

    A path counts as private when its final component (as produced by
    ``os.path.split``) starts with an underscore or a dot and does not
    contain an ``=`` character.

    Parameters
    ----------
    x : str
        Path whose final component is inspected.

    Returns
    -------
    bool
        True for names such as ``_impala_staging`` or ``.staging``;
        False for names such as ``_key=value`` or ``data``.
    """
    _, tail = os.path.split(x)
    # NOTE(review): names containing '=' are presumably hive-style
    # ``key=value`` partition directories that must still be read even
    # when the key starts with '_' or '.' -- confirm against the caller.
    return tail.startswith(('_', '.')) and '=' not in tail


def _path_split(path, sep):
Expand Down
5 changes: 3 additions & 2 deletions python/pyarrow/tests/test_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -2337,15 +2337,16 @@ def _make_example_multifile_dataset(base_path, nfiles=10, file_nrows=5):


@pytest.mark.pandas
def test_ignore_private_directories(tempdir):
@pytest.mark.parametrize('dir_prefix', ['_', '.'])
def test_ignore_private_directories(tempdir, dir_prefix):
dirpath = tempdir / guid()
dirpath.mkdir()

paths = _make_example_multifile_dataset(dirpath, nfiles=10,
file_nrows=5)

# private directory
(dirpath / '_impala_staging').mkdir()
(dirpath / '{}staging'.format(dir_prefix)).mkdir()

dataset = pq.ParquetDataset(dirpath)
assert set(map(str, paths)) == {x.path for x in dataset.pieces}
Expand Down

0 comments on commit cc3a26a

Please sign in to comment.