Skip to content

Commit

Permalink
ARROW-8286: [Python] Ensure to create FileSystemDataset when passing pathlib path
Browse files Browse the repository at this point in the history

Closes #6783 from jorisvandenbossche/ARROW-8286

Authored-by: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Signed-off-by: Wes McKinney <wesm+git@apache.org>
  • Loading branch information
jorisvandenbossche authored and wesm committed Mar 31, 2020
1 parent ef5f17a commit 5408f35
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 15 deletions.
7 changes: 4 additions & 3 deletions python/pyarrow/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -340,13 +340,14 @@ def dataset(paths_or_factories, filesystem=None, partitioning=None,
kwargs = dict(filesystem=filesystem, partitioning=partitioning,
format=format)

if isinstance(paths_or_factories, str):
return factory(paths_or_factories, **kwargs).finish()

single_dataset = False
if not isinstance(paths_or_factories, list):
paths_or_factories = [paths_or_factories]
single_dataset = True

factories = [_ensure_factory(f, **kwargs) for f in paths_or_factories]
if single_dataset:
return factories[0].finish()
return UnionDatasetFactory(factories).finish()


Expand Down
39 changes: 27 additions & 12 deletions python/pyarrow/tests/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -729,34 +729,49 @@ def _create_directory_of_files(base_dir):
return (table1, table2), (path1, path2)


def _check_dataset(dataset, table):
    """Assert that ``dataset`` matches ``table`` in schema and in content.

    Parameters
    ----------
    dataset : object
        Must expose ``schema`` and ``to_table(use_threads=...)``.
    table : object
        Expected table; must expose ``schema`` and be accepted by the
        ``equals`` method of the materialized result.

    Raises
    ------
    AssertionError
        If the schemas differ or the materialized dataset is not equal
        to ``table``.
    """
    assert dataset.schema.equals(table.schema)
    result = dataset.to_table(use_threads=False)  # deterministic row order
    assert result.equals(table)


def _check_dataset_from_path(path, table, **kwargs):
    """Check that ``path`` produces a dataset equal to ``table``.

    The same location is passed in several forms (pathlib object, string,
    relative string, and one-element lists), both through ``ds.factory``
    and directly to ``ds.dataset``. For each form the resulting dataset
    type is asserted and its schema/content are compared against ``table``
    via ``_check_dataset``.

    Parameters
    ----------
    path : pathlib.Path
        Location of the dataset; anything else fails the isinstance assert.
    table : object
        Expected table, forwarded to ``_check_dataset``.
    **kwargs
        Forwarded unchanged to ``ds.factory`` / ``ds.dataset``.

    Raises
    ------
    AssertionError
        If any form yields the wrong dataset type or unequal data.
    """
    import pathlib

    # pathlib object
    assert isinstance(path, pathlib.Path)
    dataset = ds.dataset(ds.factory(path, **kwargs))
    assert isinstance(dataset, ds.FileSystemDataset)
    _check_dataset(dataset, table)

    # string path
    dataset = ds.dataset(ds.factory(str(path), **kwargs))
    assert isinstance(dataset, ds.FileSystemDataset)
    _check_dataset(dataset, table)

    # relative string path
    with change_cwd(path.parent):
        dataset = ds.dataset(ds.factory(path.name, **kwargs))
        assert isinstance(dataset, ds.FileSystemDataset)
        _check_dataset(dataset, table)

    # passing directly to dataset
    dataset = ds.dataset(path, **kwargs)
    assert isinstance(dataset, ds.FileSystemDataset)
    _check_dataset(dataset, table)

    dataset = ds.dataset(str(path), **kwargs)
    assert isinstance(dataset, ds.FileSystemDataset)
    _check_dataset(dataset, table)

    # passing list of files (even of length-1) gives UnionDataset
    dataset = ds.dataset([path], **kwargs)
    assert isinstance(dataset, ds.UnionDataset)
    _check_dataset(dataset, table)

    dataset = ds.dataset([str(path)], **kwargs)
    assert isinstance(dataset, ds.UnionDataset)
    _check_dataset(dataset, table)


@pytest.mark.parquet
Expand Down

0 comments on commit 5408f35

Please sign in to comment.