diff --git a/python/pyarrow/dataset.py b/python/pyarrow/dataset.py index c1d2f484a1e18..78aca361914d0 100644 --- a/python/pyarrow/dataset.py +++ b/python/pyarrow/dataset.py @@ -153,11 +153,17 @@ def partitioning(schema=None, field_names=None, flavor=None): def _ensure_fs(filesystem, path): # Validate or infer the filesystem from the path - from pyarrow.fs import FileSystem + from pyarrow.fs import FileSystem, LocalFileSystem - if filesystem is not None: - return filesystem, path - return FileSystem.from_uri(path) + if filesystem is None: + try: + return FileSystem.from_uri(path) + except Exception: + # when path is not found (e.g. relative path), we fall back to a + # local file system + filesystem = LocalFileSystem() + + return filesystem, path def _ensure_fs_and_paths(path, filesystem=None): diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index 0b82df54733a2..5477080d220a5 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -15,7 +15,9 @@ # specific language governing permissions and limitations # under the License. +import contextlib import operator +import os import numpy as np import pytest @@ -38,6 +40,16 @@ pytestmark = pytest.mark.dataset +@contextlib.contextmanager +def change_cwd(path): + curdir = os.getcwd() + os.chdir(str(path)) + try: + yield + finally: + os.chdir(curdir) + + def _generate_data(n): import datetime import itertools @@ -601,6 +613,13 @@ def _check_dataset_from_path(path, table, **kwargs): result = dataset.to_table(use_threads=False) # deterministic row order assert result.equals(table) + # relative string path + with change_cwd(path.parent): + dataset = ds.dataset(ds.factory(path.name, **kwargs)) + assert dataset.schema.equals(table.schema) + result = dataset.to_table(use_threads=False) # deterministic row order + assert result.equals(table) + # passing directly to dataset dataset = ds.dataset(str(path), **kwargs) assert dataset.schema.equals(table.schema) @@ -642,28 +661,39 @@ def test_open_dataset_list_of_files(tempdir): def test_open_dataset_partitioned_directory(tempdir): import pyarrow.parquet as pq table = pa.table({'a': range(9), 'b': [0.] * 4 + [1.] * 5}) + + path = tempdir / "dataset" + path.mkdir() + for part in range(3): - path = tempdir / "part={}".format(part) - path.mkdir() - pq.write_table(table, path / "test.parquet") + part = path / "part={}".format(part) + part.mkdir() + pq.write_table(table, part / "test.parquet") # no partitioning specified, just read all individual files full_table = pa.concat_tables([table] * 3) - _check_dataset_from_path(tempdir, full_table) + _check_dataset_from_path(path, full_table) # specify partition scheme with discovery dataset = ds.dataset( - str(tempdir), partitioning=ds.partitioning(flavor="hive")) + str(path), partitioning=ds.partitioning(flavor="hive")) expected_schema = table.schema.append(pa.field("part", pa.int32())) assert dataset.schema.equals(expected_schema) + # specify partition scheme with discovery and relative path + with change_cwd(tempdir): + dataset = ds.dataset( + "dataset/", partitioning=ds.partitioning(flavor="hive")) + expected_schema = table.schema.append(pa.field("part", pa.int32())) + assert dataset.schema.equals(expected_schema) + # specify partition scheme with string short-cut - dataset = ds.dataset(str(tempdir), partitioning="hive") + dataset = ds.dataset(str(path), partitioning="hive") assert dataset.schema.equals(expected_schema) # specify partition scheme with explicit scheme dataset = ds.dataset( - str(tempdir), + str(path), partitioning=ds.partitioning( pa.schema([("part", pa.int8())]), flavor="hive")) expected_schema = table.schema.append(pa.field("part", pa.int8())) @@ -688,6 +718,11 @@ def test_open_dataset_filesystem(tempdir): dataset2 = ds.dataset(str(path), filesystem=fs.LocalFileSystem()) assert dataset2.schema.equals(table.schema) + # local filesystem specified with relative path + with change_cwd(tempdir): + dataset3 = ds.dataset("test.parquet", filesystem=fs.LocalFileSystem()) + assert dataset3.schema.equals(table.schema) + # passing different filesystem with pytest.raises(FileNotFoundError): ds.dataset(str(path), filesystem=fs._MockFileSystem())