Skip to content

Commit

Permalink
ARROW-8136: [Python] More robust inference of local relative path in …
Browse files Browse the repository at this point in the history
…dataset

See #6643 (comment)

Closes #6655 from jorisvandenbossche/ARROW-8136-bis

Authored-by: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Signed-off-by: Antoine Pitrou <antoine@python.org>
  • Loading branch information
jorisvandenbossche authored and pitrou committed Mar 19, 2020
1 parent f50ff00 commit 11ca255
Show file tree
Hide file tree
Showing 5 changed files with 57 additions and 6 deletions.
14 changes: 14 additions & 0 deletions python/pyarrow/_fs.pyx
Expand Up @@ -42,6 +42,20 @@ cdef inline c_string _path_as_bytes(path) except *:
return tobytes(path)


def _normalize_path(FileSystem filesystem, path):
    """
    Return `path` in the normalized form used by `filesystem`.

    The work is delegated to the filesystem's own NormalizePath
    implementation. By default that is a no-op, but specific filesystems
    may clean up irregular path forms (such as Windows local paths).
    """
    cdef:
        # byte representation of the path handed to the C++ layer
        c_string raw_path = _path_as_bytes(path)
        c_string normalized

    # GetResultValue unwraps the Result returned by NormalizePath
    normalized = GetResultValue(filesystem.fs.NormalizePath(raw_path))
    return frombytes(normalized)


cdef class FileInfo:
"""FileSystem entry info"""

Expand Down
17 changes: 12 additions & 5 deletions python/pyarrow/dataset.py
Expand Up @@ -153,15 +153,22 @@ def partitioning(schema=None, field_names=None, flavor=None):

def _ensure_fs(filesystem, path):
    """
    Validate or infer the filesystem from the path.

    Parameters
    ----------
    filesystem : FileSystem or None
        Filesystem to use. When given, it is returned unchanged together
        with ``path``. When None, the filesystem is inferred from ``path``.
    path : str
        Local (possibly relative) file path, or a URI understood by
        ``FileSystem.from_uri``.

    Returns
    -------
    tuple
        ``(filesystem, path)``. When the filesystem was inferred, ``path``
        is either the normalized local path or the path component returned
        by ``FileSystem.from_uri``.
    """
    from pyarrow.fs import (
        FileSystem, LocalFileSystem, FileType, _normalize_path)

    if filesystem is not None:
        # an explicitly passed filesystem is used as-is
        return filesystem, path

    # first check if the file exists as a local (relative) file path
    filesystem = LocalFileSystem()
    try:
        infos = filesystem.get_target_infos([path])[0]
    except OSError:
        # the local filesystem could not inspect the path: treat it as a URI
        return FileSystem.from_uri(path)

    if infos.type == FileType.NonExistent:
        # nothing at that local location: treat the path as a URI
        return FileSystem.from_uri(path)

    # ensure we have a proper path (eg no backslashes on Windows)
    path = _normalize_path(filesystem, path)

    return filesystem, path

Expand Down
3 changes: 2 additions & 1 deletion python/pyarrow/fs.py
Expand Up @@ -24,7 +24,8 @@
LocalFileSystem,
LocalFileSystemOptions,
SubTreeFileSystem,
_MockFileSystem
_MockFileSystem,
_normalize_path
)

# For backward compatibility.
Expand Down
1 change: 1 addition & 0 deletions python/pyarrow/includes/libarrow_fs.pxd
Expand Up @@ -60,6 +60,7 @@ cdef extern from "arrow/filesystem/api.h" namespace "arrow::fs" nogil:

cdef cppclass CFileSystem "arrow::fs::FileSystem":
c_string type_name() const
CResult[c_string] NormalizePath(c_string path)
CResult[CFileInfo] GetTargetInfo(const c_string& path)
CResult[vector[CFileInfo]] GetTargetInfos(
const vector[c_string]& paths)
Expand Down
28 changes: 28 additions & 0 deletions python/pyarrow/tests/test_dataset.py
Expand Up @@ -18,6 +18,7 @@
import contextlib
import operator
import os
import urllib

import numpy as np
import pytest
Expand Down Expand Up @@ -753,6 +754,33 @@ def test_open_dataset_from_source_additional_kwargs(multisourcefs):
ds.dataset(child, format="parquet")


@pytest.mark.parquet
@pytest.mark.s3
def test_open_dataset_from_uri_s3(minio_server):
    """Open a dataset stored on S3 (minio) from a URI and from fs + path."""
    # open dataset from non-localfs string path
    # Import the submodule explicitly: a bare ``import urllib`` does not
    # guarantee that ``urllib.parse`` has been loaded.
    from urllib.parse import quote

    from pyarrow.fs import FileSystem
    import pyarrow.parquet as pq

    address, access_key, secret_key = minio_server
    uri = (
        "s3://{}:{}@mybucket/data.parquet?scheme=http&endpoint_override={}"
        .format(access_key, secret_key, quote(address))
    )

    fs, path = FileSystem.from_uri(uri)

    # write a small parquet file into a fresh bucket to read back below
    fs.create_dir("mybucket")
    table = pa.table({'a': [1, 2, 3]})
    with fs.open_output_stream("mybucket/data.parquet") as out:
        pq.write_table(table, out)

    # full string URI
    dataset = ds.dataset(uri, format="parquet")
    assert dataset.to_table().equals(table)

    # passing filesystem object
    dataset = ds.dataset(path, format="parquet", filesystem=fs)
    assert dataset.to_table().equals(table)


@pytest.mark.parquet
def test_filter_implicit_cast(tempdir):
# ARROW-7652
Expand Down

0 comments on commit 11ca255

Please sign in to comment.