Commit

ARROW-8345: [Python] Ensure feather read/write can work without pandas installed

Closes #6849 from jorisvandenbossche/ARROW-8345

Lead-authored-by: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Co-authored-by: Krisztián Szűcs <szucs.krisztian@gmail.com>
Signed-off-by: Wes McKinney <wesm+git@apache.org>
2 people authored and wesm committed Apr 7, 2020
1 parent e657aa6 commit 535a865
Showing 2 changed files with 54 additions and 12 deletions.
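
As a quick illustration (not part of the diff), here is the kind of pandas-free round trip this change enables; the file name is an arbitrary example:

import pyarrow as pa
from pyarrow import feather

# Build a Table from plain Arrow arrays; pandas is never imported
table = pa.table([pa.array([1, 2, 3])], names=['f0'])

# write_feather() now only runs its pandas-specific handling when pandas is available
feather.write_feather(table, 'data.feather')

# read_table() returns a pyarrow.Table and no longer requires pandas;
# read_feather() still does, because it returns a pandas.DataFrame
result = feather.read_table('data.feather')
assert result.equals(table)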
15 changes: 9 additions & 6 deletions python/pyarrow/feather.py
@@ -137,10 +137,11 @@ def write_feather(df, dest, compression=None, compression_level=None,
        Feather file version. Version 2 is the current. Version 1 is the more
        limited legacy format
    """
-    _check_pandas_version()
-    if (_pandas_api.has_sparse
-            and isinstance(df, _pandas_api.pd.SparseDataFrame)):
-        df = df.to_dense()
+    if _pandas_api.have_pandas:
+        _check_pandas_version()
+        if (_pandas_api.has_sparse
+                and isinstance(df, _pandas_api.pd.SparseDataFrame)):
+            df = df.to_dense()

    if _pandas_api.is_data_frame(df):
        table = Table.from_pandas(df, preserve_index=False)
@@ -201,7 +202,10 @@ def read_feather(source, columns=None, use_threads=True):
    -------
    df : pandas.DataFrame
    """
-    return read_table(source, columns=columns).to_pandas(use_threads=True)
+    _check_pandas_version()
+    return read_table(source, columns=columns).to_pandas(
+        use_threads=use_threads
+    )


def read_table(source, columns=None, memory_map=True):
@@ -221,7 +225,6 @@ def read_table(source, columns=None, memory_map=True):
    -------
    table : pyarrow.Table
    """
-    _check_pandas_version()
    reader = ext.FeatherReader()
    reader.open(source, use_memory_map=memory_map)

51 changes: 45 additions & 6 deletions python/pyarrow/tests/test_feather.py
@@ -36,12 +36,6 @@
    pass


-# TODO(wesm): The Feather tests currently are tangled with pandas
-# dependency. We should isolate the pandas-depending parts and mark those with
-# pytest.mark.pandas
-pytestmark = pytest.mark.pandas
-
-
def random_path(prefix='feather_'):
    return tempfile.mktemp(prefix=prefix)

@@ -67,6 +61,7 @@ def teardown_module(module):
            pass


+@pytest.mark.pandas
def test_file_not_exist():
    with pytest.raises(pa.ArrowIOError):
        read_feather('test_invalid_file')
@@ -107,6 +102,7 @@ def f():
    pytest.raises(exc, f)


+@pytest.mark.pandas
def test_dataset(version):
    num_values = (100, 100)
    num_files = 5
@@ -126,6 +122,7 @@ def test_dataset(version):
    assert_frame_equal(data, df)


+@pytest.mark.pandas
def test_float_no_nulls(version):
    data = {}
    numpy_dtypes = ['f4', 'f8']
@@ -139,6 +136,7 @@ def test_float_no_nulls(version):
    _check_pandas_roundtrip(df, version=version)


+@pytest.mark.pandas
def test_read_table(version):
    num_values = (100, 100)
    path = random_path()
@@ -163,6 +161,7 @@ def test_read_table(version):
    assert_frame_equal(table.to_pandas(), result.to_pandas())


+@pytest.mark.pandas
def test_float_nulls(version):
    num_values = 100

@@ -192,6 +191,7 @@ def test_float_nulls(version):
    assert_frame_equal(result, ex_frame)


+@pytest.mark.pandas
def test_integer_no_nulls(version):
    data = {}

@@ -207,6 +207,7 @@ def test_integer_no_nulls(version):
    _check_pandas_roundtrip(df, version=version)


+@pytest.mark.pandas
def test_platform_numpy_integers(version):
    data = {}

@@ -221,6 +222,7 @@ def test_platform_numpy_integers(version):
    _check_pandas_roundtrip(df, version=version)


+@pytest.mark.pandas
def test_integer_with_nulls(version):
    # pandas requires upcast to float dtype
    path = random_path()
@@ -251,6 +253,7 @@ def test_integer_with_nulls(version):
    assert_frame_equal(result, ex_frame)


+@pytest.mark.pandas
def test_boolean_no_nulls(version):
    num_values = 100

@@ -260,6 +263,7 @@ def test_boolean_no_nulls(version):
    _check_pandas_roundtrip(df, version=version)


+@pytest.mark.pandas
def test_boolean_nulls(version):
    # pandas requires upcast to object dtype
    path = random_path()
@@ -283,6 +287,7 @@ def test_boolean_nulls(version):
    assert_frame_equal(result, ex_frame)


+@pytest.mark.pandas
def test_buffer_bounds_error(version):
    # ARROW-1676
    path = random_path()
@@ -300,13 +305,15 @@ def test_buffer_bounds_error(version):
    _check_pandas_roundtrip(expected, version=version)


+@pytest.mark.pandas
def test_boolean_object_nulls(version):
    repeats = 100
    arr = np.array([False, None, True] * repeats, dtype=object)
    df = pd.DataFrame({'bools': arr})
    _check_pandas_roundtrip(df, version=version)


+@pytest.mark.pandas
def test_delete_partial_file_on_error(version):
    if sys.platform == 'win32':
        pytest.skip('Windows hangs on to file handle for some reason')
@@ -330,6 +337,7 @@ class CustomClass:
    assert not os.path.exists(path)


+@pytest.mark.pandas
def test_strings(version):
    repeats = 1000

@@ -353,37 +361,43 @@ def test_strings(version):
    _check_pandas_roundtrip(df, expected, version=version)


+@pytest.mark.pandas
def test_empty_strings(version):
    df = pd.DataFrame({'strings': [''] * 10})
    _check_pandas_roundtrip(df, version=version)


+@pytest.mark.pandas
def test_all_none(version):
    df = pd.DataFrame({'all_none': [None] * 10})
    _check_pandas_roundtrip(df, version=version)


+@pytest.mark.pandas
def test_all_null_category(version):
    # ARROW-1188
    df = pd.DataFrame({"A": (1, 2, 3), "B": (None, None, None)})
    df = df.assign(B=df.B.astype("category"))
    _check_pandas_roundtrip(df, version=version)


+@pytest.mark.pandas
def test_multithreaded_read(version):
    data = {'c{}'.format(i): [''] * 10
            for i in range(100)}
    df = pd.DataFrame(data)
    _check_pandas_roundtrip(df, use_threads=True, version=version)


+@pytest.mark.pandas
def test_nan_as_null(version):
    # Create a nan that is not numpy.nan
    values = np.array(['foo', np.nan, np.nan * 2, 'bar'] * 10)
    df = pd.DataFrame({'strings': values})
    _check_pandas_roundtrip(df, version=version)


+@pytest.mark.pandas
def test_category(version):
    repeats = 1000
    values = ['foo', None, 'bar', 'qux', np.nan]
@@ -395,6 +409,7 @@ def test_category(version):
    _check_pandas_roundtrip(df, expected, version=version)


+@pytest.mark.pandas
def test_timestamp(version):
    df = pd.DataFrame({'naive': pd.date_range('2016-03-28', periods=10)})
    df['with_tz'] = (df.naive.dt.tz_localize('utc')
@@ -403,6 +418,7 @@ def test_timestamp(version):
    _check_pandas_roundtrip(df, version=version)


+@pytest.mark.pandas
def test_timestamp_with_nulls(version):
    df = pd.DataFrame({'test': [pd.Timestamp(2016, 1, 1),
                                None,
@@ -412,6 +428,7 @@ def test_timestamp_with_nulls(version):
    _check_pandas_roundtrip(df, version=version)


+@pytest.mark.pandas
@pytest.mark.xfail(reason="not supported", raises=TypeError)
def test_timedelta_with_nulls_v1():
    df = pd.DataFrame({'test': [pd.Timedelta('1 day'),
@@ -420,13 +437,15 @@ def test_timedelta_with_nulls_v1():
    _check_pandas_roundtrip(df, version=1)


+@pytest.mark.pandas
def test_timedelta_with_nulls():
    df = pd.DataFrame({'test': [pd.Timedelta('1 day'),
                                None,
                                pd.Timedelta('3 day')]})
    _check_pandas_roundtrip(df, version=2)


+@pytest.mark.pandas
def test_out_of_float64_timestamp_with_nulls(version):
    df = pd.DataFrame(
        {'test': pd.DatetimeIndex([1451606400000000001,
@@ -435,6 +454,7 @@ def test_out_of_float64_timestamp_with_nulls(version):
    _check_pandas_roundtrip(df, version=version)


+@pytest.mark.pandas
def test_non_string_columns(version):
    df = pd.DataFrame({0: [1, 2, 3, 4],
                       1: [True, False, True, False]})
@@ -443,6 +463,7 @@ def test_non_string_columns(version):
    _check_pandas_roundtrip(df, expected, version=version)


+@pytest.mark.pandas
@pytest.mark.skipif(not os.path.supports_unicode_filenames,
                    reason='unicode filenames not supported')
def test_unicode_filename(version):
@@ -453,6 +474,7 @@ def test_unicode_filename(version):
                            version=version)


+@pytest.mark.pandas
def test_read_columns(version):
    df = pd.DataFrame({
        'foo': [1, 2, 3, 4],
@@ -465,6 +487,7 @@
                            columns=['boo', 'woo'])


+@pytest.mark.pandas
def test_overwritten_file(version):
    path = random_path()
    TEST_FILES.append(path)
@@ -479,6 +502,7 @@ def test_overwritten_file(version):
    _check_pandas_roundtrip(df, path=path, version=version)


+@pytest.mark.pandas
def test_filelike_objects(version):
    buf = io.BytesIO()

@@ -493,6 +517,7 @@ def test_filelike_objects(version):
    assert_frame_equal(result, df)


+@pytest.mark.pandas
@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning")
@pytest.mark.filterwarnings("ignore:DataFrame.to_sparse:FutureWarning")
def test_sparse_dataframe(version):
@@ -506,6 +531,7 @@ def test_sparse_dataframe(version):
    _check_pandas_roundtrip(df, expected, version=version)


+@pytest.mark.pandas
def test_duplicate_columns():

    # https://github.com/wesm/feather/issues/53
@@ -515,6 +541,7 @@ def test_duplicate_columns():
    _assert_error_on_write(df, ValueError)


+@pytest.mark.pandas
def test_unsupported():
    # https://github.com/wesm/feather/issues/240
    # serializing actual python objects
@@ -531,6 +558,7 @@ class A:
    _assert_error_on_write(df, TypeError)


+@pytest.mark.pandas
def test_v2_set_chunksize():
    df = pd.DataFrame({'A': np.arange(1000)})
    table = pa.table(df)
@@ -545,6 +573,7 @@ def test_v2_set_chunksize():
    assert len(ipc_file.get_batch(0)) == 250


+@pytest.mark.pandas
def test_v2_compression_options():
    df = pd.DataFrame({'A': np.arange(1000)})

@@ -597,12 +626,14 @@ def test_v1_unsupported_types():


@pytest.mark.slow
+@pytest.mark.pandas
def test_large_dataframe(version):
    df = pd.DataFrame({'A': np.arange(400000000)})
    _check_pandas_roundtrip(df, version=version)


@pytest.mark.large_memory
+@pytest.mark.pandas
def test_chunked_binary_error_message():
    # ARROW-3058: As Feather does not yet support chunked columns, we at least
    # make sure it's clear to the user what is going on
@@ -623,3 +654,11 @@ def test_chunked_binary_error_message():
                       "capacity of a Feather binary column. This restriction "
                       "may be lifted in the future"):
        write_feather(df, io.BytesIO(), version=1)
+
+
+def test_feather_without_pandas(tempdir, version):
+    # ARROW-8345
+    table = pa.table([pa.array([1, 2, 3])], names=['f0'])
+    write_feather(table, str(tempdir / "data.feather"), version=version)
+    result = read_table(str(tempdir / "data.feather"))
+    assert result.equals(table)
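
With the module-level pytestmark removed and each pandas-dependent test marked individually, the Feather tests can be exercised in an environment without pandas by deselecting that marker, for example (illustrative invocation, run from the python/ directory):

pytest pyarrow/tests/test_feather.py -m "not pandas"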
