ARROW-15870: [Python] Start to raise deprecation warnings for use_legacy_dataset=True in parquet.read_table #12584

Closed
9 changes: 8 additions & 1 deletion python/pyarrow/parquet.py
@@ -1908,7 +1908,8 @@ def partitioning(self):
pyarrow 1.0.0. Among other things, this allows to pass `filters`
for all columns and not only the partition keys, enables
different partitioning schemes, etc.
Set to True to use the legacy behaviour.
Set to True to use the legacy behaviour (this option is deprecated,
and the legacy implementation will be removed in a future version).
ignore_prefixes : list, optional
Files matching any of these prefixes will be ignored by the
discovery process if use_legacy_dataset=False.
@@ -2006,6 +2007,12 @@ def read_table(source, columns=None, use_threads=True, metadata=None,
return dataset.read(columns=columns, use_threads=use_threads,
use_pandas_metadata=use_pandas_metadata)

warnings.warn(
"Passing 'use_legacy_dataset=True' to get the legacy behaviour is "
"deprecated as of pyarrow 8.0.0, and the legacy implementation will "
"be removed in a future version.",
DeprecationWarning, stacklevel=2)
Member

Do we want to use DeprecationWarning or FutureWarning?

Member Author

We can probably raise a FutureWarning directly, but all the other deprecations here still raise DeprecationWarning. I will do a separate PR to change all of those from DeprecationWarning to FutureWarning.
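For reference, the practical difference being weighed here comes from Python's default warning filters (standard library behaviour, not specific to this PR): `DeprecationWarning` is hidden by default unless it is triggered from code running in `__main__` (or the filters are changed, e.g. by pytest), while `FutureWarning` is always shown to end users. A minimal sketch:

```python
import warnings

# DeprecationWarning: aimed at library developers; Python's default filters
# suppress it unless the warning is triggered from __main__ or the filters
# have been adjusted (pytest, for instance, surfaces it).
warnings.warn("'use_legacy_dataset=True' is deprecated", DeprecationWarning,
              stacklevel=2)

# FutureWarning: aimed at end users; displayed by default wherever it fires.
warnings.warn("'use_legacy_dataset=True' is deprecated", FutureWarning,
              stacklevel=2)
```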


if ignore_prefixes is not None:
raise ValueError(
"The 'ignore_prefixes' keyword is only supported when "
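Together with the docstring change above, a minimal sketch of how the new warning surfaces to callers, and how code that still needs the legacy path can silence it until it migrates (the file name here is illustrative):

```python
import warnings

import pyarrow as pa
import pyarrow.parquet as pq

# Illustrative example file.
pq.write_table(pa.table({'a': [1, 2, 3]}), 'example.parquet')

# Passing use_legacy_dataset=True now emits the DeprecationWarning added above.
with warnings.catch_warnings(record=True) as record:
    warnings.simplefilter("always")
    pq.read_table('example.parquet', use_legacy_dataset=True)
assert any(issubclass(w.category, DeprecationWarning) for w in record)

# Callers that still rely on the legacy implementation can suppress the
# warning explicitly while they migrate.
warnings.filterwarnings(
    "ignore", message="Passing 'use_legacy_dataset=True'",
    category=DeprecationWarning)
pq.read_table('example.parquet', use_legacy_dataset=True)
```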
27 changes: 27 additions & 0 deletions python/pyarrow/tests/parquet/__init__.py
@@ -0,0 +1,27 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

import pytest

# Marks all of the tests in this module
# Ignore these with pytest ... -m 'not parquet'
pytestmark = [
pytest.mark.parquet,
pytest.mark.filterwarnings(
"ignore:Passing 'use_legacy_dataset=True':DeprecationWarning"
),
]
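The package-level mark keeps the rest of the suite quiet about the new deprecation, while individual tests can still assert that the warning fires, because `pytest.warns` records warnings inside its own context regardless of the filters configured above. A minimal sketch (the test name is illustrative; the PR's actual check is `test_read_table_legacy_deprecated` in `test_basic.py` below):

```python
import pytest

import pyarrow as pa
import pyarrow.parquet as pq


def test_legacy_dataset_warning_still_observable(tmp_path):
    path = tmp_path / 'data.parquet'
    pq.write_table(pa.table({'a': [1, 2, 3]}), path)
    # The "ignore" filter above does not affect pytest.warns, which captures
    # warnings independently of the configured filters.
    with pytest.warns(DeprecationWarning,
                      match="Passing 'use_legacy_dataset=True'"):
        pq.read_table(path, use_legacy_dataset=True)
```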
20 changes: 16 additions & 4 deletions python/pyarrow/tests/parquet/test_basic.py
@@ -45,9 +45,6 @@
pd = tm = None


pytestmark = pytest.mark.parquet


def test_parquet_invalid_version(tempdir):
table = pa.table({'a': [1, 2, 3]})
with pytest.raises(ValueError, match="Unsupported Parquet format version"):
@@ -593,7 +590,10 @@ def test_read_table_doesnt_warn(datadir, use_legacy_dataset):
pq.read_table(datadir / 'v0.7.1.parquet',
use_legacy_dataset=use_legacy_dataset)

assert len(record) == 0
if use_legacy_dataset:
assert len(record) == 1
else:
assert len(record) == 0


@pytest.mark.pandas
@@ -758,3 +758,15 @@ def test_permutation_of_column_order(tempdir):
names=['a', 'b'])

assert table == table2


def test_read_table_legacy_deprecated(tempdir):
# ARROW-15870
table = pa.table({'a': [1, 2, 3]})
path = tempdir / 'data.parquet'
pq.write_table(table, path)

with pytest.warns(
DeprecationWarning, match="Passing 'use_legacy_dataset=True'"
):
pq.read_table(path, use_legacy_dataset=True)
1 change: 0 additions & 1 deletion python/pyarrow/tests/parquet/test_compliant_nested_type.py
@@ -35,7 +35,6 @@
except ImportError:
pd = tm = None

pytestmark = pytest.mark.parquet

# Tests for ARROW-11497
_test_data_simple = [
3 changes: 0 additions & 3 deletions python/pyarrow/tests/parquet/test_data_types.py
@@ -44,9 +44,6 @@
pd = tm = None


pytestmark = pytest.mark.parquet


# General roundtrip of data types
# -----------------------------------------------------------------------------

2 changes: 0 additions & 2 deletions python/pyarrow/tests/parquet/test_dataset.py
@@ -46,8 +46,6 @@
except ImportError:
pd = tm = None

pytestmark = pytest.mark.parquet


@pytest.mark.pandas
def test_parquet_piece_read(tempdir):
3 changes: 0 additions & 3 deletions python/pyarrow/tests/parquet/test_datetime.py
@@ -41,9 +41,6 @@
pd = tm = None


pytestmark = pytest.mark.parquet


@pytest.mark.pandas
@parametrize_legacy_dataset
def test_pandas_parquet_datetime_tz(use_legacy_dataset):
3 changes: 0 additions & 3 deletions python/pyarrow/tests/parquet/test_metadata.py
@@ -40,9 +40,6 @@
pd = tm = None


pytestmark = pytest.mark.parquet


@pytest.mark.pandas
def test_parquet_metadata_api():
df = alltypes_sample(size=10000)
3 changes: 0 additions & 3 deletions python/pyarrow/tests/parquet/test_pandas.py
@@ -46,9 +46,6 @@
pd = tm = None


pytestmark = pytest.mark.parquet


@pytest.mark.pandas
def test_pandas_parquet_custom_metadata(tempdir):
df = alltypes_sample(size=10000)
2 changes: 0 additions & 2 deletions python/pyarrow/tests/parquet/test_parquet_file.py
@@ -36,8 +36,6 @@
except ImportError:
pd = tm = None

pytestmark = pytest.mark.parquet


@pytest.mark.pandas
def test_pass_separate_metadata():
2 changes: 0 additions & 2 deletions python/pyarrow/tests/parquet/test_parquet_writer.py
@@ -36,8 +36,6 @@
except ImportError:
pd = tm = None

pytestmark = pytest.mark.parquet


@pytest.mark.pandas
@parametrize_legacy_dataset