Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ARROW-3861: [Python] ParquetDataset.read() respect specified columns and not include partition columns #7050

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 2 additions & 0 deletions python/pyarrow/parquet.py
Expand Up @@ -722,6 +722,8 @@ def read(self, columns=None, use_threads=True, partitions=None,
# value as indicated. The distinct categories of the partition have
# been computed in the ParquetManifest
for i, (name, index) in enumerate(self.partition_keys):
if columns is not None and name not in columns:
continue
# The partition code is the same for all values in this piece
indices = np.full(len(table), index, dtype='i4')

Expand Down
15 changes: 15 additions & 0 deletions python/pyarrow/tests/test_parquet.py
Expand Up @@ -1697,6 +1697,21 @@ def test_create_parquet_dataset_multi_threaded(tempdir):
assert len(partitions.levels) == len(manifest.partitions.levels)


@pytest.mark.pandas
@parametrize_legacy_dataset
def test_read_partitioned_columns_selection(tempdir, use_legacy_dataset):
    """ARROW-3861: when a ``columns`` list is given to ``read()``, partition
    columns that were not requested must be absent from the result table."""
    filesystem = LocalFileSystem.get_instance()
    _partition_test_for_filesystem(filesystem, tempdir)

    dataset = pq.ParquetDataset(
        tempdir, use_legacy_dataset=use_legacy_dataset)
    table = dataset.read(columns=["values"])
    assert table.column_names == ["values"]


@pytest.mark.pandas
@parametrize_legacy_dataset
def test_equivalency(tempdir, use_legacy_dataset):
Expand Down