Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ARROW-3861: [Python] ParquetDataset.read() respect specified columns and not include partition columns #7050

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 2 additions & 0 deletions python/pyarrow/parquet.py
Expand Up @@ -722,6 +722,8 @@ def read(self, columns=None, use_threads=True, partitions=None,
# value as indicated. The distinct categories of the partition have
# been computed in the ParquetManifest
for i, (name, index) in enumerate(self.partition_keys):
if columns is not None and name not in columns:
continue
# The partition code is the same for all values in this piece
indices = np.full(len(table), index, dtype='i4')

Expand Down
15 changes: 15 additions & 0 deletions python/pyarrow/tests/test_parquet.py
Expand Up @@ -1697,6 +1697,21 @@ def test_create_parquet_dataset_multi_threaded(tempdir):
assert len(partitions.levels) == len(manifest.partitions.levels)


@pytest.mark.pandas
@parametrize_legacy_dataset
def test_read_partitioned_columns_selection(tempdir, use_legacy_dataset):
    """ARROW-3861: when a ``columns`` list is given to ``read()``, partition
    columns that were not requested must be absent from the result table."""
    filesystem = LocalFileSystem.get_instance()
    _partition_test_for_filesystem(filesystem, tempdir)

    dataset = pq.ParquetDataset(
        tempdir, use_legacy_dataset=use_legacy_dataset)
    table = dataset.read(columns=["values"])
    assert table.column_names == ["values"]


@pytest.mark.pandas
@parametrize_legacy_dataset
def test_equivalency(tempdir, use_legacy_dataset):
Expand Down