Skip to content

Commit

Permalink
ARROW-8057: [Python] Do not compare schema metadata in Schema.equals and Table.equals by default
Browse files Browse the repository at this point in the history

It seems that in practice checking for metadata equality is the less common mode when comparing schemas, as evidenced by the proliferation of `check_metadata=False` in our test suite and the issue reported in ARROW-8057 after the introduction of Parquet field_id metadata in ARROW-7080. While this is an API change, I don't think it will affect very many people.

Closes #6569 from wesm/ARROW-8057

Authored-by: Wes McKinney <wesm+git@apache.org>
Signed-off-by: Wes McKinney <wesm+git@apache.org>
  • Loading branch information
wesm committed Mar 11, 2020
1 parent 018dd80 commit 3beff20
Show file tree
Hide file tree
Showing 9 changed files with 91 additions and 85 deletions.
2 changes: 1 addition & 1 deletion python/pyarrow/includes/libarrow.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -335,7 +335,7 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
c_bool nullable()

c_string ToString()
c_bool Equals(const CField& other)
c_bool Equals(const CField& other, c_bool check_metadata)

shared_ptr[const CKeyValueMetadata] metadata()

Expand Down
6 changes: 3 additions & 3 deletions python/pyarrow/table.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -1113,15 +1113,15 @@ cdef class Table(_PandasConvertible):
except TypeError:
return NotImplemented

def equals(self, Table other, bint check_metadata=True):
def equals(self, Table other, bint check_metadata=False):
"""
Check if contents of two tables are equal
Parameters
----------
other : pyarrow.Table
check_metadata : bool, default True
Whether metadata equality should be checked as well.
check_metadata : bool, default False
Whether schema metadata equality should be checked as well.
Returns
-------
Expand Down
36 changes: 18 additions & 18 deletions python/pyarrow/tests/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -525,7 +525,7 @@ def test_partitioning_factory(mockfs):
("group", pa.int32()),
("key", pa.string()),
])
assert inspected_schema.equals(expected_schema, check_metadata=False)
assert inspected_schema.equals(expected_schema)

hive_partitioning_factory = ds.HivePartitioning.discover()
assert isinstance(hive_partitioning_factory, ds.PartitioningFactory)
Expand Down Expand Up @@ -592,21 +592,21 @@ def _check_dataset_from_path(path, table, **kwargs):
# pathlib object
assert isinstance(path, pathlib.Path)
dataset = ds.dataset(ds.factory(path, **kwargs))
assert dataset.schema.equals(table.schema, check_metadata=False)
assert dataset.schema.equals(table.schema)
result = dataset.to_table(use_threads=False) # deterministic row order
assert result.equals(table, check_metadata=False)
assert result.equals(table)

# string path
dataset = ds.dataset(ds.factory(str(path), **kwargs))
assert dataset.schema.equals(table.schema, check_metadata=False)
assert dataset.schema.equals(table.schema)
result = dataset.to_table(use_threads=False) # deterministic row order
assert result.equals(table, check_metadata=False)
assert result.equals(table)

# passing directly to dataset
dataset = ds.dataset(str(path), **kwargs)
assert dataset.schema.equals(table.schema, check_metadata=False)
assert dataset.schema.equals(table.schema)
result = dataset.to_table(use_threads=False) # deterministic row order
assert result.equals(table, check_metadata=False)
assert result.equals(table)


@pytest.mark.parquet
Expand Down Expand Up @@ -634,9 +634,9 @@ def test_open_dataset_list_of_files(tempdir):
ds.dataset(ds.factory([str(path1), str(path2)]))
]
for dataset in datasets:
assert dataset.schema.equals(table.schema, check_metadata=False)
assert dataset.schema.equals(table.schema)
result = dataset.to_table(use_threads=False) # deterministic row order
assert result.equals(table, check_metadata=False)
assert result.equals(table)


@pytest.mark.skipif(sys.platform == "win32", reason="fails on windows")
Expand All @@ -657,24 +657,24 @@ def test_open_dataset_partitioned_directory(tempdir):
dataset = ds.dataset(
str(tempdir), partitioning=ds.partitioning(flavor="hive"))
expected_schema = table.schema.append(pa.field("part", pa.int32()))
assert dataset.schema.equals(expected_schema, check_metadata=False)
assert dataset.schema.equals(expected_schema)

# specify partition scheme with string short-cut
dataset = ds.dataset(str(tempdir), partitioning="hive")
assert dataset.schema.equals(expected_schema, check_metadata=False)
assert dataset.schema.equals(expected_schema)

# specify partition scheme with explicit scheme
dataset = ds.dataset(
str(tempdir),
partitioning=ds.partitioning(
pa.schema([("part", pa.int8())]), flavor="hive"))
expected_schema = table.schema.append(pa.field("part", pa.int8()))
assert dataset.schema.equals(expected_schema, check_metadata=False)
assert dataset.schema.equals(expected_schema)

result = dataset.to_table(use_threads=False)
expected = full_table.append_column(
"part", pa.array(np.repeat([0, 1, 2], 9), type=pa.int8()))
assert result.equals(expected, check_metadata=False)
assert result.equals(expected)


@pytest.mark.parquet
Expand All @@ -684,11 +684,11 @@ def test_open_dataset_filesystem(tempdir):

# filesystem inferred from path
dataset1 = ds.dataset(str(path))
assert dataset1.schema.equals(table.schema, check_metadata=False)
assert dataset1.schema.equals(table.schema)

# filesystem specified
dataset2 = ds.dataset(str(path), filesystem=fs.LocalFileSystem())
assert dataset2.schema.equals(table.schema, check_metadata=False)
assert dataset2.schema.equals(table.schema)

# passing different filesystem
with pytest.raises(FileNotFoundError):
Expand Down Expand Up @@ -764,7 +764,7 @@ def test_multiple_factories(multisourcefs):
('month', pa.int32()),
('year', pa.int32()),
])
assert assembled.schema.equals(expected_schema, check_metadata=False)
assert assembled.schema.equals(expected_schema)


def test_multiple_factories_with_selectors(multisourcefs):
Expand All @@ -777,7 +777,7 @@ def test_multiple_factories_with_selectors(multisourcefs):
('value', pa.float64()),
('color', pa.string())
])
assert dataset.schema.equals(expected_schema, check_metadata=False)
assert dataset.schema.equals(expected_schema)

# with hive partitioning for two hive sources
dataset = ds.dataset(['/hive', '/hive_color'], filesystem=multisourcefs,
Expand All @@ -790,7 +790,7 @@ def test_multiple_factories_with_selectors(multisourcefs):
('month', pa.int32()),
('year', pa.int32())
])
assert dataset.schema.equals(expected_schema, check_metadata=False)
assert dataset.schema.equals(expected_schema)


def test_ipc_format(tempdir):
Expand Down
16 changes: 8 additions & 8 deletions python/pyarrow/tests/test_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ def _check_pandas_roundtrip(df, expected=None, use_threads=False,
if expected_schema:
# all occurrences of _check_pandas_roundtrip passes expected_schema
# without the pandas generated key-value metadata
assert table.schema.equals(expected_schema, check_metadata=False)
assert table.schema.equals(expected_schema)

if expected is None:
expected = df
Expand Down Expand Up @@ -458,7 +458,7 @@ def test_ignore_metadata(self):
expected = (table.cast(table.schema.remove_metadata())
.to_pandas())

assert result.equals(expected)
tm.assert_frame_equal(result, expected)

def test_list_metadata(self):
df = pd.DataFrame({'data': [[1], [2, 3, 4], [5] * 7]})
Expand Down Expand Up @@ -2771,8 +2771,8 @@ def test_table_from_pandas_keeps_column_order_of_dataframe():
table1 = pa.Table.from_pandas(df1, preserve_index=False)
table2 = pa.Table.from_pandas(df2, preserve_index=False)

assert table1.schema.equals(schema1, check_metadata=False)
assert table2.schema.equals(schema2, check_metadata=False)
assert table1.schema.equals(schema1)
assert table2.schema.equals(schema2)


def test_table_from_pandas_keeps_column_order_of_schema():
Expand All @@ -2795,8 +2795,8 @@ def test_table_from_pandas_keeps_column_order_of_schema():
table1 = pa.Table.from_pandas(df1, schema=schema, preserve_index=False)
table2 = pa.Table.from_pandas(df2, schema=schema, preserve_index=False)

assert table1.schema.equals(schema, check_metadata=False)
assert table1.schema.equals(table2.schema, check_metadata=False)
assert table1.schema.equals(schema)
assert table1.schema.equals(table2.schema)


def test_table_from_pandas_columns_argument_only_does_filtering():
Expand All @@ -2822,8 +2822,8 @@ def test_table_from_pandas_columns_argument_only_does_filtering():
table1 = pa.Table.from_pandas(df, columns=columns1, preserve_index=False)
table2 = pa.Table.from_pandas(df, columns=columns2, preserve_index=False)

assert table1.schema.equals(schema1, check_metadata=False)
assert table2.schema.equals(schema2, check_metadata=False)
assert table1.schema.equals(schema1)
assert table2.schema.equals(schema2)


def test_table_from_pandas_columns_and_schema_are_mutually_exclusive():
Expand Down
Loading

0 comments on commit 3beff20

Please sign in to comment.