Skip to content

Commit

Permalink
ARROW-8342: [Python] Continue to return dict from "metadata" properti…
Browse files Browse the repository at this point in the history
…es accessing KeyValueMetadata

This patch relegates the KeyValueMetadata wrapper to an implementation detail, so existing third party code should be unaffected, and we can decide later when and how to expose this object publicly in the future without needing to revert the ARROW-8079.

I also fixed the change related to the "pandas" metadata key. Relatedly, I changed the KeyValueMetadata ctor to raise a KeyError if a use of the mixed-argument constructor (merging a prior object with some **kwargs) would create duplicate keys

Closes #6855 from wesm/ARROW-8342

Authored-by: Wes McKinney <wesm+git@apache.org>
Signed-off-by: Wes McKinney <wesm+git@apache.org>
  • Loading branch information
wesm committed Apr 8, 2020
1 parent e279a7e commit 1a1047a
Show file tree
Hide file tree
Showing 3 changed files with 72 additions and 22 deletions.
6 changes: 4 additions & 2 deletions python/pyarrow/pandas_compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@


import ast
from copy import deepcopy
from itertools import zip_longest
import json
import operator
Expand Down Expand Up @@ -234,7 +235,7 @@ def construct_metadata(df, column_names, index_levels, index_descriptors,
index_descriptors = index_column_metadata = column_indexes = []

return {
'pandas': json.dumps({
b'pandas': json.dumps({
'index_columns': index_descriptors,
'column_indexes': column_indexes,
'columns': column_metadata + index_column_metadata,
Expand Down Expand Up @@ -590,7 +591,8 @@ def convert_column(col, field):
pandas_metadata = construct_metadata(df, column_names, index_columns,
index_descriptors, preserve_index,
types)
metadata = pa.KeyValueMetadata(schema.metadata or {}, **pandas_metadata)
metadata = deepcopy(schema.metadata) if schema.metadata else dict()
metadata.update(pandas_metadata)
schema = schema.with_metadata(metadata)

return arrays, schema
Expand Down
24 changes: 19 additions & 5 deletions python/pyarrow/tests/test_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -622,16 +622,15 @@ def test_key_value_metadata():
('b', 'beta'),
('a', 'Alpha'),
('a', 'ALPHA'),
], b='BETA')
])

expected = [
(b'a', b'alpha'),
(b'b', b'beta'),
(b'a', b'Alpha'),
(b'a', b'ALPHA'),
(b'b', b'BETA')
(b'a', b'ALPHA')
]
assert len(md) == 5
assert len(md) == 4
assert isinstance(md.keys(), Iterator)
assert isinstance(md.values(), Iterator)
assert isinstance(md.items(), Iterator)
Expand All @@ -643,9 +642,24 @@ def test_key_value_metadata():
assert md['a'] == b'alpha'
assert md['b'] == b'beta'
assert md.get_all('a') == [b'alpha', b'Alpha', b'ALPHA']
assert md.get_all('b') == [b'beta', b'BETA']
assert md.get_all('b') == [b'beta']
assert md.get_all('unkown') == []

with pytest.raises(KeyError):
md = pa.KeyValueMetadata([
('a', 'alpha'),
('b', 'beta'),
('a', 'Alpha'),
('a', 'ALPHA'),
], b='BETA')


def test_key_value_metadata_duplicates():
meta = pa.KeyValueMetadata({'a': '1', 'b': '2'})

with pytest.raises(KeyError):
pa.KeyValueMetadata(meta, a='3')


def test_field_basic():
t = pa.string()
Expand Down
64 changes: 49 additions & 15 deletions python/pyarrow/types.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -824,28 +824,32 @@ def unregister_extension_type(type_name):

cdef class KeyValueMetadata(_Metadata, Mapping):

def __init__(self, *args, **kwargs):
def __init__(self, __arg0__=None, **kwargs):
cdef:
vector[c_string] keys, values
shared_ptr[const CKeyValueMetadata] meta
shared_ptr[const CKeyValueMetadata] result

items = []
if args:
if len(args) > 1:
raise TypeError('expected at most 1 positional argument, '
'got {}'.format(len(args)))
other = args[0]
items += other.items() if isinstance(other, Mapping) else other

items += kwargs.items()
if __arg0__ is not None:
other = (__arg0__.items() if isinstance(__arg0__, Mapping)
else __arg0__)
items.extend((tobytes(k), v) for k, v in other)

prior_keys = {k for k, v in items}
for k, v in kwargs.items():
k = tobytes(k)
if k in prior_keys:
raise KeyError("Duplicate key {}, "
"use pass all items as list of tuples if you "
"intend to have duplicate keys")
items.append((k, v))

keys.reserve(len(items))
for key, value in items:
keys.push_back(tobytes(key))
values.push_back(tobytes(value))

meta.reset(new CKeyValueMetadata(keys, values))
self.init(meta)
result.reset(new CKeyValueMetadata(move(keys), move(values)))
self.init(result)

cdef void init(self, const shared_ptr[const CKeyValueMetadata]& wrapped):
self.wrapped = wrapped
Expand All @@ -863,6 +867,9 @@ cdef class KeyValueMetadata(_Metadata, Mapping):
def equals(self, KeyValueMetadata other):
return self.metadata.Equals(deref(other.wrapped))

def __repr__(self):
return str(self)

def __str__(self):
return frombytes(self.metadata.ToString())

Expand Down Expand Up @@ -896,6 +903,12 @@ cdef class KeyValueMetadata(_Metadata, Mapping):
def __reduce__(self):
return KeyValueMetadata, (list(self.items()),)

def key(self, i):
return self.metadata.key(i)

def value(self, i):
return self.metadata.value(i)

def keys(self):
for i in range(self.metadata.size()):
yield self.metadata.key(i)
Expand All @@ -912,6 +925,19 @@ cdef class KeyValueMetadata(_Metadata, Mapping):
key = tobytes(key)
return [v for k, v in self.items() if k == key]

def to_dict(self):
"""
Convert KeyValueMetadata to dict. If a key occurs twice, the value for
the first one is returned
"""
cdef object key # to force coercion to Python
result = ordered_dict()
for i in range(self.metadata.size()):
key = self.metadata.key(i)
if key not in result:
result[key] = self.metadata.value(i)
return result


cdef KeyValueMetadata ensure_metadata(object meta, c_bool allow_none=False):
if allow_none and meta is None:
Expand Down Expand Up @@ -986,7 +1012,11 @@ cdef class Field:

@property
def metadata(self):
return pyarrow_wrap_metadata(self.field.metadata())
wrapped = pyarrow_wrap_metadata(self.field.metadata())
if wrapped is not None:
return wrapped.to_dict()
else:
return wrapped

def add_metadata(self, metadata):
warnings.warn("The 'add_metadata' method is deprecated, use "
Expand Down Expand Up @@ -1190,7 +1220,11 @@ cdef class Schema:

@property
def metadata(self):
return pyarrow_wrap_metadata(self.schema.metadata())
wrapped = pyarrow_wrap_metadata(self.schema.metadata())
if wrapped is not None:
return wrapped.to_dict()
else:
return wrapped

def __eq__(self, other):
try:
Expand Down

0 comments on commit 1a1047a

Please sign in to comment.