Skip to content

Commit

Permalink
premisrw: always return unicode in text getter
Browse files Browse the repository at this point in the history
In metsrw 0.2.4 or older, premisrw was returning:

  - In Py2: binary (bytes)
  - In Py3: text (unicode)

In metsrw 0.3.0, the Py2 code path was updated to explicitly use the `utf-8`
codec, since the default (`ascii`) usually causes problems during encoding.

This commit updates the same code path again so that Py2 also returns unicode,
making the API more uniform.
  • Loading branch information
sevein committed Mar 15, 2019
1 parent c19fca6 commit 6cbf25f
Show file tree
Hide file tree
Showing 2 changed files with 40 additions and 4 deletions.
11 changes: 7 additions & 4 deletions metsrw/plugins/premisrw/premis.py
Original file line number Diff line number Diff line change
Expand Up @@ -699,10 +699,13 @@ def data_find_text(data, path):
if not isinstance(child, (tuple, list, dict))]
if not texts:
return None
if six.PY2:
return ' '.join(
[x.encode('utf-8', errors='ignore') for x in texts])
return ' '.join([str(x) for x in texts])
return ' '.join([
# How should we deal with decoding errors when `x` is binary?
# For now, we're using the ``strict`` mode. Other options here:
# https://docs.python.org/3/library/functions.html#open.
six.ensure_text(x, encoding='utf-8', errors='strict')
for x in texts
])


def data_find_text_or_all(data, path, dyn_cls=False):
Expand Down
33 changes: 33 additions & 0 deletions tests/plugins/premisrw/test_premis.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

import pytest
from lxml import etree
import six

import metsrw
import metsrw.plugins.premisrw as premisrw
Expand Down Expand Up @@ -36,6 +37,38 @@ def test_roundtrip_unicode(self):
assert data[2][1][1] == u'𝕡𝕣𝕖𝕤𝕖𝕣𝕧𝕒𝕥𝕚𝕠𝕟 𝕤𝕪𝕤𝕥𝕖𝕞'
assert data[2][2][1] == u'𝓊𝓃𝒾𝒸𝑜𝒹𝑒'

@pytest.mark.skipif(six.PY3, reason='lxml in py3 does not accept binary')
def test_roundtrip_unicode_from_binary(self):
    """Round-trip byte-string values and compare them to unicode literals.

    A PREMIS agent whose identifier type and value are byte strings is
    converted with ``data_to_premis`` and read back with
    ``premis_to_data``; the values read back must equal ``u'foo'`` and
    ``u'bar'``. Skipped when ``six.PY3`` is true.

    NOTE(review): on Python 2 an ASCII byte string compares equal to the
    matching unicode string, so these equality checks alone may not prove
    the returned type -- confirm whether an explicit type check is wanted.
    """
    identifier = (
        'agent_identifier',
        ('agent_identifier_type', b'foo'),
        ('agent_identifier_value', b'bar'),
    )
    agent = ('agent', premisrw.PREMIS_META, identifier)

    element = premisrw.data_to_premis(agent)
    roundtripped = premisrw.premis_to_data(element)

    # Index 2 of the returned data holds the agent identifier; its items
    # 1 and 2 are the (name, value) pairs for the type and the value.
    agent_identifier = roundtripped[2]
    assert agent_identifier[1][1] == u'foo'
    assert agent_identifier[2][1] == u'bar'

@pytest.mark.skipif(six.PY3, reason='lxml in py3 does not accept binary')
def test_with_invalid_binary(self):
    """Expect ``ValueError`` when a value is an invalid byte sequence.

    Builds an agent whose identifier value is a byte string containing
    non-ASCII bytes and asserts that ``data_to_premis`` raises
    ``ValueError`` for it. Skipped when ``six.PY3`` is true.
    """
    bad_bytes = b'\x78\x9a\xbc\xde\xf0'
    identifier = (
        'agent_identifier',
        ('agent_identifier_type', b'type'),
        ('agent_identifier_value', bad_bytes),
    )
    payload = ('agent', premisrw.PREMIS_META, identifier)

    with pytest.raises(ValueError):
        premisrw.data_to_premis(payload)

def test_premis_event_cls_data(self):
"""Tests that you can pass a Python tuple as the ``data`` argument to
``PREMISEvent`` to construct an instance.
Expand Down

0 comments on commit 6cbf25f

Please sign in to comment.