diff --git a/metsrw/plugins/premisrw/premis.py b/metsrw/plugins/premisrw/premis.py index 4e94418..6b9cf24 100644 --- a/metsrw/plugins/premisrw/premis.py +++ b/metsrw/plugins/premisrw/premis.py @@ -699,10 +699,13 @@ def data_find_text(data, path): if not isinstance(child, (tuple, list, dict))] if not texts: return None - if six.PY2: - return ' '.join( - [x.encode('utf-8', errors='ignore') for x in texts]) - return ' '.join([str(x) for x in texts]) + return ' '.join([ + # How should we deal with decoding errors when `x` is binary? + # For now, we're using the ``strict`` mode. Other options here: + # https://docs.python.org/3/library/functions.html#open. + six.ensure_text(x, encoding='utf-8', errors='strict') + for x in texts + ]) def data_find_text_or_all(data, path, dyn_cls=False): diff --git a/tests/plugins/premisrw/test_premis.py b/tests/plugins/premisrw/test_premis.py index 67645d4..a77b794 100644 --- a/tests/plugins/premisrw/test_premis.py +++ b/tests/plugins/premisrw/test_premis.py @@ -3,6 +3,7 @@ import pytest from lxml import etree +import six import metsrw import metsrw.plugins.premisrw as premisrw @@ -36,6 +37,38 @@ def test_roundtrip_unicode(self): assert data[2][1][1] == u'𝕑𝕣𝕖𝕀𝕖𝕣𝕧𝕒π•₯π•šπ• π•Ÿ 𝕀π•ͺ𝕀π•₯π•–π•ž' assert data[2][2][1] == u'π“Šπ“ƒπ’Ύπ’Έπ‘œπ’Ήπ‘’' + @pytest.mark.skipif(six.PY3, reason='lxml in py3 does not accept binary') + def test_roundtrip_unicode_from_binary(self): + """Test that premisrw returns unicode values in all cases.""" + lxml_el = premisrw.data_to_premis(( + 'agent', + premisrw.PREMIS_META, + ( + 'agent_identifier', + ('agent_identifier_type', b'foo'), + ('agent_identifier_value', b'bar'), + ) + )) + data = premisrw.premis_to_data(lxml_el) + assert data[2][1][1] == u'foo' + assert data[2][2][1] == u'bar' + + @pytest.mark.skipif(six.PY3, reason='lxml in py3 does not accept binary') + def test_with_invalid_binary(self): + """Test lxml's ``ValueError`` with invalid byte sequences.""" + invalid_sequence = b'\x78\x9a\xbc\xde\xf0' + data = (( + 'agent', + premisrw.PREMIS_META, + ( + 'agent_identifier', + ('agent_identifier_type', b'type'), + ('agent_identifier_value', invalid_sequence), + ) + )) + with pytest.raises(ValueError): + premisrw.data_to_premis(data) + def test_premis_event_cls_data(self): """Tests that you can pass a Python tuple as the ``data`` argument to ``PREMISEvent`` to construct an instance.