diff --git a/fixtures/mets_invalid_xlink_hrefs.xml b/fixtures/mets_invalid_xlink_hrefs.xml new file mode 100644 index 0000000..d14d761 --- /dev/null +++ b/fixtures/mets_invalid_xlink_hrefs.xml @@ -0,0 +1,377 @@ + + + + + + + + Landing zone + July 27, 2009 + Description field 1 + Donated by E. Peters + + + + + + + + + Pretty marbles + 1988 + Description field 2 + Gift of Marbles Company Inc. + + + + + + + + + + + UUID + ab5c67fc-8f80-4e46-9f20-8d5ae29c43f2 + + + 0 + + sha256 + a469c730e705d757d66f53f38bb4455e89d5691a3d87fc7bc069b91fa2a50d46 + + 1361321 + + + JPEG 1.01 + 1.01 + + + PRONOM + fmt/43 + + + Stuff + + %transferDirectory%objects/Landing zone.jpg + + derivation + is source of + + UUID + b1b30981-fb00-41e2-81d9-35b1e567cf46 + + + UUID + + + + + derivation + is source of + + UUID + e284d015-cfb0-45dd-961d-512bf0f47cf6 + + + UUID + cd26ff34-5839-406f-a87e-a79f4f74a2bd + + + + + + + + + + + + UUID + d0356992-1a49-4c9a-a74d-e7caceccfd5f + + ingestion + 2014-07-23T21:48:03 + + + + + + + + + Archivematica user pk + 1 + + + preservation system + Archivematica-1.2 + + + repository code + demo + + + + + + + + + + + UUID + 41b002d3-9cfd-41de-9bb2-799a55a214be + + message digest calculation + 2014-07-23T21:48:03 + program="python"; module="hashlib.sha256()" + + + + a469c730e705d757d66f53f38bb4455e89d5691a3d87fc7bc069b91fa2a50d46 + + + + Archivematica user pk + 1 + + + preservation system + Archivematica-1.2 + + + repository code + demo + + + + + + + + + + + UUID + 0db78377-5ae2-4412-a0f9-149c09199f34 + + virus check + 2014-07-23T21:48:04 + program="Clam AV"; version="ClamAV 0.98.1"; virusDefinitions="19216/Wed Jul 23 13:21:25 2014 +" + + Pass + + + + + + Archivematica user pk + 1 + + + preservation system + Archivematica-1.2 + + + repository code + demo + + + + + + + + + + + UUID + 67fd04d4-4d29-42e0-8e12-f0bfac22b15d + + name cleanup + 2014-07-23T21:48:06 + prohibited characters removed:program="sanitizeNames"; version="1.10.1f3eae402ba6a6857be5a0f7a03e3c9d02cadf98" + + + + Original name="%transferDirectory%objects/Landing zone.jpg"; cleaned up name="%transferDirectory%objects/Landing_zone.jpg" + + + + Archivematica user pk + 1 + + + preservation system + Archivematica-1.2 + + + repository code + demo + + + + + + + + + + + UUID + 26327392-0b23-41a4-8954-39749ccc5d0b + + format identification + 2014-07-23T21:48:08 + program="Fido"; version="1" + + Positive + + fmt/43 + + + + Archivematica user pk + 1 + + + preservation system + Archivematica-1.2 + + + repository code + demo + + + + + + + + + + + UUID + cd26ff34-5839-406f-a87e-a79f4f74a2bd + + normalization + 2014-07-23T00:00:00 + ArchivematicaFPRCommandID="a34ddc9b-c922-4bb6-8037-bbe713332175"; program="convert"; version="Version: ImageMagick 6.6.9-7 2014-03-06 Q16 http://www.imagemagick.org" + + + + + %SIPDirectory%objects/Landing_zone-fc33fc0e-40ef-4ad9-ba52-860368e8ce5a.tif + + + + Archivematica user pk + 1 + + + preservation system + Archivematica-1.2 + + + repository code + demo + + + + + + + + + + + UUID + 30f6ce4d-ba1c-4c4c-a63a-fe2f4d06a0b3 + + fixity check + 2014-07-23T21:48:29 + program="python"; module="hashlib.sha256()" + + Pass + + a469c730e705d757d66f53f38bb4455e89d5691a3d87fc7bc069b91fa2a50d46 verified + + + + Archivematica user pk + 1 + + + preservation system + Archivematica-1.2 + + + repository code + demo + + + + + + + + + + + preservation system + Archivematica-1.2 + + Archivematica + software + + + + + + + + + + repository code + demo + + demo + organization + + + + + + + + + + Archivematica user pk + 1 + + username="demo", first_name="", last_name="" + Archivematica user + + + + + + + + + + + + + +
+
+
+ +
+
+
+
+
diff --git a/metsrw/__init__.py b/metsrw/__init__.py index 13baa10..ac5b3b8 100644 --- a/metsrw/__init__.py +++ b/metsrw/__init__.py @@ -13,7 +13,9 @@ SCHEMA_LOCATIONS, lxmlns, FILE_ID_PREFIX, - GROUP_ID_PREFIX + GROUP_ID_PREFIX, + urlencode, + urldecode, ) from .validate import ( METS_XSD_PATH, @@ -43,12 +45,40 @@ LOGGER.addHandler(logging.NullHandler()) __version__ = '0.2.2' -__all__ = ['MetsError', 'ParseError', 'FSEntry', 'AMDSec', 'SubSection', - 'MDRef', 'MDWrap', 'METSDocument', 'NAMESPACES', 'SCHEMA_LOCATIONS', - 'lxmlns', 'FILE_ID_PREFIX', 'GROUP_ID_PREFIX', 'METS_XSD_PATH', - 'AM_SCT_PATH', 'AM_PNTR_SCT_PATH', 'get_schematron', 'validate', - 'get_xmlschema', 'xsd_validate', 'schematron_validate', - 'sct_report_string', 'xsd_error_log_string', 'report_string', - 'FeatureBroker', 'set_feature_broker_to_default_state', - 'feature_broker', 'Dependency', 'has_class_methods', 'has_methods', - 'is_class', 'plugins', '__version__'] +__all__ = [ + 'AMDSec', + 'AM_PNTR_SCT_PATH', + 'AM_SCT_PATH', + 'Dependency', + 'FILE_ID_PREFIX', + 'FSEntry', + 'FeatureBroker', + 'GROUP_ID_PREFIX', + 'MDRef', + 'MDWrap', + 'METSDocument', + 'METS_XSD_PATH', + 'MetsError', + 'NAMESPACES', + 'ParseError', + 'SCHEMA_LOCATIONS', + 'SubSection', + '__version__', + 'feature_broker', + 'get_schematron', + 'get_xmlschema', + 'has_class_methods', + 'has_methods', + 'is_class', + 'lxmlns', + 'plugins', + 'report_string', + 'schematron_validate', + 'sct_report_string', + 'set_feature_broker_to_default_state', + 'urldecode', + 'urlencode', + 'validate', + 'xsd_error_log_string', + 'xsd_validate', +] diff --git a/metsrw/exceptions.py b/metsrw/exceptions.py index 626e547..0220691 100644 --- a/metsrw/exceptions.py +++ b/metsrw/exceptions.py @@ -8,9 +8,11 @@ class MetsError(Exception): """ Base Exception for this module. """ - pass class ParseError(MetsError): """ Error parsing a METS file. """ - pass + + +class SerializeError(MetsError): + """ Error serializing a METS file. """ diff --git a/metsrw/fsentry.py b/metsrw/fsentry.py index 9c6e2a7..0103146 100644 --- a/metsrw/fsentry.py +++ b/metsrw/fsentry.py @@ -337,8 +337,13 @@ def serialize_filesec(self): if self.path: flocat = etree.SubElement(el, utils.lxmlns('mets') + 'FLocat') # Setting manually so order is correct - flocat.set( - utils.lxmlns('xlink') + 'href', utils.urlencode(self.path)) + try: + flocat.set( + utils.lxmlns('xlink') + 'href', utils.urlencode(self.path)) + except ValueError: + raise exceptions.SerializeError( + 'Value "{}" (for attribute xlink:href) is not a valid' + ' URL.'.format(self.path)) flocat.set('LOCTYPE', 'OTHER') flocat.set('OTHERLOCTYPE', 'SYSTEM') for transform_file in self.transform_files: diff --git a/metsrw/metadata.py b/metsrw/metadata.py index 98393e1..3d33167 100644 --- a/metsrw/metadata.py +++ b/metsrw/metadata.py @@ -251,6 +251,12 @@ def parse(cls, root): target = root.get(utils.lxmlns('xlink') + 'href') if not target: raise exceptions.ParseError('mdRef must have an xlink:href.') + try: + target = utils.urldecode(target) + except ValueError: + raise exceptions.ParseError( + 'Value "{}" (of attribute xlink:href) is not a valid' + ' URL.'.format(target)) loctype = root.get('LOCTYPE') if not loctype: raise exceptions.ParseError('mdRef must have a LOCTYPE') @@ -277,8 +283,13 @@ def serialize(self): if self.label: el.attrib['LABEL'] = self.label if self.target: - el.attrib[utils.lxmlns('xlink') + 'href'] = \ - utils.urlencode(self.target) + try: + el.attrib[utils.lxmlns('xlink') + 'href'] = \ + utils.urlencode(self.target) + except ValueError: + raise exceptions.SerializeError( + 'Value "{}" (for attribute xlink:href) is not a valid' + ' URL.'.format(self.target)) el.attrib['MDTYPE'] = self.mdtype el.attrib['LOCTYPE'] = self.loctype if self.otherloctype: diff --git a/metsrw/mets.py b/metsrw/mets.py index a1c0fff..a927396 100755 --- a/metsrw/mets.py +++ b/metsrw/mets.py @@ -387,6 +387,12 @@ def _analyze_fptr(elem, tree, entry_type): path = file_elem.find( 'mets:FLocat', namespaces=utils.NAMESPACES).get( utils.lxmlns('xlink') + 'href') + try: + path = utils.urldecode(path) + except ValueError: + raise exceptions.ParseError( + 'Value "{}" (of attribute xlink:href) is not a valid' + ' URL.'.format(path)) amdids = file_elem.get('ADMID') checksum = file_elem.get('CHECKSUM') checksumtype = file_elem.get('CHECKSUMTYPE') diff --git a/metsrw/utils.py b/metsrw/utils.py index d605cf2..e821d40 100644 --- a/metsrw/utils.py +++ b/metsrw/utils.py @@ -1,5 +1,10 @@ # -*- coding: utf-8 -*- -from six.moves.urllib.parse import quote_plus, urlparse, urlunparse +from six.moves.urllib.parse import ( + quote_plus, + unquote_plus, + urlparse, + urlunparse, +) #################################### @@ -37,12 +42,29 @@ def lxmlns(arg): # HELPERS FOR MANIPULATING URLS # ################################# -def urlencode(url): - """Replace unsafe ASCII characters using percent encoding as per RFC3986: - https://tools.ietf.org/html/rfc3986#section-2.1. +URL_ENCODABLE_PARTS = ('path', 'params', 'query', 'fragment') + + +def _urlendecode(url, func): + """Encode or decode ``url`` by applying ``func`` to all of its + URL-encodable parts. """ parsed = urlparse(url) - for attr in ('path', 'params', 'query', 'fragment'): + for attr in URL_ENCODABLE_PARTS: parsed = parsed._replace( - **{attr: quote_plus(getattr(parsed, attr), safe='/')}) + **{attr: func(getattr(parsed, attr))}) return urlunparse(parsed) + + +def urlencode(url): + """Replace unsafe ASCII characters using percent encoding as per RFC3986: + https://tools.ietf.org/html/rfc3986#section-2.1. + """ + return _urlendecode(url, lambda val: quote_plus(val, safe='/')) + + +def urldecode(url): + """Decode percent encoding introduced per RFC3986 + https://tools.ietf.org/html/rfc3986#section-2.1. + """ + return _urlendecode(url, unquote_plus) diff --git a/tests/test_metadata.py b/tests/test_metadata.py index b3a5b0e..2daaa5d 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -279,6 +279,21 @@ def test_parse_no_loctype(self): metsrw.MDRef.parse(bad) assert 'LOCTYPE' in e.value + def test_url_encoding(self): + """Good target values are URL-encoded when they show up in xlink:href + attributes; bad target values raise ``MetsError``. + """ + mdref = metsrw.MDRef( + '30_CFLQ_271_13-3-13_1524[1].pdf', 'PREMIS:DUMMY', 'URL') + mdreffed = mdref.serialize() + assert mdreffed.get(metsrw.lxmlns('xlink') + 'href') == ( + '30_CFLQ_271_13-3-13_1524%5B1%5D.pdf') + with pytest.raises(metsrw.exceptions.SerializeError, + match='is not a valid URL.'): + mdref = metsrw.MDRef( + 'http://foo[bar.com/hello[1].pdf', 'PREMIS:DUMMY', 'URL') + mdref.serialize() + class TestMDWrap(TestCase): """ Test MDWrap class. """ @@ -356,3 +371,18 @@ def test_roundtrip(self): assert elem[0].tag == '{http://www.loc.gov/METS/}xmlData' assert len(elem[0].attrib) == 0 assert elem[0][0].tag == 'foo' + + def test_url_decoding(self): + good = etree.Element( + '{http://www.loc.gov/METS/}mdRef', MDTYPE='dummy', LOCTYPE='URL') + good.set('{http://www.w3.org/1999/xlink}href', + '30_CFLQ_271_13-3-13_1524%5B1%5D.pdf') + mdref = metsrw.MDRef.parse(good) + assert mdref.target == '30_CFLQ_271_13-3-13_1524[1].pdf' + with pytest.raises(metsrw.exceptions.ParseError, + match='is not a valid URL'): + bad = etree.Element( + '{http://www.loc.gov/METS/}mdRef', MDTYPE='dummy', LOCTYPE='URL') + bad.set('{http://www.w3.org/1999/xlink}href', + 'http://foo[bar.com/hello[1].pdf') + metsrw.MDRef.parse(bad) diff --git a/tests/test_mets.py b/tests/test_mets.py index 9299471..6acb119 100644 --- a/tests/test_mets.py +++ b/tests/test_mets.py @@ -149,6 +149,15 @@ def test_mets_header_lastmoddate(self): assert header.attrib['LASTMODDATE'] == new_date assert header.attrib['CREATEDATE'] < header.attrib['LASTMODDATE'] + def test_fromfile_invalid_xlink_href(self): + """Test that ``fromfile`` raises ``ParseError`` if an xlink:href value + in the source METS contains an unparseable URL. + """ + with pytest.raises(metsrw.exceptions.ParseError, + match='is not a valid URL.'): + metsrw.METSDocument.fromfile( + 'fixtures/mets_invalid_xlink_hrefs.xml') + class TestWholeMETS(TestCase): """ Test integration between classes. """ @@ -619,3 +628,16 @@ def test_read_method_and_sequence_behaviour(self): [fse.path for fse in mets2[:2]] == [fse.path for fse in mets3[:2]] == [fse.path for fse in mets4[:2]]) + + def test_files_invalid_path(self): + """Test that if you try to set the path of a FSEntry to something that + urllib.urlparse cannot parse and then attempt to serialize the METS, + then you will trigger a MetsError. + """ + f1 = metsrw.FSEntry('http://foo[bar.com/hello[1].pdf', + file_uuid=str(uuid.uuid4())) + mw = metsrw.METSDocument() + mw.append_file(f1) + with pytest.raises(metsrw.exceptions.SerializeError, + match='is not a valid URL.'): + mw.serialize() diff --git a/tests/test_utils.py b/tests/test_utils.py new file mode 100644 index 0000000..f332585 --- /dev/null +++ b/tests/test_utils.py @@ -0,0 +1,28 @@ +# -*- coding: utf-8 -*- + +import pytest + +import metsrw + + +GOOD_PATHS_SLASH_URLS = ( + '30_CFLQ_271_13-3-13_1524[1].pdf', + '30/CFLQ_271_13-3-13_1524[1].pdf', + r'30\ CFLQ_271_13-3-13_1524[1].pdf', + '/foo/bar[baz/hello', # urllib.urlparse will accept this because it's a URL with just a path. + 'http://foobar.com/hello[1].pdf', +) + +# urllib.urlparse will choke on these and raise ValueError because of the +# unbalanced bracket in the netloc part. +BAD_URLS = ( + 'http://foo[bar.com/hello[1].pdf', +) + + +def test_url_encoding(): + for url in GOOD_PATHS_SLASH_URLS: + assert url == metsrw.urldecode(metsrw.urlencode(url)) + for url in BAD_URLS: + with pytest.raises(ValueError): + metsrw.urlencode(url)