Skip to content

Commit

Permalink
Merge d4280dc into 40d8ad0
Browse files Browse the repository at this point in the history
  • Loading branch information
jrwdunham committed Sep 25, 2018
2 parents 40d8ad0 + d4280dc commit d1a6baa
Show file tree
Hide file tree
Showing 10 changed files with 555 additions and 22 deletions.
377 changes: 377 additions & 0 deletions fixtures/mets_invalid_xlink_hrefs.xml

Large diffs are not rendered by default.

50 changes: 40 additions & 10 deletions metsrw/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,9 @@
SCHEMA_LOCATIONS,
lxmlns,
FILE_ID_PREFIX,
GROUP_ID_PREFIX
GROUP_ID_PREFIX,
urlencode,
urldecode,
)
from .validate import (
METS_XSD_PATH,
Expand Down Expand Up @@ -43,12 +45,40 @@
LOGGER.addHandler(logging.NullHandler())
__version__ = '0.2.2'

__all__ = ['MetsError', 'ParseError', 'FSEntry', 'AMDSec', 'SubSection',
'MDRef', 'MDWrap', 'METSDocument', 'NAMESPACES', 'SCHEMA_LOCATIONS',
'lxmlns', 'FILE_ID_PREFIX', 'GROUP_ID_PREFIX', 'METS_XSD_PATH',
'AM_SCT_PATH', 'AM_PNTR_SCT_PATH', 'get_schematron', 'validate',
'get_xmlschema', 'xsd_validate', 'schematron_validate',
'sct_report_string', 'xsd_error_log_string', 'report_string',
'FeatureBroker', 'set_feature_broker_to_default_state',
'feature_broker', 'Dependency', 'has_class_methods', 'has_methods',
'is_class', 'plugins', '__version__']
# Public API of the package: the names exported by ``from metsrw import *``.
# Entries are kept in ASCII sort order, one per line, so that additions
# produce minimal diffs.
# NOTE(review): ``SerializeError`` is not listed here although ``ParseError``
# is -- confirm whether it should also be imported and exported.
__all__ = [
'AMDSec',
'AM_PNTR_SCT_PATH',
'AM_SCT_PATH',
'Dependency',
'FILE_ID_PREFIX',
'FSEntry',
'FeatureBroker',
'GROUP_ID_PREFIX',
'MDRef',
'MDWrap',
'METSDocument',
'METS_XSD_PATH',
'MetsError',
'NAMESPACES',
'ParseError',
'SCHEMA_LOCATIONS',
'SubSection',
'__version__',
'feature_broker',
'get_schematron',
'get_xmlschema',
'has_class_methods',
'has_methods',
'is_class',
'lxmlns',
'plugins',
'report_string',
'schematron_validate',
'sct_report_string',
'set_feature_broker_to_default_state',
'urldecode',
'urlencode',
'validate',
'xsd_error_log_string',
'xsd_validate',
]
6 changes: 4 additions & 2 deletions metsrw/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,11 @@

class MetsError(Exception):
    """Root of this package's exception hierarchy.

    ``ParseError`` and ``SerializeError`` derive from it, so catching
    ``MetsError`` catches both.
    """


class ParseError(MetsError):
    """Raised when a METS file cannot be parsed.

    For example, when an ``xlink:href`` attribute is missing or does not
    hold a valid URL.
    """


class SerializeError(MetsError):
    """Raised when a METS file cannot be serialized.

    For example, when a path or target destined for an ``xlink:href``
    attribute is not a valid URL.
    """
9 changes: 7 additions & 2 deletions metsrw/fsentry.py
Original file line number Diff line number Diff line change
Expand Up @@ -337,8 +337,13 @@ def serialize_filesec(self):
if self.path:
flocat = etree.SubElement(el, utils.lxmlns('mets') + 'FLocat')
# Setting manually so order is correct
flocat.set(
utils.lxmlns('xlink') + 'href', utils.urlencode(self.path))
try:
flocat.set(
utils.lxmlns('xlink') + 'href', utils.urlencode(self.path))
except ValueError:
raise exceptions.SerializeError(
'Value "{}" (for attribute xlink:href) is not a valid'
' URL.'.format(self.path))
flocat.set('LOCTYPE', 'OTHER')
flocat.set('OTHERLOCTYPE', 'SYSTEM')
for transform_file in self.transform_files:
Expand Down
15 changes: 13 additions & 2 deletions metsrw/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,12 @@ def parse(cls, root):
target = root.get(utils.lxmlns('xlink') + 'href')
if not target:
raise exceptions.ParseError('mdRef must have an xlink:href.')
try:
target = utils.urldecode(target)
except ValueError:
raise exceptions.ParseError(
'Value "{}" (of attribute xlink:href) is not a valid'
' URL.'.format(target))
loctype = root.get('LOCTYPE')
if not loctype:
raise exceptions.ParseError('mdRef must have a LOCTYPE')
Expand All @@ -277,8 +283,13 @@ def serialize(self):
if self.label:
el.attrib['LABEL'] = self.label
if self.target:
el.attrib[utils.lxmlns('xlink') + 'href'] = \
utils.urlencode(self.target)
try:
el.attrib[utils.lxmlns('xlink') + 'href'] = \
utils.urlencode(self.target)
except ValueError:
raise exceptions.SerializeError(
'Value "{}" (for attribute xlink:href) is not a valid'
' URL.'.format(self.target))
el.attrib['MDTYPE'] = self.mdtype
el.attrib['LOCTYPE'] = self.loctype
if self.otherloctype:
Expand Down
6 changes: 6 additions & 0 deletions metsrw/mets.py
Original file line number Diff line number Diff line change
Expand Up @@ -387,6 +387,12 @@ def _analyze_fptr(elem, tree, entry_type):
path = file_elem.find(
'mets:FLocat', namespaces=utils.NAMESPACES).get(
utils.lxmlns('xlink') + 'href')
try:
path = utils.urldecode(path)
except ValueError:
raise exceptions.ParseError(
'Value "{}" (of attribute xlink:href) is not a valid'
' URL.'.format(path))
amdids = file_elem.get('ADMID')
checksum = file_elem.get('CHECKSUM')
checksumtype = file_elem.get('CHECKSUMTYPE')
Expand Down
34 changes: 28 additions & 6 deletions metsrw/utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
# -*- coding: utf-8 -*-
from six.moves.urllib.parse import quote_plus, urlparse, urlunparse
from six.moves.urllib.parse import (
quote_plus,
unquote_plus,
urlparse,
urlunparse,
)


####################################
Expand Down Expand Up @@ -37,12 +42,29 @@ def lxmlns(arg):
# HELPERS FOR MANIPULATING URLS #
#################################

def urlencode(url):
"""Replace unsafe ASCII characters using percent encoding as per RFC3986:
https://tools.ietf.org/html/rfc3986#section-2.1.
# Attribute names of the ``urlparse`` result that are encoded/decoded by the
# helpers below. ``scheme`` and ``netloc`` are deliberately absent, so they
# pass through untouched.
URL_ENCODABLE_PARTS = ('path', 'params', 'query', 'fragment')


def _urlendecode(url, func):
    """Encode or decode ``url`` by applying ``func`` to all of its
    URL-encodable parts.

    ``url`` is split with ``urlparse``; every component named in
    ``URL_ENCODABLE_PARTS`` is passed through ``func`` (a callable taking
    and returning a string) and the result is reassembled with
    ``urlunparse``. Components not listed there are left as they are.

    Any ``ValueError`` raised by ``urlparse`` for a URL it cannot parse is
    propagated to the caller.
    """
    parsed = urlparse(url)
    # Build all replacements first so the named tuple is rebuilt only once.
    transformed = {
        attr: func(getattr(parsed, attr)) for attr in URL_ENCODABLE_PARTS}
    return urlunparse(parsed._replace(**transformed))


def urlencode(url):
    """Percent-encode the unsafe ASCII characters of ``url`` (see RFC3986:
    https://tools.ietf.org/html/rfc3986#section-2.1).

    Each URL-encodable part is run through ``quote_plus`` with ``/`` kept
    literal, so spaces are emitted as ``+``. A ``ValueError`` propagates if
    the URL cannot be parsed.
    """
    def _quote(part):
        return quote_plus(part, safe='/')

    return _urlendecode(url, _quote)


def urldecode(url):
    """Undo the percent encoding applied by ``urlencode`` (see RFC3986:
    https://tools.ietf.org/html/rfc3986#section-2.1).

    Each URL-encodable part is run through ``unquote_plus``, so ``+`` is
    decoded to a space. A ``ValueError`` propagates if the URL cannot be
    parsed.
    """
    decoded = _urlendecode(url, unquote_plus)
    return decoded
30 changes: 30 additions & 0 deletions tests/test_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -279,6 +279,21 @@ def test_parse_no_loctype(self):
metsrw.MDRef.parse(bad)
assert 'LOCTYPE' in e.value

def test_url_encoding(self):
    """A parseable target is URL-encoded when serialized into xlink:href;
    an unparseable one makes ``serialize`` raise ``SerializeError``.
    """
    good = metsrw.MDRef(
        '30_CFLQ_271_13-3-13_1524[1].pdf', 'PREMIS:DUMMY', 'URL')
    serialized = good.serialize()
    href = serialized.get(metsrw.lxmlns('xlink') + 'href')
    assert href == '30_CFLQ_271_13-3-13_1524%5B1%5D.pdf'
    expect_failure = pytest.raises(
        metsrw.exceptions.SerializeError, match='is not a valid URL.')
    with expect_failure:
        bad = metsrw.MDRef(
            'http://foo[bar.com/hello[1].pdf', 'PREMIS:DUMMY', 'URL')
        bad.serialize()


class TestMDWrap(TestCase):
""" Test MDWrap class. """
Expand Down Expand Up @@ -356,3 +371,18 @@ def test_roundtrip(self):
assert elem[0].tag == '{http://www.loc.gov/METS/}xmlData'
assert len(elem[0].attrib) == 0
assert elem[0][0].tag == 'foo'

def test_url_decoding(self):
    """A percent-encoded xlink:href is decoded into ``MDRef.target`` on
    parse; an unparseable URL makes ``parse`` raise ``ParseError``.
    """
    mdref_tag = '{http://www.loc.gov/METS/}mdRef'
    href_attr = '{http://www.w3.org/1999/xlink}href'

    good = etree.Element(mdref_tag, MDTYPE='dummy', LOCTYPE='URL')
    good.set(href_attr, '30_CFLQ_271_13-3-13_1524%5B1%5D.pdf')
    parsed = metsrw.MDRef.parse(good)
    assert parsed.target == '30_CFLQ_271_13-3-13_1524[1].pdf'

    with pytest.raises(metsrw.exceptions.ParseError,
                       match='is not a valid URL'):
        bad = etree.Element(mdref_tag, MDTYPE='dummy', LOCTYPE='URL')
        bad.set(href_attr, 'http://foo[bar.com/hello[1].pdf')
        metsrw.MDRef.parse(bad)
22 changes: 22 additions & 0 deletions tests/test_mets.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,15 @@ def test_mets_header_lastmoddate(self):
assert header.attrib['LASTMODDATE'] == new_date
assert header.attrib['CREATEDATE'] < header.attrib['LASTMODDATE']

def test_fromfile_invalid_xlink_href(self):
    """``fromfile`` must raise ``ParseError`` when the source METS has an
    xlink:href value that is not a parseable URL.
    """
    fixture_path = 'fixtures/mets_invalid_xlink_hrefs.xml'
    expect_failure = pytest.raises(
        metsrw.exceptions.ParseError, match='is not a valid URL.')
    with expect_failure:
        metsrw.METSDocument.fromfile(fixture_path)


class TestWholeMETS(TestCase):
""" Test integration between classes. """
Expand Down Expand Up @@ -619,3 +628,16 @@ def test_read_method_and_sequence_behaviour(self):
[fse.path for fse in mets2[:2]] ==
[fse.path for fse in mets3[:2]] ==
[fse.path for fse in mets4[:2]])

def test_files_invalid_path(self):
    """Serializing a METS document that holds an FSEntry whose path
    cannot be parsed as a URL must raise ``SerializeError``.
    """
    bad_path = 'http://foo[bar.com/hello[1].pdf'
    entry = metsrw.FSEntry(bad_path, file_uuid=str(uuid.uuid4()))
    document = metsrw.METSDocument()
    document.append_file(entry)
    expect_failure = pytest.raises(
        metsrw.exceptions.SerializeError, match='is not a valid URL.')
    with expect_failure:
        document.serialize()
28 changes: 28 additions & 0 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# -*- coding: utf-8 -*-

import pytest

import metsrw


# Bare paths and well-formed URLs that must survive an urlencode/urldecode
# round trip unchanged.
GOOD_PATHS_SLASH_URLS = (
'30_CFLQ_271_13-3-13_1524[1].pdf',
'30/CFLQ_271_13-3-13_1524[1].pdf',
r'30\ CFLQ_271_13-3-13_1524[1].pdf',
'/foo/bar[baz/hello', # urllib.urlparse will accept this because it's a URL with just a path.
'http://foobar.com/hello[1].pdf',
)

# urllib.urlparse will choke on these and raise ValueError because of the
# unbalanced bracket in the netloc part.
BAD_URLS = (
'http://foo[bar.com/hello[1].pdf',
)


def test_url_encoding():
    """Good paths/URLs round-trip through urlencode then urldecode; URLs
    that cannot be parsed make ``urlencode`` raise ``ValueError``.
    """
    for good in GOOD_PATHS_SLASH_URLS:
        encoded = metsrw.urlencode(good)
        assert good == metsrw.urldecode(encoded)
    for bad in BAD_URLS:
        with pytest.raises(ValueError):
            metsrw.urlencode(bad)

0 comments on commit d1a6baa

Please sign in to comment.