Skip to content

Commit

Permalink
Encode XLINK HREF values using URL Encoding
Browse files Browse the repository at this point in the history
This fix resolves archivematica/Issues#187 whereby an anyURI type
was not validating when it contained values that needed to be url
encoded.

Additionally this commit specifies a UTF-8 encoding for the `.py`
files in the code-base which just seems like good practice to
follow across the Archivematica repositories.
  • Loading branch information
ross-spencer committed Sep 24, 2018
1 parent 9490d8d commit 96ce254
Show file tree
Hide file tree
Showing 8 changed files with 78 additions and 36 deletions.
1 change: 1 addition & 0 deletions metsrw/di.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# -*- coding: utf-8 -*-
"""Dependency Injection logic for metsrw.
Here a global singleton feature broker is instantiated. By providing features
Expand Down
1 change: 1 addition & 0 deletions metsrw/exceptions.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# -*- coding: utf-8 -*-
"""
Exceptions for metsrw.
Expand Down
4 changes: 3 additions & 1 deletion metsrw/fsentry.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import

from itertools import chain
Expand Down Expand Up @@ -336,7 +337,8 @@ def serialize_filesec(self):
if self.path:
flocat = etree.SubElement(el, utils.lxmlns('mets') + 'FLocat')
# Setting manually so order is correct
flocat.set(utils.lxmlns('xlink') + 'href', self.path)
flocat.set(
utils.lxmlns('xlink') + 'href', utils.urlencode(self.path))
flocat.set('LOCTYPE', 'OTHER')
flocat.set('OTHERLOCTYPE', 'SYSTEM')
for transform_file in self.transform_files:
Expand Down
4 changes: 3 additions & 1 deletion metsrw/metadata.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# -*- coding: utf-8 -*-
"""
Classes for metadata sections of the METS. Include amdSec, dmdSec, techMD, rightsMD, sourceMD, digiprovMD, mdRef and mdWrap.
"""
Expand Down Expand Up @@ -276,7 +277,8 @@ def serialize(self):
if self.label:
el.attrib['LABEL'] = self.label
if self.target:
el.attrib[utils.lxmlns('xlink') + 'href'] = self.target
el.attrib[utils.lxmlns('xlink') + 'href'] = \
utils.urlencode(self.target)
el.attrib['MDTYPE'] = self.mdtype
el.attrib['LOCTYPE'] = self.loctype
if self.otherloctype:
Expand Down
1 change: 1 addition & 0 deletions metsrw/mets.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import

from builtins import object
Expand Down
26 changes: 24 additions & 2 deletions metsrw/utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,20 @@
# LXML HELPERS
# -*- coding: utf-8 -*-

# Manage complicated imports between Python 2 and 3 here. These libraries
# overlap somewhat and so that needs managing until we switch to Py3.
try:
import urlparse
from urllib import quote_plus
except ImportError:
import urllib.parse as urlparse
from urllib.parse import quote_plus


import logging
LOGGER = logging.getLogger(__name__)


# LXML HELPERS
NAMESPACES = {
"xsi": "http://www.w3.org/2001/XMLSchema-instance",
"mets": "http://www.loc.gov/METS/",
Expand All @@ -20,6 +35,13 @@ def lxmlns(arg):


# CONSTANTS

FILE_ID_PREFIX = 'file-'
GROUP_ID_PREFIX = 'Group-'


# URL helpers
def urlencode(url):
parsed = urlparse.urlparse(url)
for attr in ('path', 'params', 'query', 'fragment'):
parsed = parsed._replace(**{attr: quote_plus(getattr(parsed, attr), safe='/')})
return urlparse.urlunparse(parsed)
1 change: 1 addition & 0 deletions metsrw/validate.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# -*- coding: utf-8 -*-
import os

from lxml import etree, isoschematron
Expand Down
76 changes: 44 additions & 32 deletions tests/test_fsentry.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,24 +10,30 @@ class TestFSEntry(TestCase):

def test_create_invalid_checksum_type(self):
""" It should only accept METS valid checksum types. """
metsrw.FSEntry('file1.txt', checksumtype='Adler-32', checksum='dummy')
metsrw.FSEntry('file1.txt', checksumtype='CRC32', checksum='dummy')
metsrw.FSEntry('file1.txt', checksumtype='HAVAL', checksum='dummy')
metsrw.FSEntry('file1.txt', checksumtype='MD5', checksum='daa05c683a4913b268653f7a7e36a5b4')
metsrw.FSEntry('file1.txt', checksumtype='MNP', checksum='dummy')
metsrw.FSEntry('file1.txt', checksumtype='SHA-1', checksum='dummy')
metsrw.FSEntry('file1.txt', checksumtype='SHA-256', checksum='dummy')
metsrw.FSEntry('file1.txt', checksumtype='SHA-384', checksum='dummy')
metsrw.FSEntry('file1.txt', checksumtype='SHA-512', checksum='dummy')
metsrw.FSEntry('file1.txt', checksumtype='TIGER WHIRLPOOL', checksum='dummy')
metsrw.FSEntry(
'file[1].txt', checksumtype='Adler-32', checksum='dummy')
metsrw.FSEntry('file[1].txt', checksumtype='CRC32', checksum='dummy')
metsrw.FSEntry('file[1].txt', checksumtype='HAVAL', checksum='dummy')
metsrw.FSEntry(
'file[1].txt', checksumtype='MD5',
checksum='daa05c683a4913b268653f7a7e36a5b4')
metsrw.FSEntry('file[1].txt', checksumtype='MNP', checksum='dummy')
metsrw.FSEntry('file[1].txt', checksumtype='SHA-1', checksum='dummy')
metsrw.FSEntry('file[1].txt', checksumtype='SHA-256', checksum='dummy')
metsrw.FSEntry('file[1].txt', checksumtype='SHA-384', checksum='dummy')
metsrw.FSEntry('file[1].txt', checksumtype='SHA-512', checksum='dummy')
metsrw.FSEntry(
'file[1].txt', checksumtype='TIGER WHIRLPOOL',
checksum='dummy')
with pytest.raises(ValueError):
metsrw.FSEntry('file1.txt', checksumtype='DNE', checksum='dummy')
metsrw.FSEntry('file[1].txt', checksumtype='DNE', checksum='dummy')

def test_create_checksum_and_checksumtype(self):
with pytest.raises(ValueError):
metsrw.FSEntry('file1.txt', checksum='daa05c683a4913b268653f7a7e36a5b4')
metsrw.FSEntry(
'file[1].txt', checksum='daa05c683a4913b268653f7a7e36a5b4')
with pytest.raises(ValueError):
metsrw.FSEntry('file1.txt', checksumtype='MD5')
metsrw.FSEntry('file[1].txt', checksumtype='MD5')

def test_file_id_directory(self):
""" It should have no file ID. """
Expand Down Expand Up @@ -61,13 +67,14 @@ def test_group_id_derived(self):
""" It should return the group ID for the derived from file. """
file_uuid = str(uuid.uuid4())
f = metsrw.FSEntry('level1.txt', file_uuid=file_uuid)
derived = metsrw.FSEntry('level3.txt', file_uuid=str(uuid.uuid4()), derived_from=f)
derived = metsrw.FSEntry(
'level3.txt', file_uuid=str(uuid.uuid4()), derived_from=f)
assert derived.group_id() == 'Group-' + file_uuid
assert derived.group_id() == f.group_id()

def test_admids(self):
""" It should return 0 or 1 amdSecs. """
f = metsrw.FSEntry('file1.txt', file_uuid=str(uuid.uuid4()))
f = metsrw.FSEntry('file[1].txt', file_uuid=str(uuid.uuid4()))
assert len(f.admids) == 0
f.add_premis_object('<premis>object</premis>')
assert len(f.admids) == 1
Expand All @@ -77,20 +84,21 @@ def test_admids(self):

def test_dmdids(self):
""" It should return a DMDID for each dmdSec. """
f = metsrw.FSEntry('file1.txt', file_uuid=str(uuid.uuid4()))
f = metsrw.FSEntry('file[1].txt', file_uuid=str(uuid.uuid4()))
assert len(f.dmdids) == 0
f.add_dublin_core('<dc />')
assert len(f.dmdids) == 1

def test_add_metadata_to_fsentry(self):
f1 = metsrw.FSEntry('file1.txt', file_uuid=str(uuid.uuid4()))
f1 = metsrw.FSEntry('file[1].txt', file_uuid=str(uuid.uuid4()))
f1.add_dublin_core('<dc />')
assert f1.dmdsecs
assert len(f1.dmdsecs) == 1
assert f1.dmdsecs[0].subsection == 'dmdSec'
assert f1.dmdsecs[0].contents.mdtype == 'DC'

# Can only have 1 amdSec, so subsequent subsections are children of AMDSec
# Can only have 1 amdSec, so subsequent subsections are children of
# AMDSec
f1.add_premis_object('<premis>object</premis>')
assert f1.amdsecs
assert f1.amdsecs[0].subsections
Expand Down Expand Up @@ -118,7 +126,7 @@ def test_add_child(self):
It should handle duplicates.
"""
d = metsrw.FSEntry('dir', type='Directory')
f = metsrw.FSEntry('file1.txt', file_uuid=str(uuid.uuid4()))
f = metsrw.FSEntry('file[1].txt', file_uuid=str(uuid.uuid4()))

d.add_child(f)
assert f in d.children
Expand All @@ -139,7 +147,7 @@ def test_remove_child(self):
It should remove the parent from the child's parent link.
"""
d = metsrw.FSEntry('dir', type='Directory')
f1 = metsrw.FSEntry('file1.txt', file_uuid=str(uuid.uuid4()))
f1 = metsrw.FSEntry('file[1].txt', file_uuid=str(uuid.uuid4()))
f2 = metsrw.FSEntry('file2.txt', file_uuid=str(uuid.uuid4()))
d.add_child(f1)
d.add_child(f2)
Expand All @@ -161,7 +169,7 @@ def test_serialize_filesec_basic(self):
It should have a child mets:FLocat element with the path.
"""
f = metsrw.FSEntry(
'file1.txt',
'file[1].txt',
file_uuid=str(uuid.uuid4()),
checksumtype='MD5',
checksum='daa05c683a4913b268653f7a7e36a5b4')
Expand All @@ -175,7 +183,8 @@ def test_serialize_filesec_basic(self):
assert el[0].tag == '{http://www.loc.gov/METS/}FLocat'
assert el[0].attrib['LOCTYPE'] == 'OTHER'
assert el[0].attrib['OTHERLOCTYPE'] == 'SYSTEM'
assert el[0].attrib['{http://www.w3.org/1999/xlink}href'] == 'file1.txt'
assert el[0].attrib['{http://www.w3.org/1999/xlink}href'] == \
'file%5B1%5D.txt'

def test_serialize_filesec_metadata(self):
"""
Expand All @@ -184,7 +193,7 @@ def test_serialize_filesec_metadata(self):
It should have one ADMID.
It should have a child mets:FLocat element with the path.
"""
f = metsrw.FSEntry('file1.txt', file_uuid=str(uuid.uuid4()))
f = metsrw.FSEntry('file[1].txt', file_uuid=str(uuid.uuid4()))
f.add_premis_object('<premis>object</premis>')
el = f.serialize_filesec()
assert el.tag == '{http://www.loc.gov/METS/}file'
Expand All @@ -194,21 +203,24 @@ def test_serialize_filesec_metadata(self):
assert el[0].tag == '{http://www.loc.gov/METS/}FLocat'
assert el[0].attrib['LOCTYPE'] == 'OTHER'
assert el[0].attrib['OTHERLOCTYPE'] == 'SYSTEM'
assert el[0].attrib['{http://www.w3.org/1999/xlink}href'] == 'file1.txt'
assert el[0].attrib['{http://www.w3.org/1999/xlink}href'] == \
'file%5B1%5D.txt'

def test_serialize_filesec_not_item(self):
"""
It should not produce a mets:file element.
"""
f = metsrw.FSEntry('file1.txt', type='Directory', file_uuid=str(uuid.uuid4()))
f = metsrw.FSEntry(
'file[1].txt', type='Directory', file_uuid=str(uuid.uuid4()))
el = f.serialize_filesec()
assert el is None

def test_serialize_filesec_no_use(self):
"""
It should not produce a mets:file element.
"""
f = metsrw.FSEntry('file1.txt', use=None, file_uuid=str(uuid.uuid4()))
f = metsrw.FSEntry(
'file[1].txt', use=None, file_uuid=str(uuid.uuid4()))
el = f.serialize_filesec()
assert el is None

Expand All @@ -232,12 +244,12 @@ def test_serialize_structmap_file(self):
It should have a TYPE and LABEL.
It should have a child mets:fptr element with FILEID.
"""
f = metsrw.FSEntry('file1.txt', file_uuid=str(uuid.uuid4()))
f = metsrw.FSEntry('file[1].txt', file_uuid=str(uuid.uuid4()))
f.add_dublin_core('<dc />')
el = f.serialize_structmap(recurse=False)
assert el.tag == '{http://www.loc.gov/METS/}div'
assert el.attrib['TYPE'] == 'Item'
assert el.attrib['LABEL'] == 'file1.txt'
assert el.attrib['LABEL'] == 'file[1].txt'
assert len(el.attrib['DMDID'].split()) == 1
assert len(el) == 1
assert el[0].tag == '{http://www.loc.gov/METS/}fptr'
Expand All @@ -250,7 +262,7 @@ def test_serialize_structmap_no_recurse(self):
It should not have children.
"""
d = metsrw.FSEntry('dir', type='Directory')
f = metsrw.FSEntry('file1.txt', file_uuid=str(uuid.uuid4()))
f = metsrw.FSEntry('file[1].txt', file_uuid=str(uuid.uuid4()))
d.add_child(f)
el = d.serialize_structmap(recurse=False)
assert el.tag == '{http://www.loc.gov/METS/}div'
Expand All @@ -265,7 +277,7 @@ def test_serialize_structmap_recurse(self):
It should have a child mets:div with the file.
"""
d = metsrw.FSEntry('dir', type='Directory')
f = metsrw.FSEntry('file1.txt', file_uuid=str(uuid.uuid4()))
f = metsrw.FSEntry('file[1].txt', file_uuid=str(uuid.uuid4()))
d.add_child(f)
el = d.serialize_structmap(recurse=True)
assert el.tag == '{http://www.loc.gov/METS/}div'
Expand All @@ -274,7 +286,7 @@ def test_serialize_structmap_recurse(self):
assert len(el) == 1
assert el[0].tag == '{http://www.loc.gov/METS/}div'
assert el[0].attrib['TYPE'] == 'Item'
assert el[0].attrib['LABEL'] == 'file1.txt'
assert el[0].attrib['LABEL'] == 'file[1].txt'
assert len(el[0]) == 1
assert el[0][0].tag == '{http://www.loc.gov/METS/}fptr'
assert el[0][0].attrib['FILEID'].startswith('file-')
Expand Down Expand Up @@ -306,7 +318,7 @@ def test_is_empty_dir(self):
d1a = metsrw.FSEntry('dir', type='Directory')
d2a = metsrw.FSEntry('dir', type='Directory')
d2b = metsrw.FSEntry('dir', type='Directory')
f = metsrw.FSEntry('file1.txt', file_uuid=str(uuid.uuid4()))
f = metsrw.FSEntry('file[1].txt', file_uuid=str(uuid.uuid4()))
r.add_child(d1)
r.add_child(d2)
d1.add_child(d1a)
Expand Down

0 comments on commit 96ce254

Please sign in to comment.