Skip to content

Commit

Permalink
Merge a4f0edc into 3cf72aa
Browse files Browse the repository at this point in the history
  • Loading branch information
sevein committed Mar 8, 2019
2 parents 3cf72aa + a4f0edc commit 5855d73
Show file tree
Hide file tree
Showing 11 changed files with 1,506 additions and 62 deletions.
1,309 changes: 1,309 additions & 0 deletions fixtures/mets_dir_with_fptrs.xml

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion metsrw/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@

LOGGER = logging.getLogger(__name__)
LOGGER.addHandler(logging.NullHandler())
__version__ = '0.2.4'
__version__ = '0.3.0'

__all__ = [
'AMDSec',
Expand Down
28 changes: 24 additions & 4 deletions metsrw/fsentry.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,10 +105,16 @@ def __init__(self, path=None, label=None, use='original', type=u'Item',
# path can validly be any encoding; if this value needs
# to be spliced later on, it's better to treat it as a
# bytestring than as actually being encoded text.
# TODO update this with six and bytes
if path:
path = str(path)
self.path = path
if six.PY2:
if isinstance(path, six.text_type):
self.path = path.encode('utf-8')
else:
self.path = path
else: # TODO: Py3 is still using Unicode.
if isinstance(path, six.binary_type):
self.path = path.decode('utf-8', errors="strict")
else:
self.path = path
if label is None and path is not None:
label = os.path.basename(path)
self.label = label
Expand All @@ -135,6 +141,20 @@ def __init__(self, path=None, label=None, use='original', type=u'Item',
self.amdsecs = []
self.dmdsecs = []

@classmethod
def dir(cls, label, children):
    """Return a directory entry labelled ``label`` that wraps ``children``.

    The entry is built through ``cls`` (not a hard-coded class) so that
    subclasses calling this constructor receive instances of their own type.
    The entry is created with ``type=u"Directory"`` and ``use=None``.

    :param label: label given to the directory entry.
    :param children: child entries placed under the directory.
    :returns: a new instance of ``cls``.
    """
    return cls(
        label=label, children=children, type=u"Directory", use=None)

@classmethod
def from_fptr(cls, label, type_, fptr):
    """Return an entry populated from a parsed file pointer.

    The entry is built through ``cls`` so that subclasses receive instances
    of their own type.

    :param label: label given to the entry.
    :param type_: value passed on as the entry ``type``.
    :param fptr: object exposing ``path``, ``use``, ``file_uuid``,
        ``derived_from``, ``checksum`` and ``checksumtype`` attributes.
    :returns: a new instance of ``cls``.
    """
    return cls(
        label=label, type=type_, path=fptr.path, use=fptr.use,
        file_uuid=fptr.file_uuid, derived_from=fptr.derived_from,
        checksum=fptr.checksum, checksumtype=fptr.checksumtype)

def __str__(self):
    """Render the entry as its type followed by its path."""
    template = '{s.type}: {s.path}'
    return template.format(s=self)

Expand Down
123 changes: 71 additions & 52 deletions metsrw/mets.py
Original file line number Diff line number Diff line change
Expand Up @@ -330,14 +330,27 @@ def _parse_tree_structmap(self, tree, parent_elem,
continue # Only handle divs, not fptrs
entry_type = elem.get('TYPE')
label = elem.get('LABEL')
fptr = self._analyze_fptr(elem, tree, entry_type)
children = self._parse_tree_structmap(
tree, elem, normative_parent_elem=normative_elem)
fs_entry = fsentry.FSEntry(
path=fptr.path, label=label, use=fptr.use, type=entry_type,
children=children, file_uuid=fptr.file_uuid,
derived_from=fptr.derived_from, checksum=fptr.checksum,
checksumtype=fptr.checksumtype)
fptr_elems = elem.findall('mets:fptr', namespaces=utils.NAMESPACES)
# Directories are walked recursively. Additionally, they may
# contain direct fptrs.
if entry_type.lower() == "directory":
children = self._parse_tree_structmap(
tree, elem, normative_parent_elem=normative_elem)
fs_entry = fsentry.FSEntry.dir(label, children)
self._add_dmdsecs_to_fs_entry(elem, fs_entry, tree)
siblings.append(fs_entry)
for fptr_elem in fptr_elems:
fptr = self._analyze_fptr(fptr_elem, tree, entry_type)
fs_entry = fsentry.FSEntry.from_fptr(
label=None, type_=u"Item", fptr=fptr)
self._add_amdsecs_to_fs_entry(fptr.amdids, fs_entry, tree)
siblings.append(fs_entry)
continue
# Other types, e.g.: items, aips...
if not len(fptr_elems):
continue
fptr = self._analyze_fptr(fptr_elems[0], tree, entry_type)
fs_entry = fsentry.FSEntry.from_fptr(label, entry_type, fptr)
self._add_dmdsecs_to_fs_entry(elem, fs_entry, tree)
self._add_amdsecs_to_fs_entry(fptr.amdids, fs_entry, tree)
siblings.append(fs_entry)
Expand Down Expand Up @@ -369,48 +382,49 @@ def _get_el_to_normative(parent_elem, normative_parent_elem):
return el_to_normative

@staticmethod
def _analyze_fptr(elem, tree, entry_type):
fptr = elem.find('mets:fptr', namespaces=utils.NAMESPACES)
if fptr is None:
return FPtr(*[None] * 7)
else:
file_uuid = derived_from = use = path = amdids = checksum = \
checksumtype = None
file_id = fptr.get('FILEID')
file_elem = tree.find(
'mets:fileSec//mets:file[@ID="' + file_id + '"]',
namespaces=utils.NAMESPACES)
if file_elem is None:
raise exceptions.ParseError(
'%s exists in structMap but not fileSec' % file_id)
use = file_elem.getparent().get('USE')
path = file_elem.find(
'mets:FLocat', namespaces=utils.NAMESPACES).get(
utils.lxmlns('xlink') + 'href')
try:
path = utils.urldecode(path)
except ValueError:
raise exceptions.ParseError(
'Value "{}" (of attribute xlink:href) is not a valid'
' URL.'.format(path))
amdids = file_elem.get('ADMID')
checksum = file_elem.get('CHECKSUM')
checksumtype = file_elem.get('CHECKSUMTYPE')
file_id_prefix = utils.FILE_ID_PREFIX
# If the file is an AIP, then its prefix is not "file-" but the
# name of the AIP. Therefore we need to get the extension-less
# basename of the AIP's path and remove its UUID suffix to get
# the prefix to remove from the FILEID attribute value.
if entry_type.lower() == 'archival information package':
file_id_prefix = os.path.splitext(
os.path.basename(path))[0][:-36]
file_uuid = file_id.replace(file_id_prefix, '', 1)
group_uuid = file_elem.get('GROUPID', '').replace(
utils.GROUP_ID_PREFIX, '', 1)
if group_uuid != file_uuid:
derived_from = group_uuid # Use group_uuid as placeholder
return FPtr(file_uuid, derived_from, use, path, amdids,
checksum, checksumtype)
def _analyze_fptr(fptr_elem, tree, entry_type):
    """Resolve a structMap ``fptr`` element into an ``FPtr`` tuple.

    The ``FILEID`` of ``fptr_elem`` is looked up in the fileSec of ``tree``;
    the matching ``mets:file`` element supplies the use, path, ADMID and
    checksum details.

    :param fptr_elem: ``mets:fptr`` element taken from the structMap.
    :param tree: parsed METS document that contains the fileSec.
    :param entry_type: ``TYPE`` of the div owning the fptr; it decides which
        prefix is stripped from ``FILEID`` to obtain the file UUID.
    :returns: ``FPtr(file_uuid, derived_from, use, path, amdids, checksum,
        checksumtype)``.
    :raises exceptions.ParseError: if the fptr has no ``FILEID``, the file is
        not listed in the fileSec, the file has no ``FLocat`` child, or its
        ``xlink:href`` is not a valid URL.
    """
    file_id = fptr_elem.get('FILEID')
    if file_id is None:
        # Without this guard the XPath concatenation below fails with an
        # unhelpful TypeError.
        raise exceptions.ParseError('fptr element has no FILEID attribute')
    file_elem = tree.find(
        'mets:fileSec//mets:file[@ID="' + file_id + '"]',
        namespaces=utils.NAMESPACES)
    if file_elem is None:
        raise exceptions.ParseError(
            '%s exists in structMap but not fileSec' % file_id)
    use = file_elem.getparent().get('USE')
    flocat = file_elem.find('mets:FLocat', namespaces=utils.NAMESPACES)
    if flocat is None:
        # Report malformed documents as parse errors, not AttributeError.
        raise exceptions.ParseError(
            '%s has no FLocat element in fileSec' % file_id)
    path = flocat.get(utils.lxmlns('xlink') + 'href')
    try:
        path = utils.urldecode(path)
    except ValueError:
        raise exceptions.ParseError(
            'Value "{}" (of attribute xlink:href) is not a valid'
            ' URL.'.format(path))
    amdids = file_elem.get('ADMID')
    checksum = file_elem.get('CHECKSUM')
    checksumtype = file_elem.get('CHECKSUMTYPE')
    # Tolerate a missing TYPE attribute: treat it as "no special handling".
    entry_kind = (entry_type or '').lower()
    file_id_prefix = utils.FILE_ID_PREFIX
    if entry_kind == 'archival information package':
        # If the file is an AIP, then its prefix is not "file-" but the
        # name of the AIP. Therefore we need to get the extension-less
        # basename of the AIP's path and remove its UUID suffix (the
        # trailing 36 characters) to get the prefix to remove from the
        # FILEID attribute value.
        file_id_prefix = os.path.splitext(os.path.basename(path))[0][:-36]
    elif (entry_kind == 'directory'
            and not file_id.startswith(utils.FILE_ID_PREFIX)):
        # If the file is part of a directory (with no intermediate item),
        # then its prefix *may not* be "file-" but the name of the file.
        # This pattern is found in old Archivematica METS files, e.g. see
        # ``fixtures/mets_dir_with_fptrs.xml``.
        file_id_prefix = os.path.basename(path) + "-"
    file_uuid = file_id.replace(file_id_prefix, '', 1)
    group_uuid = file_elem.get('GROUPID', '').replace(
        utils.GROUP_ID_PREFIX, '', 1)
    # Use group_uuid as a placeholder for the file this one derives from.
    derived_from = group_uuid if group_uuid != file_uuid else None
    return FPtr(file_uuid, derived_from, use, path, amdids,
                checksum, checksumtype)

@staticmethod
def _add_dmdsecs_to_fs_entry(elem, fs_entry, tree):
Expand Down Expand Up @@ -445,10 +459,15 @@ def _parse_tree(self, tree=None):
tree = self.tree
# self._validate()
# Check CREATEDATE < now
createdate = self.tree.find('mets:metsHdr', namespaces=utils.NAMESPACES).get('CREATEDATE')
try:
createdate = self.tree.find(
'mets:metsHdr', namespaces=utils.NAMESPACES).get('CREATEDATE')
except AttributeError:
createdate = None
now = datetime.utcnow().isoformat('T')
if createdate and createdate > now:
raise exceptions.ParseError('CREATEDATE more recent than now (%s)' % now)
raise exceptions.ParseError(
'CREATEDATE more recent than now (%s)' % now)
self.createdate = createdate

# Parse structMap
Expand Down
9 changes: 9 additions & 0 deletions metsrw/plugins/premisrw/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,12 @@
)
from .utils import (
XSI_NAMESPACE,
PREMIS_2_1_VERSION,
PREMIS_2_1_NAMESPACE,
PREMIS_2_1_XSD,
PREMIS_2_1_SCHEMA_LOCATION,
PREMIS_2_1_NAMESPACES,
PREMIS_2_1_META,
PREMIS_2_2_VERSION,
PREMIS_2_2_NAMESPACE,
PREMIS_2_2_XSD,
Expand Down Expand Up @@ -48,6 +54,9 @@
__all__ = ['PREMISElement', 'PREMISObject', 'PREMISEvent', 'PREMISAgent',
'data_to_premis', 'premis_to_data', 'data_find', 'data_find_all',
'data_find_text', 'data_find_text_or_all', 'XSI_NAMESPACE',
'PREMIS_2_1_VERSION', 'PREMIS_2_1_NAMESPACE', 'PREMIS_2_1_XSD',
'PREMIS_2_1_SCHEMA_LOCATION', 'PREMIS_2_1_NAMESPACES',
'PREMIS_2_1_META',
'PREMIS_2_2_VERSION', 'PREMIS_2_2_NAMESPACE', 'PREMIS_2_2_XSD',
'PREMIS_2_2_SCHEMA_LOCATION', 'PREMIS_2_2_NAMESPACES',
'PREMIS_2_2_META', 'PREMIS_3_0_VERSION', 'PREMIS_3_0_NAMESPACE',
Expand Down
9 changes: 6 additions & 3 deletions metsrw/plugins/premisrw/premis.py
Original file line number Diff line number Diff line change
Expand Up @@ -693,9 +693,12 @@ def data_find_text(data, path):
if isinstance(el, (list, tuple)):
texts = [child for child in el[1:]
if not isinstance(child, (tuple, list, dict))]
if texts:
return ' '.join([str(x) for x in texts])
return None
if not texts:
return
if six.PY2:
return ' '.join(
[x.encode('utf-8', errors='ignore') for x in texts])
return ' '.join([str(x) for x in texts])


def data_find_text_or_all(data, path, dyn_cls=False):
Expand Down
19 changes: 19 additions & 0 deletions metsrw/plugins/premisrw/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,21 @@

XSI_NAMESPACE = 'http://www.w3.org/2001/XMLSchema-instance'

# PREMIS v. 2.1
# NOTE: the PREMIS 2.x schemas use the "premis-v2" target namespace; only the
# XSD location carries the minor version.
PREMIS_2_1_VERSION = '2.1'
PREMIS_2_1_NAMESPACE = 'info:lc/xmlns/premis-v2'
PREMIS_2_1_XSD = 'http://www.loc.gov/standards/premis/v2/premis-v2-1.xsd'
PREMIS_2_1_SCHEMA_LOCATION = '{} {}'.format(
    PREMIS_2_1_NAMESPACE, PREMIS_2_1_XSD)
PREMIS_2_1_NAMESPACES = {
    'premis': PREMIS_2_1_NAMESPACE,
    'xsi': XSI_NAMESPACE
}
PREMIS_2_1_META = {
    'xsi:schema_location': PREMIS_2_1_SCHEMA_LOCATION,
    'version': PREMIS_2_1_VERSION
}

# PREMIS v. 2.2
PREMIS_2_2_VERSION = '2.2'
PREMIS_2_2_NAMESPACE = 'info:lc/xmlns/premis-v2'
Expand Down Expand Up @@ -34,6 +49,10 @@
}

PREMIS_VERSIONS_MAP = {
PREMIS_2_1_VERSION: {
'namespaces': PREMIS_2_2_NAMESPACES,
'meta': PREMIS_2_1_META
},
PREMIS_2_2_VERSION: {
'namespaces': PREMIS_2_2_NAMESPACES,
'meta': PREMIS_2_2_META
Expand Down
1 change: 1 addition & 0 deletions requirements/dev.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
-r base.txt
mock
pytest
pytest-cov
sphinx>=1.3
Expand Down
11 changes: 11 additions & 0 deletions tests/test_fsentry.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,25 @@
# -*- coding: utf-8 -*-

import pytest
from unittest import TestCase
import uuid
import six

import metsrw


class TestFSEntry(TestCase):
""" Test FSEntry class. """

@pytest.mark.skipif(six.PY3, reason="metsrw still uses Unicode in python3")
def test_path_is_binary(self):
    """The stored ``path`` is a bytestring for both text and bytes input."""
    sample = u'💜🎑💜'
    for candidate in (sample, sample.encode('utf-8')):
        entry = metsrw.FSEntry(candidate, type='Directory')
        assert isinstance(entry.path, six.binary_type)

def test_create_invalid_checksum_type(self):
""" It should only accept METS valid checksum types. """
metsrw.FSEntry(
Expand Down
4 changes: 2 additions & 2 deletions tests/test_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,8 @@ def test_replacement_techmd(self):
techmd_old = metsrw.SubSection('techMD', self.STUB_MDWRAP)
techmd_new = metsrw.SubSection('techMD', self.STUB_MDWRAP)
techmd_old.replace_with(techmd_new)
assert techmd_old.get_status() is 'superseded'
assert techmd_new.get_status() is 'current'
assert techmd_old.get_status() == 'superseded'
assert techmd_new.get_status() == 'current'

def test_replacement_sourcemd(self):
""" It should have no special behaviour replacing sourceMDs. """
Expand Down
53 changes: 53 additions & 0 deletions tests/test_mets.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from lxml import etree
from lxml.builder import ElementMaker
import os
import mock
import pytest
from unittest import TestCase
import uuid
Expand Down Expand Up @@ -158,6 +159,46 @@ def test_fromfile_invalid_xlink_href(self):
metsrw.METSDocument.fromfile(
'fixtures/mets_invalid_xlink_hrefs.xml')

def test_analyze_fptr(self):
    """Exercise the error paths and the result of ``_analyze_fptr``."""
    xml_parser = etree.XMLParser(remove_blank_text=True)
    tree = etree.parse('fixtures/mets_dir_with_fptrs.xml', parser=xml_parser)
    mw = metsrw.METSDocument()
    analyze = metsrw.METSDocument._analyze_fptr

    # A FILEID that is absent from the fileSec must be rejected.
    unknown_fptr = etree.fromstring('<fptr FILEID="12345"/>')
    with pytest.raises(metsrw.exceptions.ParseError,
                       match='12345 exists in structMap but not fileSec'):
        analyze(unknown_fptr, tree, 'directory')

    # A path that cannot be URL-decoded must be rejected as well.
    known_fptr = etree.fromstring(
        '<fptr FILEID="AM68.csv-fc0e52ca-a688-41c0-a10b-c1d36e21e804"/>')
    with mock.patch('metsrw.utils.urldecode', side_effect=ValueError()):
        with pytest.raises(metsrw.exceptions.ParseError,
                           match='is not a valid URL'):
            analyze(known_fptr, tree, 'directory')

    # With the patch lifted, the same element resolves to a full ``FPtr``.
    expected = metsrw.mets.FPtr(
        file_uuid='fc0e52ca-a688-41c0-a10b-c1d36e21e804',
        derived_from=None, use='original', path='objects/AM68.csv',
        amdids='amdSec_3', checksum=None, checksumtype=None)
    assert mw._analyze_fptr(known_fptr, tree, 'directory') == expected

def test_analyze_fptr_from_aip(self):
    """An AIP pointer file yields the AIP UUID and the AIP use."""
    xml_parser = etree.XMLParser(remove_blank_text=True)
    tree = etree.parse(
        'fixtures/production-pointer-file.xml', parser=xml_parser)
    mw = metsrw.METSDocument()

    first_fptr = tree.find(
        '//mets:fptr[1]', namespaces=metsrw.utils.NAMESPACES)
    result = mw._analyze_fptr(
        first_fptr, tree, 'Archival Information Package')

    assert result.file_uuid == '7327b00f-d83a-4ae8-bb89-84fce994e827'
    assert result.use == 'Archival Information Package'


class TestWholeMETS(TestCase):
""" Test integration between classes. """
Expand Down Expand Up @@ -587,6 +628,18 @@ def test_parse_production_pointer_file(self):
aip_uuid = '7327b00f-d83a-4ae8-bb89-84fce994e827'
assert mw.get_file(file_uuid=aip_uuid)

def test_parse_dir_with_fptrs(self):
    """Files referenced directly by a directory div are parsed as items."""
    mw = metsrw.METSDocument.fromfile('fixtures/mets_dir_with_fptrs.xml')
    assert len(mw.all_files()) == 5
    assert mw.get_file(type='Directory', label='objects')

    expected_items = (
        ['3a6a182a-40a0-4c2b-9752-fc7e91ac1edf', 'objects/V00154.MPG'],
        ['431913ba-4379-4373-8798-cc5f2b9dd769', 'objects/V00158.MPG'],
        ['fc0e52ca-a688-41c0-a10b-c1d36e21e804', 'objects/AM68.csv'],
    )
    for file_uuid, path in expected_items:
        assert mw.get_file(type='Item', file_uuid=file_uuid, path=path)

# Helper methods

def assert_mets_valid(self, mets_doc, schematron=metsrw.AM_SCT_PATH):
Expand Down

0 comments on commit 5855d73

Please sign in to comment.