Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Handle old Archivematica METS documents #53

Merged
merged 8 commits into from
Mar 8, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
4 changes: 4 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
sudo: false
language: python

branches:
only:
- "master"

install:
- pip install tox coveralls

Expand Down
1,309 changes: 1,309 additions & 0 deletions fixtures/mets_dir_with_fptrs.xml

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion metsrw/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@

LOGGER = logging.getLogger(__name__)
LOGGER.addHandler(logging.NullHandler())
__version__ = '0.2.4'
__version__ = '0.3.0'

__all__ = [
'AMDSec',
Expand Down
28 changes: 24 additions & 4 deletions metsrw/fsentry.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,10 +105,16 @@ def __init__(self, path=None, label=None, use='original', type=u'Item',
# path can validly be any encoding; if this value needs
# to be spliced later on, it's better to treat it as a
# bytestring than as actually being encoded text.
# TODO update this with six and bytes
if path:
path = str(path)
self.path = path
if six.PY2:
if isinstance(path, six.text_type):
self.path = path.encode('utf-8')
else:
self.path = path
else: # TODO: Py3 is still using Unicode.
if isinstance(path, six.binary_type):
self.path = path.decode('utf-8', errors="strict")
else:
self.path = path
if label is None and path is not None:
label = os.path.basename(path)
self.label = label
Expand All @@ -135,6 +141,20 @@ def __init__(self, path=None, label=None, use='original', type=u'Item',
self.amdsecs = []
self.dmdsecs = []

@classmethod
def dir(cls, label, children):
    """Return a directory entry built by ``cls``.

    :param label: label given to the directory entry.
    :param children: child entries placed under the directory.
    :returns: instance of ``cls`` created with ``type=u"Directory"`` and
        ``use=None``.
    """
    # Construct through ``cls`` instead of naming ``FSEntry`` so that a
    # subclass using this alternate constructor gets its own type back.
    return cls(
        label=label, children=children, type=u"Directory", use=None)

@classmethod
def from_fptr(cls, label, type_, fptr):
    """Return an entry built by ``cls`` from a resolved file pointer.

    :param label: label given to the entry.
    :param type_: value passed on as the entry's ``type``.
    :param fptr: object exposing ``path``, ``use``, ``file_uuid``,
        ``derived_from``, ``checksum`` and ``checksumtype`` attributes.
    :returns: instance of ``cls`` populated from ``fptr``.
    """
    # Construct through ``cls`` instead of naming ``FSEntry`` so that a
    # subclass using this alternate constructor gets its own type back.
    return cls(
        label=label, type=type_, path=fptr.path, use=fptr.use,
        file_uuid=fptr.file_uuid, derived_from=fptr.derived_from,
        checksum=fptr.checksum, checksumtype=fptr.checksumtype)

def __str__(self):
    """Return the entry rendered as its type, a colon and its path."""
    template = '{s.type}: {s.path}'
    return template.format(s=self)

Expand Down
123 changes: 71 additions & 52 deletions metsrw/mets.py
Original file line number Diff line number Diff line change
Expand Up @@ -330,14 +330,27 @@ def _parse_tree_structmap(self, tree, parent_elem,
continue # Only handle divs, not fptrs
entry_type = elem.get('TYPE')
label = elem.get('LABEL')
fptr = self._analyze_fptr(elem, tree, entry_type)
children = self._parse_tree_structmap(
tree, elem, normative_parent_elem=normative_elem)
fs_entry = fsentry.FSEntry(
path=fptr.path, label=label, use=fptr.use, type=entry_type,
children=children, file_uuid=fptr.file_uuid,
derived_from=fptr.derived_from, checksum=fptr.checksum,
checksumtype=fptr.checksumtype)
fptr_elems = elem.findall('mets:fptr', namespaces=utils.NAMESPACES)
# Directories are walked recursively. Additionally, they may
# contain direct fptrs.
if entry_type.lower() == "directory":
children = self._parse_tree_structmap(
tree, elem, normative_parent_elem=normative_elem)
fs_entry = fsentry.FSEntry.dir(label, children)
self._add_dmdsecs_to_fs_entry(elem, fs_entry, tree)
siblings.append(fs_entry)
for fptr_elem in fptr_elems:
fptr = self._analyze_fptr(fptr_elem, tree, entry_type)
fs_entry = fsentry.FSEntry.from_fptr(
label=None, type_=u"Item", fptr=fptr)
self._add_amdsecs_to_fs_entry(fptr.amdids, fs_entry, tree)
siblings.append(fs_entry)
continue
# Other types, e.g.: items, aips...
if not len(fptr_elems):
continue
fptr = self._analyze_fptr(fptr_elems[0], tree, entry_type)
fs_entry = fsentry.FSEntry.from_fptr(label, entry_type, fptr)
self._add_dmdsecs_to_fs_entry(elem, fs_entry, tree)
self._add_amdsecs_to_fs_entry(fptr.amdids, fs_entry, tree)
siblings.append(fs_entry)
Expand Down Expand Up @@ -369,48 +382,49 @@ def _get_el_to_normative(parent_elem, normative_parent_elem):
return el_to_normative

@staticmethod
def _analyze_fptr(elem, tree, entry_type):
# Earlier form of this helper, taking the structMap div ``elem`` itself:
# it resolves the div's first ``mets:fptr`` child against the fileSec of
# ``tree`` and returns the result as an ``FPtr`` tuple.
fptr = elem.find('mets:fptr', namespaces=utils.NAMESPACES)
if fptr is None:
# No fptr child: all seven FPtr fields are returned as None.
return FPtr(*[None] * 7)
else:
file_uuid = derived_from = use = path = amdids = checksum = \
checksumtype = None
file_id = fptr.get('FILEID')
# Find the fileSec ``mets:file`` whose ID matches the fptr's FILEID.
file_elem = tree.find(
'mets:fileSec//mets:file[@ID="' + file_id + '"]',
namespaces=utils.NAMESPACES)
if file_elem is None:
raise exceptions.ParseError(
'%s exists in structMap but not fileSec' % file_id)
# USE is read from the parent of the ``mets:file`` element.
use = file_elem.getparent().get('USE')
path = file_elem.find(
'mets:FLocat', namespaces=utils.NAMESPACES).get(
utils.lxmlns('xlink') + 'href')
try:
path = utils.urldecode(path)
except ValueError:
raise exceptions.ParseError(
'Value "{}" (of attribute xlink:href) is not a valid'
' URL.'.format(path))
amdids = file_elem.get('ADMID')
checksum = file_elem.get('CHECKSUM')
checksumtype = file_elem.get('CHECKSUMTYPE')
file_id_prefix = utils.FILE_ID_PREFIX
# If the file is an AIP, then its prefix is not "file-" but the
# name of the AIP. Therefore we need to get the extension-less
# basename of the AIP's path and remove its UUID suffix to get
# the prefix to remove from the FILEID attribute value.
if entry_type.lower() == 'archival information package':
file_id_prefix = os.path.splitext(
os.path.basename(path))[0][:-36]
# Only the first occurrence of the prefix is stripped from FILEID.
file_uuid = file_id.replace(file_id_prefix, '', 1)
group_uuid = file_elem.get('GROUPID', '').replace(
utils.GROUP_ID_PREFIX, '', 1)
if group_uuid != file_uuid:
derived_from = group_uuid # Use group_uuid as placeholder
return FPtr(file_uuid, derived_from, use, path, amdids,
checksum, checksumtype)
def _analyze_fptr(fptr_elem, tree, entry_type):
    """Build an ``FPtr`` tuple from a structMap ``mets:fptr`` element.

    The ``FILEID`` attribute of ``fptr_elem`` is looked up among the
    ``mets:file`` elements of the fileSec in ``tree``. Use, path, ADMID,
    checksum and checksum type are read from the match, and the file UUID
    is derived by stripping a prefix from the ``FILEID`` value.

    :param fptr_elem: ``mets:fptr`` element to resolve.
    :param tree: parsed METS tree searched for the fileSec entry.
    :param entry_type: ``TYPE`` of the structMap div owning the fptr; it
        selects which prefix is stripped from ``FILEID``. ``None`` is
        treated like an unrecognised type.
    :returns: ``FPtr(file_uuid, derived_from, use, path, amdids, checksum,
        checksumtype)``.
    :raises exceptions.ParseError: if the fptr has no ``FILEID``, if no
        ``mets:file`` carries that ID, if the file has no ``mets:FLocat``
        child, or if its ``xlink:href`` is not a valid URL.
    """
    file_id = fptr_elem.get('FILEID')
    if file_id is None:
        # Without this guard the XPath concatenation below fails with a
        # TypeError that says nothing about the document being malformed.
        raise exceptions.ParseError(
            'fptr element in structMap has no FILEID attribute')
    file_elem = tree.find(
        'mets:fileSec//mets:file[@ID="' + file_id + '"]',
        namespaces=utils.NAMESPACES)
    if file_elem is None:
        raise exceptions.ParseError(
            '%s exists in structMap but not fileSec' % file_id)
    use = file_elem.getparent().get('USE')
    flocat = file_elem.find('mets:FLocat', namespaces=utils.NAMESPACES)
    if flocat is None:
        # Report the malformed document instead of an AttributeError on
        # ``None.get``.
        raise exceptions.ParseError(
            '%s has no FLocat element in fileSec' % file_id)
    path = flocat.get(utils.lxmlns('xlink') + 'href')
    try:
        path = utils.urldecode(path)
    except ValueError:
        raise exceptions.ParseError(
            'Value "{}" (of attribute xlink:href) is not a valid'
            ' URL.'.format(path))
    amdids = file_elem.get('ADMID')
    checksum = file_elem.get('CHECKSUM')
    checksumtype = file_elem.get('CHECKSUMTYPE')

    kind = (entry_type or '').lower()
    file_id_prefix = utils.FILE_ID_PREFIX
    if kind == 'archival information package':
        # If the file is an AIP, then its prefix is not "file-" but the
        # name of the AIP. Therefore we need to get the extension-less
        # basename of the AIP's path and remove its UUID suffix to get
        # the prefix to remove from the FILEID attribute value.
        file_id_prefix = os.path.splitext(os.path.basename(path))[0][:-36]
    elif kind == 'directory' and not file_id.startswith(
            utils.FILE_ID_PREFIX):
        # If the file is part of a directory (with no intermediate item),
        # then its prefix *may not* be "file-" but the name of the file.
        # This pattern is found in old Archivematica METS files, e.g. see
        # ``fixtures/mets_dir_with_fptrs.xml``.
        file_id_prefix = os.path.basename(path) + "-"
    # Only the first occurrence of the prefix is stripped.
    file_uuid = file_id.replace(file_id_prefix, '', 1)

    derived_from = None
    group_uuid = file_elem.get('GROUPID', '').replace(
        utils.GROUP_ID_PREFIX, '', 1)
    if group_uuid != file_uuid:
        derived_from = group_uuid  # Use group_uuid as placeholder
    return FPtr(file_uuid, derived_from, use, path, amdids,
                checksum, checksumtype)

@staticmethod
def _add_dmdsecs_to_fs_entry(elem, fs_entry, tree):
Expand Down Expand Up @@ -445,10 +459,15 @@ def _parse_tree(self, tree=None):
tree = self.tree
# self._validate()
# Check CREATEDATE < now
createdate = self.tree.find('mets:metsHdr', namespaces=utils.NAMESPACES).get('CREATEDATE')
try:
createdate = self.tree.find(
'mets:metsHdr', namespaces=utils.NAMESPACES).get('CREATEDATE')
except AttributeError:
createdate = None
now = datetime.utcnow().isoformat('T')
if createdate and createdate > now:
raise exceptions.ParseError('CREATEDATE more recent than now (%s)' % now)
raise exceptions.ParseError(
'CREATEDATE more recent than now (%s)' % now)
self.createdate = createdate

# Parse structMap
Expand Down
9 changes: 9 additions & 0 deletions metsrw/plugins/premisrw/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,12 @@
)
from .utils import (
XSI_NAMESPACE,
PREMIS_2_1_VERSION,
PREMIS_2_1_NAMESPACE,
PREMIS_2_1_XSD,
PREMIS_2_1_SCHEMA_LOCATION,
PREMIS_2_1_NAMESPACES,
PREMIS_2_1_META,
PREMIS_2_2_VERSION,
PREMIS_2_2_NAMESPACE,
PREMIS_2_2_XSD,
Expand Down Expand Up @@ -48,6 +54,9 @@
__all__ = ['PREMISElement', 'PREMISObject', 'PREMISEvent', 'PREMISAgent',
'data_to_premis', 'premis_to_data', 'data_find', 'data_find_all',
'data_find_text', 'data_find_text_or_all', 'XSI_NAMESPACE',
'PREMIS_2_1_VERSION', 'PREMIS_2_1_NAMESPACE', 'PREMIS_2_1_XSD',
'PREMIS_2_1_SCHEMA_LOCATION', 'PREMIS_2_1_NAMESPACES',
'PREMIS_2_1_META',
'PREMIS_2_2_VERSION', 'PREMIS_2_2_NAMESPACE', 'PREMIS_2_2_XSD',
'PREMIS_2_2_SCHEMA_LOCATION', 'PREMIS_2_2_NAMESPACES',
'PREMIS_2_2_META', 'PREMIS_3_0_VERSION', 'PREMIS_3_0_NAMESPACE',
Expand Down
16 changes: 10 additions & 6 deletions metsrw/plugins/premisrw/premis.py
Original file line number Diff line number Diff line change
Expand Up @@ -690,12 +690,16 @@ def data_find_text(data, path):
simplified XPath ``path``.
"""
el = data_find(data, path)
if isinstance(el, (list, tuple)):
texts = [child for child in el[1:]
if not isinstance(child, (tuple, list, dict))]
if texts:
return ' '.join([str(x) for x in texts])
return None
if not isinstance(el, (list, tuple)):
return None
texts = [child for child in el[1:]
if not isinstance(child, (tuple, list, dict))]
if not texts:
return None
if six.PY2:
return ' '.join(
[x.encode('utf-8', errors='ignore') for x in texts])
return ' '.join([str(x) for x in texts])


def data_find_text_or_all(data, path, dyn_cls=False):
Expand Down
19 changes: 19 additions & 0 deletions metsrw/plugins/premisrw/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,21 @@

XSI_NAMESPACE = 'http://www.w3.org/2001/XMLSchema-instance'

# PREMIS v. 2.1
# The premis-v2-1.xsd schema declares the target namespace
# "info:lc/xmlns/premis-v2" (shared by the PREMIS 2.x schemas), so the
# namespace, the schema location and the namespace map all use it.
PREMIS_2_1_VERSION = '2.1'
PREMIS_2_1_NAMESPACE = 'info:lc/xmlns/premis-v2'
PREMIS_2_1_XSD = 'http://www.loc.gov/standards/premis/v2/premis-v2-1.xsd'
PREMIS_2_1_SCHEMA_LOCATION = '{} {}'.format(
    PREMIS_2_1_NAMESPACE, PREMIS_2_1_XSD)
PREMIS_2_1_NAMESPACES = {
    'premis': PREMIS_2_1_NAMESPACE,
    'xsi': XSI_NAMESPACE
}
PREMIS_2_1_META = {
    'xsi:schema_location': PREMIS_2_1_SCHEMA_LOCATION,
    'version': PREMIS_2_1_VERSION
}

# PREMIS v. 2.2
PREMIS_2_2_VERSION = '2.2'
PREMIS_2_2_NAMESPACE = 'info:lc/xmlns/premis-v2'
Expand Down Expand Up @@ -34,6 +49,10 @@
}

PREMIS_VERSIONS_MAP = {
PREMIS_2_1_VERSION: {
'namespaces': PREMIS_2_2_NAMESPACES,
'meta': PREMIS_2_1_META
},
PREMIS_2_2_VERSION: {
'namespaces': PREMIS_2_2_NAMESPACES,
'meta': PREMIS_2_2_META
Expand Down
1 change: 1 addition & 0 deletions requirements/dev.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
-r base.txt
mock
pytest
pytest-cov
sphinx>=1.3
Expand Down
11 changes: 11 additions & 0 deletions tests/test_fsentry.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,25 @@
# -*- coding: utf-8 -*-

import pytest
from unittest import TestCase
import uuid
import six

import metsrw


class TestFSEntry(TestCase):
""" Test FSEntry class. """

@pytest.mark.skipif(six.PY3, reason="metsrw still uses Unicode in python3")
def test_path_is_binary(self):
    """``path`` is kept as a bytestring for text and for bytes input."""
    text_path = u'💜🎑💜'
    # Same check for the text form first, then its UTF-8 encoded form.
    for candidate in (text_path, text_path.encode('utf-8')):
        entry = metsrw.FSEntry(candidate, type='Directory')
        assert isinstance(entry.path, six.binary_type)

def test_create_invalid_checksum_type(self):
""" It should only accept METS valid checksum types. """
metsrw.FSEntry(
Expand Down
4 changes: 2 additions & 2 deletions tests/test_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,8 @@ def test_replacement_techmd(self):
techmd_old = metsrw.SubSection('techMD', self.STUB_MDWRAP)
techmd_new = metsrw.SubSection('techMD', self.STUB_MDWRAP)
techmd_old.replace_with(techmd_new)
assert techmd_old.get_status() is 'superseded'
assert techmd_new.get_status() is 'current'
assert techmd_old.get_status() == 'superseded'
assert techmd_new.get_status() == 'current'

def test_replacement_sourcemd(self):
""" It should have no special behaviour replacing sourceMDs. """
Expand Down