artefactual-labs · sevein · Mar 8, 2019 · Jul 31, 2018 · Nov 30, 2018 · Dec 3, 2018
diff --git a/.travis.yml b/.travis.yml
@@ -1,6 +1,10 @@
 sudo: false
 language: python
 
+branches:
+  only:
+    - "master"
+
 install:
   - pip install tox coveralls
 

diff --git a/fixtures/mets_dir_with_fptrs.xml b/fixtures/mets_dir_with_fptrs.xml
diff --git a/metsrw/__init__.py b/metsrw/__init__.py
@@ -43,7 +43,7 @@
 
 LOGGER = logging.getLogger(__name__)
 LOGGER.addHandler(logging.NullHandler())
-__version__ = '0.2.4'
+__version__ = '0.3.0'
 
 __all__ = [
     'AMDSec',

diff --git a/metsrw/fsentry.py b/metsrw/fsentry.py
@@ -105,10 +105,16 @@ def __init__(self, path=None, label=None, use='original', type=u'Item',
         # path can validly be any encoding; if this value needs
         # to be spliced later on, it's better to treat it as a
         # bytestring than as actually being encoded text.
-        # TODO update this with six and bytes
-        if path:
-            path = str(path)
-        self.path = path
+        if six.PY2:
+            if isinstance(path, six.text_type):
+                self.path = path.encode('utf-8')
+            else:
+                self.path = path
+        else:  # TODO: Py3 is still using Unicode.
+            if isinstance(path, six.binary_type):
+                self.path = path.decode('utf-8', errors="strict")
+            else:
+                self.path = path
         if label is None and path is not None:
             label = os.path.basename(path)
         self.label = label
@@ -135,6 +141,20 @@ def __init__(self, path=None, label=None, use='original', type=u'Item',
         self.amdsecs = []
         self.dmdsecs = []
 
+    @classmethod
+    def dir(cls, label, children):
+        """Return a directory entry built through ``cls`` (subclass-safe)."""
+        return cls(
+            label=label, children=children, type=u"Directory", use=None)
+
+    @classmethod
+    def from_fptr(cls, label, type_, fptr):
+        """Return a ``cls`` entry built from the attributes of ``fptr``."""
+        return cls(
+            label=label, type=type_, path=fptr.path, use=fptr.use,
+            file_uuid=fptr.file_uuid, derived_from=fptr.derived_from,
+            checksum=fptr.checksum, checksumtype=fptr.checksumtype)
+
     def __str__(self):
         return '{s.type}: {s.path}'.format(s=self)
 

diff --git a/metsrw/mets.py b/metsrw/mets.py
@@ -330,14 +330,27 @@ def _parse_tree_structmap(self, tree, parent_elem,
                 continue  # Only handle divs, not fptrs
             entry_type = elem.get('TYPE')
             label = elem.get('LABEL')
-            fptr = self._analyze_fptr(elem, tree, entry_type)
-            children = self._parse_tree_structmap(
-                tree, elem, normative_parent_elem=normative_elem)
-            fs_entry = fsentry.FSEntry(
-                path=fptr.path, label=label, use=fptr.use, type=entry_type,
-                children=children, file_uuid=fptr.file_uuid,
-                derived_from=fptr.derived_from, checksum=fptr.checksum,
-                checksumtype=fptr.checksumtype)
+            fptr_elems = elem.findall('mets:fptr', namespaces=utils.NAMESPACES)
+            # Directories are walked recursively. Additionally, they may
+            # contain direct fptrs.
+            if entry_type.lower() == "directory":
+                children = self._parse_tree_structmap(
+                    tree, elem, normative_parent_elem=normative_elem)
+                fs_entry = fsentry.FSEntry.dir(label, children)
+                self._add_dmdsecs_to_fs_entry(elem, fs_entry, tree)
+                siblings.append(fs_entry)
+                for fptr_elem in fptr_elems:
+                    fptr = self._analyze_fptr(fptr_elem, tree, entry_type)
+                    fs_entry = fsentry.FSEntry.from_fptr(
+                        label=None, type_=u"Item", fptr=fptr)
+                    self._add_amdsecs_to_fs_entry(fptr.amdids, fs_entry, tree)
+                    siblings.append(fs_entry)
+                continue
+            # Other types, e.g.: items, aips...
+            if not len(fptr_elems):
+                continue
+            fptr = self._analyze_fptr(fptr_elems[0], tree, entry_type)
+            fs_entry = fsentry.FSEntry.from_fptr(label, entry_type, fptr)
             self._add_dmdsecs_to_fs_entry(elem, fs_entry, tree)
             self._add_amdsecs_to_fs_entry(fptr.amdids, fs_entry, tree)
             siblings.append(fs_entry)
@@ -369,48 +382,49 @@ def _get_el_to_normative(parent_elem, normative_parent_elem):
         return el_to_normative
 
     @staticmethod
-    def _analyze_fptr(elem, tree, entry_type):
-        fptr = elem.find('mets:fptr', namespaces=utils.NAMESPACES)
-        if fptr is None:
-            return FPtr(*[None] * 7)
-        else:
-            file_uuid = derived_from = use = path = amdids = checksum = \
-                checksumtype = None
-            file_id = fptr.get('FILEID')
-            file_elem = tree.find(
-                'mets:fileSec//mets:file[@ID="' + file_id + '"]',
-                namespaces=utils.NAMESPACES)
-            if file_elem is None:
-                raise exceptions.ParseError(
-                    '%s exists in structMap but not fileSec' % file_id)
-            use = file_elem.getparent().get('USE')
-            path = file_elem.find(
-                'mets:FLocat', namespaces=utils.NAMESPACES).get(
-                    utils.lxmlns('xlink') + 'href')
-            try:
-                path = utils.urldecode(path)
-            except ValueError:
-                raise exceptions.ParseError(
-                    'Value "{}" (of attribute xlink:href) is not a valid'
-                    ' URL.'.format(path))
-            amdids = file_elem.get('ADMID')
-            checksum = file_elem.get('CHECKSUM')
-            checksumtype = file_elem.get('CHECKSUMTYPE')
-            file_id_prefix = utils.FILE_ID_PREFIX
-            # If the file is an AIP, then its prefix is not "file-" but the
-            # name of the AIP. Therefore we need to get the extension-less
-            # basename of the AIP's path and remove its UUID suffix to ge
-            # the prefix to remove from the FILEID attribute value.
-            if entry_type.lower() == 'archival information package':
-                file_id_prefix = os.path.splitext(
-                    os.path.basename(path))[0][:-36]
-            file_uuid = file_id.replace(file_id_prefix, '', 1)
-            group_uuid = file_elem.get('GROUPID', '').replace(
-                utils.GROUP_ID_PREFIX, '', 1)
-            if group_uuid != file_uuid:
-                derived_from = group_uuid  # Use group_uuid as placeholder
-            return FPtr(file_uuid, derived_from, use, path, amdids,
-                        checksum, checksumtype)
+    def _analyze_fptr(fptr_elem, tree, entry_type):
+        file_uuid = derived_from = use = path = amdids = checksum = \
+            checksumtype = None
+        file_id = fptr_elem.get('FILEID')
+        file_elem = tree.find(
+            'mets:fileSec//mets:file[@ID="' + file_id + '"]',
+            namespaces=utils.NAMESPACES)
+        if file_elem is None:
+            raise exceptions.ParseError(
+                '%s exists in structMap but not fileSec' % file_id)
+        use = file_elem.getparent().get('USE')
+        path = file_elem.find(
+            'mets:FLocat', namespaces=utils.NAMESPACES).get(
+                utils.lxmlns('xlink') + 'href')
+        try:
+            path = utils.urldecode(path)
+        except ValueError:
+            raise exceptions.ParseError(
+                'Value "{}" (of attribute xlink:href) is not a valid'
+                ' URL.'.format(path))
+        amdids = file_elem.get('ADMID')
+        checksum = file_elem.get('CHECKSUM')
+        checksumtype = file_elem.get('CHECKSUMTYPE')
+        file_id_prefix = utils.FILE_ID_PREFIX
+        # If the file is an AIP, then its prefix is not "file-" but the
+        # name of the AIP. Therefore we need to get the extension-less
+        # basename of the AIP's path and remove its UUID suffix to get
+        # the prefix to remove from the FILEID attribute value.
+        if entry_type.lower() == 'archival information package':
+            file_id_prefix = os.path.splitext(os.path.basename(path))[0][:-36]
+        # If the file is part of a directory (with no intermediate item), then
+        # its prefix *may not* be "file-" but the name of the file. This
+        # pattern is found in old Archivematica METS files, e.g. see
+        # ``fixtures/mets_dir_with_fptrs.xml``.
+        elif entry_type.lower() == 'directory' and file_id[:5] != "file-":
+            file_id_prefix = os.path.basename(path) + "-"
+        file_uuid = file_id.replace(file_id_prefix, '', 1)
+        group_uuid = file_elem.get('GROUPID', '').replace(
+            utils.GROUP_ID_PREFIX, '', 1)
+        if group_uuid != file_uuid:
+            derived_from = group_uuid  # Use group_uuid as placeholder
+        return FPtr(file_uuid, derived_from, use, path, amdids,
+                    checksum, checksumtype)
 
     @staticmethod
     def _add_dmdsecs_to_fs_entry(elem, fs_entry, tree):
@@ -445,10 +459,15 @@ def _parse_tree(self, tree=None):
             tree = self.tree
         # self._validate()
         # Check CREATEDATE < now
-        createdate = self.tree.find('mets:metsHdr', namespaces=utils.NAMESPACES).get('CREATEDATE')
+        try:
+            createdate = tree.find(
+                'mets:metsHdr', namespaces=utils.NAMESPACES).get('CREATEDATE')
+        except AttributeError:  # e.g. no metsHdr element: find() gave None
+            createdate = None
         now = datetime.utcnow().isoformat('T')
         if createdate and createdate > now:
-            raise exceptions.ParseError('CREATEDATE more recent than now (%s)' % now)
+            raise exceptions.ParseError(
+                'CREATEDATE more recent than now (%s)' % now)
         self.createdate = createdate
 
         # Parse structMap

diff --git a/metsrw/plugins/premisrw/__init__.py b/metsrw/plugins/premisrw/__init__.py
@@ -18,6 +18,12 @@
 )
 from .utils import (
     XSI_NAMESPACE,
+    PREMIS_2_1_VERSION,
+    PREMIS_2_1_NAMESPACE,
+    PREMIS_2_1_XSD,
+    PREMIS_2_1_SCHEMA_LOCATION,
+    PREMIS_2_1_NAMESPACES,
+    PREMIS_2_1_META,
     PREMIS_2_2_VERSION,
     PREMIS_2_2_NAMESPACE,
     PREMIS_2_2_XSD,
@@ -48,6 +54,9 @@
 __all__ = ['PREMISElement', 'PREMISObject', 'PREMISEvent', 'PREMISAgent',
            'data_to_premis', 'premis_to_data', 'data_find', 'data_find_all',
            'data_find_text', 'data_find_text_or_all', 'XSI_NAMESPACE',
+           'PREMIS_2_1_VERSION', 'PREMIS_2_1_NAMESPACE', 'PREMIS_2_1_XSD',
+           'PREMIS_2_1_SCHEMA_LOCATION', 'PREMIS_2_1_NAMESPACES',
+           'PREMIS_2_1_META',
            'PREMIS_2_2_VERSION', 'PREMIS_2_2_NAMESPACE', 'PREMIS_2_2_XSD',
            'PREMIS_2_2_SCHEMA_LOCATION', 'PREMIS_2_2_NAMESPACES',
            'PREMIS_2_2_META', 'PREMIS_3_0_VERSION', 'PREMIS_3_0_NAMESPACE',

diff --git a/metsrw/plugins/premisrw/premis.py b/metsrw/plugins/premisrw/premis.py
@@ -690,12 +690,16 @@ def data_find_text(data, path):
     simplified XPath ``path``.
     """
     el = data_find(data, path)
-    if isinstance(el, (list, tuple)):
-        texts = [child for child in el[1:]
-                 if not isinstance(child, (tuple, list, dict))]
-        if texts:
-            return ' '.join([str(x) for x in texts])
-    return None
+    if not isinstance(el, (list, tuple)):
+        return None
+    texts = [child for child in el[1:]
+             if not isinstance(child, (tuple, list, dict))]
+    if not texts:
+        return None
+    if six.PY2:  # Only unicode needs encoding; str() handles the rest.
+        texts = [x.encode('utf-8', 'ignore')
+                 if isinstance(x, six.text_type) else x for x in texts]
+    return ' '.join([str(x) for x in texts])
 
 
 def data_find_text_or_all(data, path, dyn_cls=False):

diff --git a/metsrw/plugins/premisrw/utils.py b/metsrw/plugins/premisrw/utils.py
@@ -3,6 +3,21 @@
 
 XSI_NAMESPACE = 'http://www.w3.org/2001/XMLSchema-instance'
 
+# PREMIS v. 2.1 (shares the "premis-v2" XML namespace with 2.2; own XSD)
+PREMIS_2_1_VERSION = '2.1'
+PREMIS_2_1_NAMESPACE = 'info:lc/xmlns/premis-v2'
+PREMIS_2_1_XSD = 'http://www.loc.gov/standards/premis/v2/premis-v2-1.xsd'
+PREMIS_2_1_SCHEMA_LOCATION = '{} {}'.format(
+    PREMIS_2_1_NAMESPACE, PREMIS_2_1_XSD)
+PREMIS_2_1_NAMESPACES = {
+    'premis': PREMIS_2_1_NAMESPACE,
+    'xsi': XSI_NAMESPACE
+}
+PREMIS_2_1_META = {
+    'xsi:schema_location': PREMIS_2_1_SCHEMA_LOCATION,
+    'version': PREMIS_2_1_VERSION
+}
+
 # PREMIS v. 2.2
 PREMIS_2_2_VERSION = '2.2'
 PREMIS_2_2_NAMESPACE = 'info:lc/xmlns/premis-v2'
@@ -34,6 +49,10 @@
 }
 
 PREMIS_VERSIONS_MAP = {
+    PREMIS_2_1_VERSION: {
+        'namespaces': PREMIS_2_1_NAMESPACES,
+        'meta': PREMIS_2_1_META
+    },
     PREMIS_2_2_VERSION: {
         'namespaces': PREMIS_2_2_NAMESPACES,
         'meta': PREMIS_2_2_META

diff --git a/requirements/dev.txt b/requirements/dev.txt
@@ -1,4 +1,5 @@
 -r base.txt
+mock
 pytest
 pytest-cov
 sphinx>=1.3

diff --git a/tests/test_fsentry.py b/tests/test_fsentry.py
@@ -1,14 +1,25 @@
 # -*- coding: utf-8 -*-
+
 import pytest
 from unittest import TestCase
 import uuid
+import six
 
 import metsrw
 
 
 class TestFSEntry(TestCase):
     """ Test FSEntry class. """
 
+    @pytest.mark.skipif(six.PY3, reason="metsrw still uses Unicode in python3")
+    def test_path_is_binary(self):
+        """It should store the ``path`` as a bytestring."""
+        sample = u'💜🎑💜'
+        assert isinstance(metsrw.FSEntry(
+            sample, type='Directory').path, six.binary_type)
+        assert isinstance(metsrw.FSEntry(
+            sample.encode('utf-8'), type='Directory').path, six.binary_type)
+
     def test_create_invalid_checksum_type(self):
         """ It should only accept METS valid checksum types. """
         metsrw.FSEntry(

diff --git a/tests/test_metadata.py b/tests/test_metadata.py
@@ -52,8 +52,8 @@ def test_replacement_techmd(self):
         techmd_old = metsrw.SubSection('techMD', self.STUB_MDWRAP)
         techmd_new = metsrw.SubSection('techMD', self.STUB_MDWRAP)
         techmd_old.replace_with(techmd_new)
-        assert techmd_old.get_status() is 'superseded'
-        assert techmd_new.get_status() is 'current'
+        assert techmd_old.get_status() == 'superseded'
+        assert techmd_new.get_status() == 'current'
 
     def test_replacement_sourcemd(self):
         """ It should have no special behaviour replacing sourceMDs. """