From 62955db5b2b181fc03a288f0948d285c025b6441 Mon Sep 17 00:00:00 2001 From: Joel Dunham Date: Tue, 31 Jul 2018 12:44:33 -0700 Subject: [PATCH 1/7] Accept missing mets:metsHdr --- metsrw/mets.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/metsrw/mets.py b/metsrw/mets.py index a927396..65b9696 100755 --- a/metsrw/mets.py +++ b/metsrw/mets.py @@ -445,10 +445,15 @@ def _parse_tree(self, tree=None): tree = self.tree # self._validate() # Check CREATEDATE < now - createdate = self.tree.find('mets:metsHdr', namespaces=utils.NAMESPACES).get('CREATEDATE') + try: + createdate = self.tree.find( + 'mets:metsHdr', namespaces=utils.NAMESPACES).get('CREATEDATE') + except AttributeError: + createdate = None now = datetime.utcnow().isoformat('T') if createdate and createdate > now: - raise exceptions.ParseError('CREATEDATE more recent than now (%s)' % now) + raise exceptions.ParseError( + 'CREATEDATE more recent than now (%s)' % now) self.createdate = createdate # Parse structMap From c6443dde7bf3a57d5583911ed19ec55347ec94cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jes=C3=BAs=20Garc=C3=ADa=20Crespo?= Date: Fri, 30 Nov 2018 11:23:01 -0800 Subject: [PATCH 2/7] Accept dirs with fptrs This commit updates `_parse_tree_structmap` so `FSEntry` objects are created also when the parser runs into direct `fptr` elements, e.g.:
--- fixtures/mets_dir_with_fptrs.xml | 1309 ++++++++++++++++++++++++++++++ metsrw/fsentry.py | 14 + metsrw/mets.py | 114 +-- requirements/dev.txt | 1 + tests/test_mets.py | 53 ++ 5 files changed, 1441 insertions(+), 50 deletions(-) create mode 100644 fixtures/mets_dir_with_fptrs.xml diff --git a/fixtures/mets_dir_with_fptrs.xml b/fixtures/mets_dir_with_fptrs.xml new file mode 100644 index 0000000..a4f3ffc --- /dev/null +++ b/fixtures/mets_dir_with_fptrs.xml @@ -0,0 +1,1309 @@ + + + + + + + + + UUID + 3a6a182a-40a0-4c2b-9752-fc7e91ac1edf + + + 0 + + sha256 + 7bab5874a44f22e9fb7240cb70e674d12ae4d61a96db15e7cf0755d55db33d81 + + 50547000 + + + MPEG-1 Video Format + + + + PRONOM + x-fmt/385 + + + + + MPEG-2 Video Format + + + + PRONOM + x-fmt/386 + + + + + + + + + + + x-fmt/385 + + + + x-fmt/386 + + + + + + + 2008:04:17 20:14:40-07:00 + /var/archivematica/sharedDirectory/currentlyProcessing/AM68-b02989ce-5f1a-4274-bfc6-3e7e61e9b8ed/objects/V00154.MPG + 50547000 + 0832c689a3b8a0c0919e6f012f92a78e + 1208488480000 + + + + + + + + + + BYTESTREAM + 2008-04-17T20:14:40-07:00 + 50547000 + bytestream + Well-Formed and valid + application/octet-stream + + + + + MPEG sequence, v2, program multiplex +application/octet-stream; charset=binary + application/octet-stream + MPEG sequence, v2, program multiplex + + + + + ExifToolVersion 7.74 +FileName V00154.MPG +Directory /var/archivematica/sharedDirectory/currentlyProcessing/AM68-b02989ce-5f1a-4274-bfc6-3e7e61e9b8ed/objects +FileSize 48 MB +FileModifyDate 2008:04:17 20:14:40-07:00 +FileType MPEG +MIMEType video/mpeg +ImageWidth 480 +ImageHeight 480 +AspectRatio 0.6735 +FrameRate 29.97 fps +VideoBitrate 2376000 +MPEGAudioVersion 1 +AudioLayer 2 +AudioBitrate 224000 +SampleRate 44100 +ChannelMode Stereo +ModeExtension Bands 4-31 +CopyrightFlag False +OriginalMedia False +Emphasis None +Duration 02:35 (approx) +ImageSize 480x480 + 7.74 + V00154.MPG + 
/var/archivematica/sharedDirectory/currentlyProcessing/AM68-b02989ce-5f1a-4274-bfc6-3e7e61e9b8ed/objects + 48 MB + 2008:04:17 20:14:40-07:00 + MPEG + video/mpeg + 480 + 480 + 0.6735 + 29.97 fps + 2376000 + 1 + 2 + 224000 + 44100 + Stereo + Bands 4-31 + False + False + None + 02:35 (approx) + 480x480 + + + + + 3.0 + 35 + 2012-01-17T12:17:11 + + /var/archivematica/sharedDirectory/currentlyProcessing/AM68-b02989ce-5f1a-4274-bfc6-3e7e61e9b8ed/objects/V00154.MPG + + Positive (Generic Format) + MPEG-1 Video Format + x-fmt/385 + video/mpeg + + + Positive (Generic Format) + MPEG-2 Video Format + x-fmt/386 + video/mpeg + + + + + + + + V00154.MPG + / + /var/archivematica/sharedDirectory/currentlyProcessing/AM68-b02989ce-5f1a-4274-bfc6-3e7e61e9b8ed/objects + /home/archivesuser/archivematica/src/MCPServer/sharedDirectoryStructure/currentlyProcessing/AM68-b02989ce-5f1a-4274-bfc6-3e7e61e9b8ed/objects/V00154.MPG + /var/archivematica/sharedDirectory/currentlyProcessing/AM68-b02989ce-5f1a-4274-bfc6-3e7e61e9b8ed/objects/V00154.MPG + /var/archivematica/sharedDirectory/currentlyProcessing/AM68-b02989ce-5f1a-4274-bfc6-3e7e61e9b8ed/objects/V00154.MPG + true + false + 50547000 + false + true + file:/var/archivematica/sharedDirectory/currentlyProcessing/AM68-b02989ce-5f1a-4274-bfc6-3e7e61e9b8ed/objects/V00154.MPG + file:/var/archivematica/sharedDirectory/currentlyProcessing/AM68-b02989ce-5f1a-4274-bfc6-3e7e61e9b8ed/objects/V00154.MPG + true + true + MPG + 2008-04-17 20:14:40 + 20080417 + yyyyMMdd + + HHmmssSSS + video/mpeg + null + null + null + unknown + + + + + + + /var/archivematica/sharedDirectory/currentlyProcessing/AM68-b02989ce-5f1a-4274-bfc6-3e7e61e9b8ed/objects/V00154.MPG + 50547000 + 0832c689a3b8a0c0919e6f012f92a78e + 1208488480000 + + + + + + + + + MPEG + MPEG video + video + + video/mpeg + + + mpg + mpeg + + + + + + + + %transferDirectory%objects/V00154.MPG + + + + + + + + + + UUID + 3b822067-8878-418b-881e-96303a6f60d3 + + ingestion + 2012-01-17T20:16:38 + + + + + + + + + 
preservation system + Archivematica-0.8 + + + repository code + ORG + + + + + + + + + + + UUID + 60f8651a-ddd0-4048-85a2-dcf4f828e3a5 + + message digest calculation + 2012-01-17T20:16:38 + program="python"; module="hashlib.sha256()" + + + + 7bab5874a44f22e9fb7240cb70e674d12ae4d61a96db15e7cf0755d55db33d81 + + + + preservation system + Archivematica-0.8 + + + repository code + ORG + + + + + + + + + + + UUID + f9c5af55-e4a7-4e34-acf4-10a57e74cf70 + + virus check + 2012-01-17T20:16:58 + program="Clam AV"; version="ClamAV 0.96.5"; virusDefinitions="14142/Mon Dec 19 09:17:31 2011 +" + + Pass + + + + + + preservation system + Archivematica-0.8 + + + repository code + ORG + + + + + + + + + + + UUID + 2744faaf-2e9b-4efe-aec2-4a1e28a652d4 + + format identification + 2012-01-17T20:16:58 + program="Droid"; version="3.0" + + Positive + + x-fmt/385 + + + + preservation system + Archivematica-0.8 + + + repository code + ORG + + + + + + + + + + + UUID + b828efeb-d285-496b-90e0-68e46213e4a0 + + format identification + 2012-01-17T20:16:58 + program="Droid"; version="3.0" + + Positive + + x-fmt/386 + + + + preservation system + Archivematica-0.8 + + + repository code + ORG + + + + + + + + + + + UUID + 111ff912-2590-4b66-ace9-24e58ea8be20 + + validation + 2012-01-17T20:16:58 + program="Jhove"; version="1.5" + + pass + + format="bytestream"; result="Well-Formed and valid" + + + + preservation system + Archivematica-0.8 + + + repository code + ORG + + + + + + + + + + + UUID + 165ea4af-269b-40ca-9fa1-a3e042daafba + + fixity check + 2012-01-17T20:20:16 + program="python"; module="hashlib.sha256()" + + Pass + + 7bab5874a44f22e9fb7240cb70e674d12ae4d61a96db15e7cf0755d55db33d81verified + + + + preservation system + Archivematica-0.8 + + + repository code + ORG + + + + + + + + + + + preservation system + Archivematica-0.8 + + Archivematica + software + + + + + + + + + + repository code + ORG + + Your Organization Name Here + organization + + + + + + + + + + + + UUID + 
431913ba-4379-4373-8798-cc5f2b9dd769 + + + 0 + + sha256 + 7c91d344e65c549b2fcba47fb28adaaec3fa45354c5d2714e7a36e97ac72bf3f + + 62648324 + + + MPEG-1 Video Format + + + + PRONOM + x-fmt/385 + + + + + MPEG-2 Video Format + + + + PRONOM + x-fmt/386 + + + + + + + + + + + x-fmt/385 + + + + x-fmt/386 + + + + + + + 2009:03:04 16:21:16-08:00 + /var/archivematica/sharedDirectory/currentlyProcessing/AM68-b02989ce-5f1a-4274-bfc6-3e7e61e9b8ed/objects/V00158.MPG + 62648324 + 0b6b6d0b09528e69004d0d2db01685e2 + 1236212476000 + + + + + + + + + + BYTESTREAM + 2009-03-04T16:21:16-08:00 + 62648324 + bytestream + Well-Formed and valid + application/octet-stream + + + + + MPEG sequence, v2, program multiplex +application/octet-stream; charset=binary + application/octet-stream + MPEG sequence, v2, program multiplex + + + + + ExifToolVersion 7.74 +FileName V00158.MPG +Directory /var/archivematica/sharedDirectory/currentlyProcessing/AM68-b02989ce-5f1a-4274-bfc6-3e7e61e9b8ed/objects +FileSize 60 MB +FileModifyDate 2009:03:04 16:21:16-08:00 +FileType MPEG +MIMEType video/mpeg +ImageWidth 640 +ImageHeight 480 +AspectRatio 0.6735 +FrameRate 29.97 fps +VideoBitrate 6000000 +MPEGAudioVersion 1 +AudioLayer 2 +AudioBitrate 224000 +SampleRate 48000 +ChannelMode Stereo +ModeExtension Bands 4-31 +CopyrightFlag False +OriginalMedia False +Emphasis None +Duration 01:20 (approx) +ImageSize 640x480 + 7.74 + V00158.MPG + /var/archivematica/sharedDirectory/currentlyProcessing/AM68-b02989ce-5f1a-4274-bfc6-3e7e61e9b8ed/objects + 60 MB + 2009:03:04 16:21:16-08:00 + MPEG + video/mpeg + 640 + 480 + 0.6735 + 29.97 fps + 6000000 + 1 + 2 + 224000 + 48000 + Stereo + Bands 4-31 + False + False + None + 01:20 (approx) + 640x480 + + + + + 3.0 + 35 + 2012-01-17T12:17:11 + + /var/archivematica/sharedDirectory/currentlyProcessing/AM68-b02989ce-5f1a-4274-bfc6-3e7e61e9b8ed/objects/V00158.MPG + + Positive (Generic Format) + MPEG-1 Video Format + x-fmt/385 + video/mpeg + + + Positive (Generic Format) + MPEG-2 Video Format + 
x-fmt/386 + video/mpeg + + + + + + + + V00158.MPG + / + /var/archivematica/sharedDirectory/currentlyProcessing/AM68-b02989ce-5f1a-4274-bfc6-3e7e61e9b8ed/objects + /home/archivesuser/archivematica/src/MCPServer/sharedDirectoryStructure/currentlyProcessing/AM68-b02989ce-5f1a-4274-bfc6-3e7e61e9b8ed/objects/V00158.MPG + /var/archivematica/sharedDirectory/currentlyProcessing/AM68-b02989ce-5f1a-4274-bfc6-3e7e61e9b8ed/objects/V00158.MPG + /var/archivematica/sharedDirectory/currentlyProcessing/AM68-b02989ce-5f1a-4274-bfc6-3e7e61e9b8ed/objects/V00158.MPG + true + false + 62648324 + false + true + file:/var/archivematica/sharedDirectory/currentlyProcessing/AM68-b02989ce-5f1a-4274-bfc6-3e7e61e9b8ed/objects/V00158.MPG + file:/var/archivematica/sharedDirectory/currentlyProcessing/AM68-b02989ce-5f1a-4274-bfc6-3e7e61e9b8ed/objects/V00158.MPG + true + true + MPG + 2009-03-04 16:21:16 + 20090304 + yyyyMMdd + + HHmmssSSS + video/mpeg + null + null + null + unknown + + + + + + + /var/archivematica/sharedDirectory/currentlyProcessing/AM68-b02989ce-5f1a-4274-bfc6-3e7e61e9b8ed/objects/V00158.MPG + 62648324 + 0b6b6d0b09528e69004d0d2db01685e2 + 1236212476000 + + + + + + + + + MPEG + MPEG video + video + + video/mpeg + + + mpg + mpeg + + + + + + + + %transferDirectory%objects/V00158.MPG + + + + + + + + + + UUID + df824ad3-3e35-4d90-be9c-f88d164814ce + + ingestion + 2012-01-17T20:16:54 + + + + + + + + + preservation system + Archivematica-0.8 + + + repository code + ORG + + + + + + + + + + + UUID + d8114ef7-5317-4351-bb3d-6d9081f977a9 + + message digest calculation + 2012-01-17T20:16:54 + program="python"; module="hashlib.sha256()" + + + + 7c91d344e65c549b2fcba47fb28adaaec3fa45354c5d2714e7a36e97ac72bf3f + + + + preservation system + Archivematica-0.8 + + + repository code + ORG + + + + + + + + + + + UUID + b455b0cc-8b44-4b9a-b7c6-f73f34c458e4 + + virus check + 2012-01-17T20:16:41 + program="Clam AV"; version="ClamAV 0.96.5"; virusDefinitions="14130/Fri Dec 16 17:32:22 2011 +" + + Pass + + + 
+ + + preservation system + Archivematica-0.8 + + + repository code + ORG + + + + + + + + + + + UUID + d67cdd67-1469-4242-a07a-abf6aa8ffdc7 + + format identification + 2012-01-17T20:16:58 + program="Droid"; version="3.0" + + Positive + + x-fmt/385 + + + + preservation system + Archivematica-0.8 + + + repository code + ORG + + + + + + + + + + + UUID + bdbf2da0-fa93-4a74-936c-65fc36aafe75 + + format identification + 2012-01-17T20:16:58 + program="Droid"; version="3.0" + + Positive + + x-fmt/386 + + + + preservation system + Archivematica-0.8 + + + repository code + ORG + + + + + + + + + + + UUID + fe7539de-9abf-4e92-b1e2-5e0f4292cf3e + + validation + 2012-01-17T20:16:58 + program="Jhove"; version="1.5" + + pass + + format="bytestream"; result="Well-Formed and valid" + + + + preservation system + Archivematica-0.8 + + + repository code + ORG + + + + + + + + + + + UUID + 6c710d31-ac3e-45e0-8bde-fe0477b9463c + + fixity check + 2012-01-17T20:20:00 + program="python"; module="hashlib.sha256()" + + Pass + + 7c91d344e65c549b2fcba47fb28adaaec3fa45354c5d2714e7a36e97ac72bf3fverified + + + + preservation system + Archivematica-0.8 + + + repository code + ORG + + + + + + + + + + + preservation system + Archivematica-0.8 + + Archivematica + software + + + + + + + + + + repository code + ORG + + Your Organization Name Here + organization + + + + + + + + + + + + UUID + fc0e52ca-a688-41c0-a10b-c1d36e21e804 + + + 0 + + sha256 + 0dff86ef77411a8f6bcaf453fab5f1c9c88bd4ed7da526827d982cf767eaacfb + + 38 + + + Comma Separated Values + + + + PRONOM + x-fmt/18 + + + + + + + + + + + + x-fmt/18 + + + + 38 + /var/archivematica/sharedDirectory/currentlyProcessing/AM68-b02989ce-5f1a-4274-bfc6-3e7e61e9b8ed/objects/AM68.csv + 3676bafad0d15883ffe4ef5a32c6c352 + 1324662903000 + + + true + true + + + + CR/LF + US-ASCII + + + + + + + + + ASCII-hul + 2011-12-23T09:55:03-08:00 + 38 + ASCII + Well-Formed and valid + text/plain; charset=US-ASCII + + + + + ASCII text, with CRLF line terminators +text/plain; 
charset=us-ascii + text/plain + Plain text + US-ASCII + CR/LF + + + + + Error: Unknown file type - /var/archivematica/sharedDirectory/currentlyProcessing/AM68-b02989ce-5f1a-4274-bfc6-3e7e61e9b8ed/objects/AM68.csv + + + + + 3.0 + 35 + 2012-01-17T12:16:50 + + /var/archivematica/sharedDirectory/currentlyProcessing/AM68-b02989ce-5f1a-4274-bfc6-3e7e61e9b8ed/objects/AM68.csv + + Tentative + Comma Separated Values + x-fmt/18 + text/csv + + + + + + + + AM68.csv + / + /var/archivematica/sharedDirectory/currentlyProcessing/AM68-b02989ce-5f1a-4274-bfc6-3e7e61e9b8ed/objects + /home/archivesuser/archivematica/src/MCPServer/sharedDirectoryStructure/currentlyProcessing/AM68-b02989ce-5f1a-4274-bfc6-3e7e61e9b8ed/objects/AM68.csv + /var/archivematica/sharedDirectory/currentlyProcessing/AM68-b02989ce-5f1a-4274-bfc6-3e7e61e9b8ed/objects/AM68.csv + /var/archivematica/sharedDirectory/currentlyProcessing/AM68-b02989ce-5f1a-4274-bfc6-3e7e61e9b8ed/objects/AM68.csv + true + false + 38 + false + true + file:/var/archivematica/sharedDirectory/currentlyProcessing/AM68-b02989ce-5f1a-4274-bfc6-3e7e61e9b8ed/objects/AM68.csv + file:/var/archivematica/sharedDirectory/currentlyProcessing/AM68-b02989ce-5f1a-4274-bfc6-3e7e61e9b8ed/objects/AM68.csv + true + true + csv + 2011-12-23 09:55:03 + 20111223 + yyyyMMdd + + HHmmssSSS + file/unknown + null + null + null + unknown + + + + + + + /var/archivematica/sharedDirectory/currentlyProcessing/AM68-b02989ce-5f1a-4274-bfc6-3e7e61e9b8ed/objects/AM68.csv + 38 + 3676bafad0d15883ffe4ef5a32c6c352 + 1324662903000 + + + + + + + + + + Unknown Binary + + + application/octet-stream + + + + + + + + + %transferDirectory%objects/AM68.csv + + + + + + + + + + UUID + 9d744150-2ea2-4995-97ad-ac9e6fcfa5d0 + + ingestion + 2012-01-17T20:16:54 + + + + + + + + + preservation system + Archivematica-0.8 + + + repository code + ORG + + + + + + + + + + + UUID + 8295cf03-44b6-4dbe-b1b1-b083c372ce28 + + message digest calculation + 2012-01-17T20:16:54 + program="python"; 
module="hashlib.sha256()" + + + + 0dff86ef77411a8f6bcaf453fab5f1c9c88bd4ed7da526827d982cf767eaacfb + + + + preservation system + Archivematica-0.8 + + + repository code + ORG + + + + + + + + + + + UUID + 2a5d15cb-9c1e-4ba9-9f31-c2e67d41f852 + + virus check + 2012-01-17T20:16:58 + program="Clam AV"; version="ClamAV 0.96.5"; virusDefinitions="14142/Mon Dec 19 09:17:31 2011 +" + + Pass + + + + + + preservation system + Archivematica-0.8 + + + repository code + ORG + + + + + + + + + + + UUID + 889ed287-e376-4ed8-bcf4-458cc8f699d7 + + format identification + 2012-01-17T20:16:42 + program="Droid"; version="3.0" + + Tentative + + x-fmt/18 + + + + preservation system + Archivematica-0.8 + + + repository code + ORG + + + + + + + + + + + UUID + 0bd4f32b-6d7c-49f6-b7e1-cbe624cc5450 + + validation + 2012-01-17T20:16:42 + program="Jhove"; version="1.5" + + pass + + format="ASCII"; result="Well-Formed and valid" + + + + preservation system + Archivematica-0.8 + + + repository code + ORG + + + + + + + + + + + UUID + c46c6d84-4ba2-4c0e-95f1-6ba05178f335 + + fixity check + 2012-01-17T20:20:00 + program="python"; module="hashlib.sha256()" + + Pass + + 0dff86ef77411a8f6bcaf453fab5f1c9c88bd4ed7da526827d982cf767eaacfbverified + + + + preservation system + Archivematica-0.8 + + + repository code + ORG + + + + + + + + + + + preservation system + Archivematica-0.8 + + Archivematica + software + + + + + + + + + + repository code + ORG + + Your Organization Name Here + organization + + + + + + + + + + + + + + + + + + + +
+
+ + + +
+
+
+
diff --git a/metsrw/fsentry.py b/metsrw/fsentry.py index 0103146..02c715d 100644 --- a/metsrw/fsentry.py +++ b/metsrw/fsentry.py @@ -135,6 +135,20 @@ def __init__(self, path=None, label=None, use='original', type=u'Item', self.amdsecs = [] self.dmdsecs = [] + @classmethod + def dir(cls, label, children): + """Return ``FSEntry`` directory object.""" + return FSEntry( + label=label, children=children, type=u"Directory", use=None) + + @classmethod + def from_fptr(cls, label, type_, fptr): + """Return ``FSEntry`` object.""" + return FSEntry( + label=label, type=type_, path=fptr.path, use=fptr.use, + file_uuid=fptr.file_uuid, derived_from=fptr.derived_from, + checksum=fptr.checksum, checksumtype=fptr.checksumtype) + def __str__(self): return '{s.type}: {s.path}'.format(s=self) diff --git a/metsrw/mets.py b/metsrw/mets.py index 65b9696..5eb776b 100755 --- a/metsrw/mets.py +++ b/metsrw/mets.py @@ -330,14 +330,27 @@ def _parse_tree_structmap(self, tree, parent_elem, continue # Only handle divs, not fptrs entry_type = elem.get('TYPE') label = elem.get('LABEL') - fptr = self._analyze_fptr(elem, tree, entry_type) - children = self._parse_tree_structmap( - tree, elem, normative_parent_elem=normative_elem) - fs_entry = fsentry.FSEntry( - path=fptr.path, label=label, use=fptr.use, type=entry_type, - children=children, file_uuid=fptr.file_uuid, - derived_from=fptr.derived_from, checksum=fptr.checksum, - checksumtype=fptr.checksumtype) + fptr_elems = elem.findall('mets:fptr', namespaces=utils.NAMESPACES) + # Directories are walked recursively. Additionally, they may + # contain direct fptrs. 
+ if entry_type.lower() == "directory": + children = self._parse_tree_structmap( + tree, elem, normative_parent_elem=normative_elem) + fs_entry = fsentry.FSEntry.dir(label, children) + self._add_dmdsecs_to_fs_entry(elem, fs_entry, tree) + siblings.append(fs_entry) + for fptr_elem in fptr_elems: + fptr = self._analyze_fptr(fptr_elem, tree, entry_type) + fs_entry = fsentry.FSEntry.from_fptr( + label=None, type_=u"Item", fptr=fptr) + self._add_amdsecs_to_fs_entry(fptr.amdids, fs_entry, tree) + siblings.append(fs_entry) + continue + # Other types, e.g.: items, aips... + if not len(fptr_elems): + continue + fptr = self._analyze_fptr(fptr_elems[0], tree, entry_type) + fs_entry = fsentry.FSEntry.from_fptr(label, entry_type, fptr) self._add_dmdsecs_to_fs_entry(elem, fs_entry, tree) self._add_amdsecs_to_fs_entry(fptr.amdids, fs_entry, tree) siblings.append(fs_entry) @@ -369,48 +382,49 @@ def _get_el_to_normative(parent_elem, normative_parent_elem): return el_to_normative @staticmethod - def _analyze_fptr(elem, tree, entry_type): - fptr = elem.find('mets:fptr', namespaces=utils.NAMESPACES) - if fptr is None: - return FPtr(*[None] * 7) - else: - file_uuid = derived_from = use = path = amdids = checksum = \ - checksumtype = None - file_id = fptr.get('FILEID') - file_elem = tree.find( - 'mets:fileSec//mets:file[@ID="' + file_id + '"]', - namespaces=utils.NAMESPACES) - if file_elem is None: - raise exceptions.ParseError( - '%s exists in structMap but not fileSec' % file_id) - use = file_elem.getparent().get('USE') - path = file_elem.find( - 'mets:FLocat', namespaces=utils.NAMESPACES).get( - utils.lxmlns('xlink') + 'href') - try: - path = utils.urldecode(path) - except ValueError: - raise exceptions.ParseError( - 'Value "{}" (of attribute xlink:href) is not a valid' - ' URL.'.format(path)) - amdids = file_elem.get('ADMID') - checksum = file_elem.get('CHECKSUM') - checksumtype = file_elem.get('CHECKSUMTYPE') - file_id_prefix = utils.FILE_ID_PREFIX - # If the file is an AIP, then 
its prefix is not "file-" but the - # name of the AIP. Therefore we need to get the extension-less - # basename of the AIP's path and remove its UUID suffix to ge - # the prefix to remove from the FILEID attribute value. - if entry_type.lower() == 'archival information package': - file_id_prefix = os.path.splitext( - os.path.basename(path))[0][:-36] - file_uuid = file_id.replace(file_id_prefix, '', 1) - group_uuid = file_elem.get('GROUPID', '').replace( - utils.GROUP_ID_PREFIX, '', 1) - if group_uuid != file_uuid: - derived_from = group_uuid # Use group_uuid as placeholder - return FPtr(file_uuid, derived_from, use, path, amdids, - checksum, checksumtype) + def _analyze_fptr(fptr_elem, tree, entry_type): + file_uuid = derived_from = use = path = amdids = checksum = \ + checksumtype = None + file_id = fptr_elem.get('FILEID') + file_elem = tree.find( + 'mets:fileSec//mets:file[@ID="' + file_id + '"]', + namespaces=utils.NAMESPACES) + if file_elem is None: + raise exceptions.ParseError( + '%s exists in structMap but not fileSec' % file_id) + use = file_elem.getparent().get('USE') + path = file_elem.find( + 'mets:FLocat', namespaces=utils.NAMESPACES).get( + utils.lxmlns('xlink') + 'href') + try: + path = utils.urldecode(path) + except ValueError: + raise exceptions.ParseError( + 'Value "{}" (of attribute xlink:href) is not a valid' + ' URL.'.format(path)) + amdids = file_elem.get('ADMID') + checksum = file_elem.get('CHECKSUM') + checksumtype = file_elem.get('CHECKSUMTYPE') + file_id_prefix = utils.FILE_ID_PREFIX + # If the file is an AIP, then its prefix is not "file-" but the + # name of the AIP. Therefore we need to get the extension-less + # basename of the AIP's path and remove its UUID suffix to get + # the prefix to remove from the FILEID attribute value. 
+ if entry_type.lower() == 'archival information package': + file_id_prefix = os.path.splitext(os.path.basename(path))[0][:-36] + # If the file is part of a directory (with no intermediate item), then + # its prefix *may not* be "file-" but the name of the file. This + # pattern is found in old Archivematica METS files, e.g. see + # ``fixtures/mets_dir_with_fptrs.xml``. + elif entry_type.lower() == 'directory' and file_id[:5] != "file-": + file_id_prefix = os.path.basename(path) + "-" + file_uuid = file_id.replace(file_id_prefix, '', 1) + group_uuid = file_elem.get('GROUPID', '').replace( + utils.GROUP_ID_PREFIX, '', 1) + if group_uuid != file_uuid: + derived_from = group_uuid # Use group_uuid as placeholder + return FPtr(file_uuid, derived_from, use, path, amdids, + checksum, checksumtype) @staticmethod def _add_dmdsecs_to_fs_entry(elem, fs_entry, tree): diff --git a/requirements/dev.txt b/requirements/dev.txt index d3a09bb..5b6f08e 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -1,4 +1,5 @@ -r base.txt +mock pytest pytest-cov sphinx>=1.3 diff --git a/tests/test_mets.py b/tests/test_mets.py index 6acb119..15dc2de 100644 --- a/tests/test_mets.py +++ b/tests/test_mets.py @@ -4,6 +4,7 @@ from lxml import etree from lxml.builder import ElementMaker import os +import mock import pytest from unittest import TestCase import uuid @@ -158,6 +159,46 @@ def test_fromfile_invalid_xlink_href(self): metsrw.METSDocument.fromfile( 'fixtures/mets_invalid_xlink_hrefs.xml') + def test_analyze_fptr(self): + parser = etree.XMLParser(remove_blank_text=True) + tree = etree.parse('fixtures/mets_dir_with_fptrs.xml', parser=parser) + mw = metsrw.METSDocument() + + # Test that exception is raised when fileSec cannot be found. 
+ fptr_elem = etree.fromstring('') + with pytest.raises(metsrw.exceptions.ParseError, + match='12345 exists in structMap but not fileSec'): + metsrw.METSDocument._analyze_fptr(fptr_elem, tree, 'directory') + + # Test that exception is raised when the path cannot be decoded. + fptr_elem = etree.fromstring( + '') + with mock.patch('metsrw.utils.urldecode') as urldecode: + urldecode.side_effect = ValueError() + with pytest.raises(metsrw.exceptions.ParseError, + match='is not a valid URL'): + metsrw.METSDocument._analyze_fptr(fptr_elem, tree, 'directory') + + # Test the integrity of the ``FPtr`` object returned. + fptr = mw._analyze_fptr(fptr_elem, tree, 'directory') + assert fptr == metsrw.mets.FPtr( + file_uuid='fc0e52ca-a688-41c0-a10b-c1d36e21e804', + derived_from=None, use='original', path='objects/AM68.csv', + amdids='amdSec_3', checksum=None, checksumtype=None) + + def test_analyze_fptr_from_aip(self): + parser = etree.XMLParser(remove_blank_text=True) + tree = etree.parse( + 'fixtures/production-pointer-file.xml', parser=parser) + mw = metsrw.METSDocument() + + fptr_elem = tree.find( + '//mets:fptr[1]', namespaces=metsrw.utils.NAMESPACES) + fptr = mw._analyze_fptr( + fptr_elem, tree, 'Archival Information Package') + assert fptr.file_uuid == '7327b00f-d83a-4ae8-bb89-84fce994e827' + assert fptr.use == 'Archival Information Package' + class TestWholeMETS(TestCase): """ Test integration between classes. 
""" @@ -587,6 +628,18 @@ def test_parse_production_pointer_file(self): aip_uuid = '7327b00f-d83a-4ae8-bb89-84fce994e827' assert mw.get_file(file_uuid=aip_uuid) + def test_parse_dir_with_fptrs(self): + mets_path = 'fixtures/mets_dir_with_fptrs.xml' + mw = metsrw.METSDocument.fromfile(mets_path) + assert len(mw.all_files()) == 5 + assert mw.get_file(type='Directory', label='objects') + for item in ( + ['3a6a182a-40a0-4c2b-9752-fc7e91ac1edf', 'objects/V00154.MPG'], + ['431913ba-4379-4373-8798-cc5f2b9dd769', 'objects/V00158.MPG'], + ['fc0e52ca-a688-41c0-a10b-c1d36e21e804', 'objects/AM68.csv'], + ): + assert mw.get_file(type='Item', file_uuid=item[0], path=item[1]) + # Helper methods def assert_mets_valid(self, mets_doc, schematron=metsrw.AM_SCT_PATH): From e7a584b1e83fd115191b7a323404e466a0b9908a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jes=C3=BAs=20Garc=C3=ADa=20Crespo?= Date: Mon, 3 Dec 2018 13:17:03 -0800 Subject: [PATCH 3/7] premisrw: add PREMIS v2.1 --- metsrw/plugins/premisrw/__init__.py | 9 +++++++++ metsrw/plugins/premisrw/utils.py | 19 +++++++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/metsrw/plugins/premisrw/__init__.py b/metsrw/plugins/premisrw/__init__.py index 69d2bc2..e356bc3 100644 --- a/metsrw/plugins/premisrw/__init__.py +++ b/metsrw/plugins/premisrw/__init__.py @@ -18,6 +18,12 @@ ) from .utils import ( XSI_NAMESPACE, + PREMIS_2_1_VERSION, + PREMIS_2_1_NAMESPACE, + PREMIS_2_1_XSD, + PREMIS_2_1_SCHEMA_LOCATION, + PREMIS_2_1_NAMESPACES, + PREMIS_2_1_META, PREMIS_2_2_VERSION, PREMIS_2_2_NAMESPACE, PREMIS_2_2_XSD, @@ -48,6 +54,9 @@ __all__ = ['PREMISElement', 'PREMISObject', 'PREMISEvent', 'PREMISAgent', 'data_to_premis', 'premis_to_data', 'data_find', 'data_find_all', 'data_find_text', 'data_find_text_or_all', 'XSI_NAMESPACE', + 'PREMIS_2_1_VERSION', 'PREMIS_2_1_NAMESPACE', 'PREMIS_2_1_XSD', + 'PREMIS_2_1_SCHEMA_LOCATION', 'PREMIS_2_1_NAMESPACES', + 'PREMIS_2_1_META', 'PREMIS_2_2_VERSION', 'PREMIS_2_2_NAMESPACE', 'PREMIS_2_2_XSD', 
'PREMIS_2_2_SCHEMA_LOCATION', 'PREMIS_2_2_NAMESPACES', 'PREMIS_2_2_META', 'PREMIS_3_0_VERSION', 'PREMIS_3_0_NAMESPACE', diff --git a/metsrw/plugins/premisrw/utils.py b/metsrw/plugins/premisrw/utils.py index c997143..ad4dc58 100644 --- a/metsrw/plugins/premisrw/utils.py +++ b/metsrw/plugins/premisrw/utils.py @@ -3,6 +3,21 @@ XSI_NAMESPACE = 'http://www.w3.org/2001/XMLSchema-instance' +# PREMIS v. 2.1 +PREMIS_2_1_VERSION = '2.1' +PREMIS_2_1_NAMESPACE = 'info:lc/xmlns/premis-v1' +PREMIS_2_1_XSD = 'http://www.loc.gov/standards/premis/v2/premis-v2-1.xsd' +PREMIS_2_1_SCHEMA_LOCATION = '{} {}'.format( + PREMIS_2_1_NAMESPACE, PREMIS_2_1_XSD) +PREMIS_2_1_NAMESPACES = { + 'premis': PREMIS_2_1_NAMESPACE, + 'xsi': XSI_NAMESPACE +} +PREMIS_2_1_META = { + 'xsi:schema_location': PREMIS_2_1_SCHEMA_LOCATION, + 'version': PREMIS_2_1_VERSION +} + # PREMIS v. 2.2 PREMIS_2_2_VERSION = '2.2' PREMIS_2_2_NAMESPACE = 'info:lc/xmlns/premis-v2' @@ -34,6 +49,10 @@ } PREMIS_VERSIONS_MAP = { + PREMIS_2_1_VERSION: { + 'namespaces': PREMIS_2_2_NAMESPACES, + 'meta': PREMIS_2_1_META + }, PREMIS_2_2_VERSION: { 'namespaces': PREMIS_2_2_NAMESPACES, 'meta': PREMIS_2_2_META From 329c8e15625bfeea4ddc03a52db8b5a7218eee71 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jes=C3=BAs=20Garc=C3=ADa=20Crespo?= Date: Fri, 8 Mar 2019 09:20:27 -0800 Subject: [PATCH 4/7] Fix FSEntry path encoding issue In Py2, ``FSEntry.path`` uses binary. This commit updates the constructor so it uses the `utf-8` encoder when we're encoding text type. The default encoder is `ascii` which is problematic. In Py3, ``FSEntry.path`` is using Unicode and that needs to be fixed. 
--- metsrw/fsentry.py | 14 ++++++++++---- tests/test_fsentry.py | 11 +++++++++++ 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/metsrw/fsentry.py b/metsrw/fsentry.py index 02c715d..d609f62 100644 --- a/metsrw/fsentry.py +++ b/metsrw/fsentry.py @@ -105,10 +105,16 @@ def __init__(self, path=None, label=None, use='original', type=u'Item', # path can validly be any encoding; if this value needs # to be spliced later on, it's better to treat it as a # bytestring than as actually being encoded text. - # TODO update this with six and bytes - if path: - path = str(path) - self.path = path + if six.PY2: + if isinstance(path, six.text_type): + self.path = path.encode('utf-8') + else: + self.path = path + else: # TODO: Py3 is still using Unicode. + if isinstance(path, six.binary_type): + self.path = path.decode('utf-8', errors="strict") + else: + self.path = path if label is None and path is not None: label = os.path.basename(path) self.label = label diff --git a/tests/test_fsentry.py b/tests/test_fsentry.py index 3968e23..12ed751 100644 --- a/tests/test_fsentry.py +++ b/tests/test_fsentry.py @@ -1,7 +1,9 @@ # -*- coding: utf-8 -*- + import pytest from unittest import TestCase import uuid +import six import metsrw @@ -9,6 +11,15 @@ class TestFSEntry(TestCase): """ Test FSEntry class. """ + @pytest.mark.skipif(six.PY3, reason="metsrw still uses Unicode in python3") + def test_path_is_binary(self): + """It should store the ``path`` as a bytestring.""" + sample = u'💜🎑💜' + assert isinstance(metsrw.FSEntry( + sample, type='Directory').path, six.binary_type) + assert isinstance(metsrw.FSEntry( + sample.encode('utf-8'), type='Directory').path, six.binary_type) + def test_create_invalid_checksum_type(self): """ It should only accept METS valid checksum types. 
""" metsrw.FSEntry( From eac45cf8e2b565e70a6562d918328827e85c2b55 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jes=C3=BAs=20Garc=C3=ADa=20Crespo?= Date: Fri, 8 Mar 2019 09:46:18 -0800 Subject: [PATCH 5/7] Fix flake8 F632 warnings --- tests/test_metadata.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_metadata.py b/tests/test_metadata.py index 2daaa5d..18c5717 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -52,8 +52,8 @@ def test_replacement_techmd(self): techmd_old = metsrw.SubSection('techMD', self.STUB_MDWRAP) techmd_new = metsrw.SubSection('techMD', self.STUB_MDWRAP) techmd_old.replace_with(techmd_new) - assert techmd_old.get_status() is 'superseded' - assert techmd_new.get_status() is 'current' + assert techmd_old.get_status() == 'superseded' + assert techmd_new.get_status() == 'current' def test_replacement_sourcemd(self): """ It should have no special behaviour replacing sourceMDs. """ From f7ff13f4dcb79ba2bcab859a30ddb17f16eee0b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jes=C3=BAs=20Garc=C3=ADa=20Crespo?= Date: Fri, 8 Mar 2019 09:50:56 -0800 Subject: [PATCH 6/7] Bump version to 0.3.0 --- metsrw/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metsrw/__init__.py b/metsrw/__init__.py index 7b3e56e..716a0c2 100644 --- a/metsrw/__init__.py +++ b/metsrw/__init__.py @@ -43,7 +43,7 @@ LOGGER = logging.getLogger(__name__) LOGGER.addHandler(logging.NullHandler()) -__version__ = '0.2.4' +__version__ = '0.3.0' __all__ = [ 'AMDSec', From ae2950e0fc3cb70cab323d08e7ae91bc137adc6e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jes=C3=BAs=20Garc=C3=ADa=20Crespo?= Date: Fri, 8 Mar 2019 12:44:27 -0800 Subject: [PATCH 7/7] Use utf-8 codec in premisrw This avoids the following issue: > return ' '.join([str(x) for x in texts]) ^^^^^^ E UnicodeEncodeError: 'ascii' codec can't encode characters in position 42-44: ordinal not in range(128) This should work in both Py2 and Py3. 
--- metsrw/plugins/premisrw/premis.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/metsrw/plugins/premisrw/premis.py b/metsrw/plugins/premisrw/premis.py index ee25dda..a615a71 100644 --- a/metsrw/plugins/premisrw/premis.py +++ b/metsrw/plugins/premisrw/premis.py @@ -694,7 +694,8 @@ def data_find_text(data, path): texts = [child for child in el[1:] if not isinstance(child, (tuple, list, dict))] if texts: - return ' '.join([str(x) for x in texts]) + return ' '.join( + [x.encode('utf-8', errors='ignore') for x in texts]) return None