diff --git a/CHANGELOG.md b/CHANGELOG.md index 56d16c2f24..8fc87a8225 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,19 @@ Versioned according to [Semantic Versioning](http://semver.org/). ## Unreleased +## [2.28.0] - 2021-11-30 + +Added: + + * Store parameterization of processors in METS for provenance, #747 + * `ocrd workspace find --download`: Add a `--wait` option to wait between downloads, #745 + * bashlib: Check fileGrps when parsing CLI args, #743, OCR-D/ocrd_olena#76 + * Dockerfile: Install `time` to have `/usr/bin/time` in the image, #748, OCR-D/ocrd_all#271 + +Fixed: + + * `ocrd-dummy`: Also set pcGtsId, v0.0.2, #739 + ## [2.27.0] - 2021-11-09 Fixed: @@ -1381,6 +1394,7 @@ Fixed Initial Release +[2.28.0]: ../../compare/v2.28.0..v2.27.0 [2.27.0]: ../../compare/v2.27.0..v2.26.1 [2.26.1]: ../../compare/v2.26.1..v2.26.0 [2.26.0]: ../../compare/v2.26.0..v2.25.1 diff --git a/Dockerfile b/Dockerfile index d99cdef4dc..adda1017f2 100644 --- a/Dockerfile +++ b/Dockerfile @@ -23,6 +23,7 @@ RUN apt-get update && apt-get -y install --no-install-recommends \ python3-pip \ make \ wget \ + time \ curl \ sudo \ git \ diff --git a/README.md b/README.md index e6bfb46e35..2530b7a997 100644 --- a/README.md +++ b/README.md @@ -53,7 +53,7 @@ pip install ocrd pip install ocrd_modelfactory ``` -All python software released by [OCR-D](https://github.com/OCR-D) requires Python 3.5 or higher. +All python software released by [OCR-D](https://github.com/OCR-D) requires Python 3.6 or higher. 
## Command line tools @@ -118,5 +118,6 @@ Test with local files: `make test` ## See Also - - [OCR-D Specifications](https://ocr-d.github.io) ([Repo](https://github.com/ocr-d/spec)) - - [OCR-D Documentation](https://ocr-d.github.io/docs) ([Repo](https://github.com/ocr-d/docs)) + - [OCR-D Specifications](https://ocr-d.de/en/spec/) ([Repo](https://github.com/ocr-d/spec)) + - [OCR-D core API documentation](https://ocr-d.de/core) (built here via `make docs`) + - [OCR-D Website](https://ocr-d.de) ([Repo](https://github.com/ocr-d/ocrd-website)) diff --git a/ocrd/bashlib/Makefile b/ocrd/bashlib/Makefile index fcd56d08f2..5a10b5c857 100644 --- a/ocrd/bashlib/Makefile +++ b/ocrd/bashlib/Makefile @@ -13,6 +13,7 @@ help: # Build the lib lib: lib.bash +.DELETE_ON_ERROR: lib.bash: $(wildcard src/*.bash) shinclude -c pound src/lib.bash > "$@" diff --git a/ocrd/bashlib/src/parse_argv.bash b/ocrd/bashlib/src/parse_argv.bash index 9ed027a0bc..542372166b 100644 --- a/ocrd/bashlib/src/parse_argv.bash +++ b/ocrd/bashlib/src/parse_argv.bash @@ -68,6 +68,18 @@ ocrd__parse_argv () { ocrd__raise "Provide --output-file-grp/-O explicitly!" fi + # check fileGrps + local _valopts=( --workspace "${ocrd__argv[working_dir]}" ) + if [[ ${ocrd__argv[overwrite]} = true ]]; then + _valopts+=( --overwrite ) + fi + if [[ -n "${ocrd__argv[page_id]:-}" ]]; then + _valopts+=( --page-id "${ocrd__argv[page_id]}" ) + fi + _valopts+=( "${OCRD_TOOL_NAME#ocrd-} -I ${ocrd__argv[input_file_grp]} -O ${ocrd__argv[output_file_grp]}" ) + ocrd validate tasks "${_valopts[@]}" || exit $? + + # check parameters local params_parsed retval params_parsed="$(ocrd ocrd-tool "$OCRD_TOOL_JSON" tool $OCRD_TOOL_NAME parse-params "${__parameters[@]}" "${__parameter_overrides[@]}")" || { retval=$? 
diff --git a/ocrd/ocrd/cli/validate.py b/ocrd/ocrd/cli/validate.py index c31bb4b7c0..310f87f095 100644 --- a/ocrd/ocrd/cli/validate.py +++ b/ocrd/ocrd/cli/validate.py @@ -100,14 +100,16 @@ def validate_page(page, **kwargs): # _inform_of_result(WorkspaceValidator.validate(Resolver(), mets_url, **kwargs)) @validate_cli.command('tasks') -@click.option('--workspace', nargs=1, required=False, help='Workspace these tasks are to be run. If omitted, only validate syntax') +@click.option('--workspace', nargs=1, required=False, help='Workspace directory these tasks are to be run. If omitted, only validate syntax') +@click.option('--overwrite', is_flag=True, default=False, help='When checking against a concrete workspace, simulate overwriting output or page range.') +@click.option('-g', '--page-id', help="ID(s) of the pages to process") @click.argument('tasks', nargs=-1, required=True) -def validate_process(tasks, workspace): +def validate_process(tasks, workspace, overwrite, page_id): ''' Validate a sequence of tasks passable to 'ocrd process' ''' if workspace: - _inform_of_result(validate_tasks([ProcessorTask.parse(t) for t in tasks], Workspace(Resolver(), directory=workspace))) + _inform_of_result(validate_tasks([ProcessorTask.parse(t) for t in tasks], Workspace(Resolver(), directory=workspace), page_id=page_id, overwrite=overwrite)) else: for t in [ProcessorTask.parse(t) for t in tasks]: _inform_of_result(t.validate()) diff --git a/ocrd/ocrd/cli/workspace.py b/ocrd/ocrd/cli/workspace.py index a46ac44824..4205964ec5 100644 --- a/ocrd/ocrd/cli/workspace.py +++ b/ocrd/ocrd/cli/workspace.py @@ -13,6 +13,7 @@ import sys from glob import glob # XXX pathlib.Path.glob does not support absolute globs import re +import time import click @@ -347,8 +348,9 @@ def workspace_cli_bulk_add(ctx, regex, mimetype, page_id, file_id, url, file_grp 'local_filename', ])) @click.option('--download', is_flag=True, help="Download found files to workspace and change location in METS file ") 
+@click.option('--wait', type=int, default=0, help="Wait this many seconds between download requests") @pass_workspace -def workspace_find(ctx, file_grp, mimetype, page_id, file_id, output_field, download): +def workspace_find(ctx, file_grp, mimetype, page_id, file_id, output_field, download, wait): """ Find files. @@ -367,6 +369,8 @@ def workspace_find(ctx, file_grp, mimetype, page_id, file_id, output_field, down if download and not f.local_filename: workspace.download_file(f) modified_mets = True + if wait: + time.sleep(wait) ret.append([f.ID if field == 'pageId' else getattr(f, field) or '' for field in output_field]) if modified_mets: diff --git a/ocrd/ocrd/lib.bash b/ocrd/ocrd/lib.bash index 3cd38f20ec..c8573b53bd 100644 --- a/ocrd/ocrd/lib.bash +++ b/ocrd/ocrd/lib.bash @@ -156,6 +156,18 @@ ocrd__parse_argv () { ocrd__raise "Provide --output-file-grp/-O explicitly!" fi + # check fileGrps + local _valopts=( --workspace "${ocrd__argv[working_dir]}" ) + if [[ ${ocrd__argv[overwrite]} = true ]]; then + _valopts+=( --overwrite ) + fi + if [[ -n "${ocrd__argv[page_id]:-}" ]]; then + _valopts+=( --page-id "${ocrd__argv[page_id]}" ) + fi + _valopts+=( "${OCRD_TOOL_NAME#ocrd-} -I ${ocrd__argv[input_file_grp]} -O ${ocrd__argv[output_file_grp]}" ) + ocrd validate tasks "${_valopts[@]}" || exit $? + + # check parameters local params_parsed retval params_parsed="$(ocrd ocrd-tool "$OCRD_TOOL_JSON" tool $OCRD_TOOL_NAME parse-params "${__parameters[@]}" "${__parameter_overrides[@]}")" || { retval=$? diff --git a/ocrd/ocrd/processor/builtin/dummy_processor.py b/ocrd/ocrd/processor/builtin/dummy_processor.py index 1f517e32f3..9a1ad511e7 100644 --- a/ocrd/ocrd/processor/builtin/dummy_processor.py +++ b/ocrd/ocrd/processor/builtin/dummy_processor.py @@ -21,7 +21,7 @@ class DummyProcessor(Processor): """ - Bare-bones processor that copies mets:file from input group to output group. + Bare-bones processor that only copies mets:file from input group to output group. 
""" def process(self): @@ -34,6 +34,7 @@ def process(self): ext = MIME_TO_EXT.get(input_file.mimetype, '') local_filename = join(self.output_file_grp, file_id + ext) pcgts = page_from_file(self.workspace.download_file(input_file)) + pcgts.set_pcGtsId(file_id) self.add_metadata(pcgts) LOG.info("cp %s %s # %s -> %s", input_file.url, local_filename, input_file.ID, file_id) if input_file.mimetype == MIMETYPE_PAGE: @@ -75,7 +76,7 @@ def process(self): def __init__(self, *args, **kwargs): kwargs['ocrd_tool'] = OCRD_TOOL - kwargs['version'] = '0.0.1' + kwargs['version'] = '0.0.2' super(DummyProcessor, self).__init__(*args, **kwargs) @click.command() diff --git a/ocrd/ocrd/processor/helpers.py b/ocrd/ocrd/processor/helpers.py index 62129098e0..41eeb0638c 100644 --- a/ocrd/ocrd/processor/helpers.py +++ b/ocrd/ocrd/processor/helpers.py @@ -88,20 +88,25 @@ def run_processor( processor.process() t1_wall = perf_counter() - t0_wall t1_cpu = process_time() - t0_cpu - logProfile.info("Executing processor '%s' took %fs (wall) %fs (CPU)( [--input-file-grp='%s' --output-file-grp='%s' --parameter='%s']" % ( + logProfile.info("Executing processor '%s' took %fs (wall) %fs (CPU)( [--input-file-grp='%s' --output-file-grp='%s' --parameter='%s' --page-id='%s']" % ( ocrd_tool['executable'], t1_wall, t1_cpu, - input_file_grp if input_file_grp else '', - output_file_grp if output_file_grp else '', - json.dumps(parameter) if parameter else {} + input_file_grp or '', + output_file_grp or '', + json.dumps(parameter) or '', + page_id or '' )) workspace.mets.add_agent( name=name, _type='OTHER', othertype='SOFTWARE', role='OTHER', - otherrole=otherrole + otherrole=otherrole, + notes=[({'option': 'input-file-grp'}, input_file_grp or ''), + ({'option': 'output-file-grp'}, output_file_grp or ''), + ({'option': 'parameter'}, json.dumps(parameter or '')), + ({'option': 'page-id'}, page_id or '')] ) workspace.save_mets() return processor diff --git a/ocrd_models/ocrd_models/constants.py 
b/ocrd_models/ocrd_models/constants.py index 5297447eaa..6c8b0e1017 100644 --- a/ocrd_models/ocrd_models/constants.py +++ b/ocrd_models/ocrd_models/constants.py @@ -17,6 +17,7 @@ 'TAG_METS_FLOCAT', 'TAG_METS_METSHDR', 'TAG_METS_NAME', + 'TAG_METS_NOTE', 'TAG_METS_STRUCTMAP', 'TAG_MODS_IDENTIFIER', 'TAG_PAGE_ALTERNATIVEIMAGE', @@ -39,6 +40,7 @@ 'xlink': "http://www.w3.org/1999/xlink", 'page': "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15", 'xsl': 'http://www.w3.org/1999/XSL/Transform#', + 'ocrd': 'https://ocr-d.de', } # pylint: disable=bad-whitespace @@ -51,6 +53,7 @@ TAG_METS_FLOCAT = '{%s}FLocat' % NAMESPACES['mets'] TAG_METS_METSHDR = '{%s}metsHdr' % NAMESPACES['mets'] TAG_METS_NAME = '{%s}name' % NAMESPACES['mets'] +TAG_METS_NOTE = '{%s}note' % NAMESPACES['mets'] TAG_METS_STRUCTMAP = '{%s}structMap' % NAMESPACES['mets'] TAG_MODS_IDENTIFIER = '{%s}identifier' % NAMESPACES['mods'] diff --git a/ocrd_models/ocrd_models/ocrd_agent.py b/ocrd_models/ocrd_models/ocrd_agent.py index f189a7cdf5..456ebeafd0 100644 --- a/ocrd_models/ocrd_models/ocrd_agent.py +++ b/ocrd_models/ocrd_models/ocrd_agent.py @@ -2,7 +2,7 @@ API to ``mets:agent`` """ # import os -from .constants import NAMESPACES as NS, TAG_METS_AGENT, TAG_METS_NAME +from .constants import NAMESPACES as NS, TAG_METS_AGENT, TAG_METS_NAME, TAG_METS_NOTE from .ocrd_xml_base import ET class OcrdAgent(): @@ -20,7 +20,8 @@ class OcrdAgent(): # # version = name_parts[1][1:] # v0.0.1 => 0.0.1 # return OcrdAgent(el, name, role, _type, otherrole) - def __init__(self, el=None, name=None, _type=None, othertype=None, role=None, otherrole=None): + def __init__(self, el=None, name=None, _type=None, othertype=None, role=None, otherrole=None, + notes=None): """ Args: el (LxmlElement): @@ -29,6 +30,7 @@ def __init__(self, el=None, name=None, _type=None, othertype=None, role=None, ot othertype (string): role (string): otherrole (string): + notes (list of (dict, string) tuples): """ if el is None: el = ET.Element(TAG_METS_AGENT) @@ -38,6 
+40,7 @@ def __init__(self, el=None, name=None, _type=None, othertype=None, role=None, ot self.othertype = othertype self.role = role self.otherrole = otherrole + self.notes = notes def __str__(self): """ @@ -105,7 +108,7 @@ def otherrole(self): @otherrole.setter def otherrole(self, otherrole): """ - Get the ``OTHERROLE`` attribute value. + Set the ``OTHERROLE`` attribute value. """ if otherrole is not None: self._el.set('ROLE', 'OTHER') @@ -116,17 +119,44 @@ def name(self): """ Get the ``mets:name`` element value. """ - el_name = self._el.find('mets:name', NS) + el_name = self._el.find(TAG_METS_NAME) if el_name is not None: return el_name.text @name.setter def name(self, name): """ - Get the ``mets:name`` element value. + Set the ``mets:name`` element value. """ if name is not None: - el_name = self._el.find('mets:name', NS) + el_name = self._el.find(TAG_METS_NAME) if el_name is None: el_name = ET.SubElement(self._el, TAG_METS_NAME) el_name.text = name + + @property + def notes(self): + """ + Get the ``mets:note`` element values (as tuples of attributes and text). + """ + el_notes = self._el.findall(TAG_METS_NOTE) + if el_notes is not None: + return [(note.attrib, note.text) + for note in el_notes] + + @notes.setter + def notes(self, notes): + """ + Set the ``mets:note`` element values. 
+ """ + el_notes = self._el.findall(TAG_METS_NOTE) + if el_notes: + for el_note in el_notes: + self._el.remove(el_note) + if notes: + for note in notes: + el_note = ET.SubElement(self._el, TAG_METS_NOTE, nsmap={'ocrd': NS['ocrd']}) + attrib, text = note + el_note.text = text + for name, value in attrib.items(): + el_note.set('{%s}' % NS["ocrd"] + name, value) diff --git a/ocrd_utils/setup.py b/ocrd_utils/setup.py index a3fdd0fa81..b398d6286d 100644 --- a/ocrd_utils/setup.py +++ b/ocrd_utils/setup.py @@ -5,7 +5,7 @@ setup( name='ocrd_utils', - version='2.27.0', + version='2.28.0', description='OCR-D framework - shared code, helpers, constants', long_description=open('README.md').read(), long_description_content_type='text/markdown', diff --git a/tests/model/test_ocrd_page.py b/tests/model/test_ocrd_page.py index e8cdb11d34..5cc0a147d2 100644 --- a/tests/model/test_ocrd_page.py +++ b/tests/model/test_ocrd_page.py @@ -8,6 +8,8 @@ PageType, TextRegionType, TextLineType, + TextEquivType, + TextStyleType, OrderedGroupIndexedType, UnorderedGroupIndexedType, ReadingOrderType, @@ -341,5 +343,33 @@ def test_id(self): assert pcgts.id == 'PAGE_0017_PAGE' assert pcgts.get_Page().id == 'OCR-D-IMG/INPUT_0017.tif' + def test_style_inheritance_from_scratch(self): + """https://github.com/OCR-D/core/pull/700""" + pcgts = PcGtsType(pcGtsId="foo") + page = PageType() + region = TextRegionType() + textstyle = TextStyleType(fontFamily='Schwabacher') + textline = TextLineType(TextEquiv=[TextEquivType(Unicode='foo')]) + + pcgts.set_Page(page) + page.add_TextRegion(region) + region.add_TextLine(textline) + + assert not textline.get_TextStyle() + assert not region.get_TextStyle() + + region.set_TextStyle(textstyle) + + # TODO doesn't work that way + # assert region.get_TextStyle() == textstyle + # assert textline.get_TextStyle() == textstyle + + pcgts = parseString(to_xml(pcgts, skip_declaration=True)) + region = pcgts.get_Page().get_TextRegion()[0] + textline = region.get_TextLine()[0] + 
assert region.get_TextStyle().get_fontFamily() == 'Schwabacher' + assert textline.get_TextStyle().get_fontFamily() == 'Schwabacher' + + if __name__ == '__main__': main(__file__)