Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,19 @@ Versioned according to [Semantic Versioning](http://semver.org/).

## Unreleased

## [2.28.0] - 2021-11-30

Added:

* Store parameterization of processors in METS for provenance, #747
* `ocrd workspace find --download`: Add a `--wait` option to wait between downloads, #745
* bashlib: Check fileGrps when parsing CLI args, #743, OCR-D/ocrd_olena#76
* Dockerfile: Install `time` to have `/usr/bin/time` in the image, #748, OCR-D/ocrd_all#271

Fixed:

* `ocrd-dummy`: Also set pcGtsId, v0.0.2, #739

## [2.27.0] - 2021-11-09

Fixed:
Expand Down Expand Up @@ -1381,6 +1394,7 @@ Fixed
Initial Release

<!-- link-labels -->
[2.28.0]: ../../compare/v2.28.0..v2.27.0
[2.27.0]: ../../compare/v2.27.0..v2.26.1
[2.26.1]: ../../compare/v2.26.1..v2.26.0
[2.26.0]: ../../compare/v2.26.0..v2.25.1
Expand Down
1 change: 1 addition & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ RUN apt-get update && apt-get -y install --no-install-recommends \
python3-pip \
make \
wget \
time \
curl \
sudo \
git \
Expand Down
7 changes: 4 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ pip install ocrd
pip install ocrd_modelfactory
```

All python software released by [OCR-D](https://github.com/OCR-D) requires Python 3.5 or higher.
All Python software released by [OCR-D](https://github.com/OCR-D) requires Python 3.6 or higher.

## Command line tools

Expand Down Expand Up @@ -118,5 +118,6 @@ Test with local files: `make test`

## See Also

- [OCR-D Specifications](https://ocr-d.github.io) ([Repo](https://github.com/ocr-d/spec))
- [OCR-D Documentation](https://ocr-d.github.io/docs) ([Repo](https://github.com/ocr-d/docs))
- [OCR-D Specifications](https://ocr-d.de/en/spec/) ([Repo](https://github.com/ocr-d/spec))
- [OCR-D core API documentation](https://ocr-d.de/core) (built here via `make docs`)
- [OCR-D Website](https://ocr-d.de) ([Repo](https://github.com/ocr-d/ocrd-website))
1 change: 1 addition & 0 deletions ocrd/bashlib/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ help:
# Build the lib
lib: lib.bash

.DELETE_ON_ERROR:
lib.bash: $(wildcard src/*.bash)
shinclude -c pound src/lib.bash > "$@"

Expand Down
12 changes: 12 additions & 0 deletions ocrd/bashlib/src/parse_argv.bash
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,18 @@ ocrd__parse_argv () {
ocrd__raise "Provide --output-file-grp/-O explicitly!"
fi

# check fileGrps
local _valopts=( --workspace "${ocrd__argv[working_dir]}" )
if [[ ${ocrd__argv[overwrite]} = true ]]; then
_valopts+=( --overwrite )
fi
if [[ -n "${ocrd__argv[page_id]:-}" ]]; then
_valopts+=( --page-id "${ocrd__argv[page_id]}" )
fi
_valopts+=( "${OCRD_TOOL_NAME#ocrd-} -I ${ocrd__argv[input_file_grp]} -O ${ocrd__argv[output_file_grp]}" )
ocrd validate tasks "${_valopts[@]}" || exit $?

# check parameters
local params_parsed retval
params_parsed="$(ocrd ocrd-tool "$OCRD_TOOL_JSON" tool $OCRD_TOOL_NAME parse-params "${__parameters[@]}" "${__parameter_overrides[@]}")" || {
retval=$?
Expand Down
8 changes: 5 additions & 3 deletions ocrd/ocrd/cli/validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,14 +100,16 @@ def validate_page(page, **kwargs):
# _inform_of_result(WorkspaceValidator.validate(Resolver(), mets_url, **kwargs))

@validate_cli.command('tasks')
@click.option('--workspace', nargs=1, required=False, help='Workspace these tasks are to be run. If omitted, only validate syntax')
@click.option('--workspace', nargs=1, required=False, help='Workspace directory these tasks are to be run. If omitted, only validate syntax')
@click.option('--overwrite', is_flag=True, default=False, help='When checking against a concrete workspace, simulate overwriting output or page range.')
@click.option('-g', '--page-id', help="ID(s) of the pages to process")
@click.argument('tasks', nargs=-1, required=True)
def validate_process(tasks, workspace):
def validate_process(tasks, workspace, overwrite, page_id):
'''
Validate a sequence of tasks passable to 'ocrd process'
'''
if workspace:
_inform_of_result(validate_tasks([ProcessorTask.parse(t) for t in tasks], Workspace(Resolver(), directory=workspace)))
_inform_of_result(validate_tasks([ProcessorTask.parse(t) for t in tasks], Workspace(Resolver(), directory=workspace), page_id=page_id, overwrite=overwrite))
else:
for t in [ProcessorTask.parse(t) for t in tasks]:
_inform_of_result(t.validate())
6 changes: 5 additions & 1 deletion ocrd/ocrd/cli/workspace.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import sys
from glob import glob # XXX pathlib.Path.glob does not support absolute globs
import re
import time

import click

Expand Down Expand Up @@ -347,8 +348,9 @@ def workspace_cli_bulk_add(ctx, regex, mimetype, page_id, file_id, url, file_grp
'local_filename',
]))
@click.option('--download', is_flag=True, help="Download found files to workspace and change location in METS file ")
@click.option('--wait', type=int, default=0, help="Wait this many seconds between download requests")
@pass_workspace
def workspace_find(ctx, file_grp, mimetype, page_id, file_id, output_field, download):
def workspace_find(ctx, file_grp, mimetype, page_id, file_id, output_field, download, wait):
"""
Find files.

Expand All @@ -367,6 +369,8 @@ def workspace_find(ctx, file_grp, mimetype, page_id, file_id, output_field, down
if download and not f.local_filename:
workspace.download_file(f)
modified_mets = True
if wait:
time.sleep(wait)
ret.append([f.ID if field == 'pageId' else getattr(f, field) or ''
for field in output_field])
if modified_mets:
Expand Down
12 changes: 12 additions & 0 deletions ocrd/ocrd/lib.bash
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,18 @@ ocrd__parse_argv () {
ocrd__raise "Provide --output-file-grp/-O explicitly!"
fi

# check fileGrps
local _valopts=( --workspace "${ocrd__argv[working_dir]}" )
if [[ ${ocrd__argv[overwrite]} = true ]]; then
_valopts+=( --overwrite )
fi
if [[ -n "${ocrd__argv[page_id]:-}" ]]; then
_valopts+=( --page-id "${ocrd__argv[page_id]}" )
fi
_valopts+=( "${OCRD_TOOL_NAME#ocrd-} -I ${ocrd__argv[input_file_grp]} -O ${ocrd__argv[output_file_grp]}" )
ocrd validate tasks "${_valopts[@]}" || exit $?

# check parameters
local params_parsed retval
params_parsed="$(ocrd ocrd-tool "$OCRD_TOOL_JSON" tool $OCRD_TOOL_NAME parse-params "${__parameters[@]}" "${__parameter_overrides[@]}")" || {
retval=$?
Expand Down
5 changes: 3 additions & 2 deletions ocrd/ocrd/processor/builtin/dummy_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@

class DummyProcessor(Processor):
"""
Bare-bones processor that copies mets:file from input group to output group.
Bare-bones processor that only copies mets:file from input group to output group.
"""

def process(self):
Expand All @@ -34,6 +34,7 @@ def process(self):
ext = MIME_TO_EXT.get(input_file.mimetype, '')
local_filename = join(self.output_file_grp, file_id + ext)
pcgts = page_from_file(self.workspace.download_file(input_file))
pcgts.set_pcGtsId(file_id)
self.add_metadata(pcgts)
LOG.info("cp %s %s # %s -> %s", input_file.url, local_filename, input_file.ID, file_id)
if input_file.mimetype == MIMETYPE_PAGE:
Expand Down Expand Up @@ -75,7 +76,7 @@ def process(self):

def __init__(self, *args, **kwargs):
kwargs['ocrd_tool'] = OCRD_TOOL
kwargs['version'] = '0.0.1'
kwargs['version'] = '0.0.2'
super(DummyProcessor, self).__init__(*args, **kwargs)

@click.command()
Expand Down
15 changes: 10 additions & 5 deletions ocrd/ocrd/processor/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,20 +88,25 @@ def run_processor(
processor.process()
t1_wall = perf_counter() - t0_wall
t1_cpu = process_time() - t0_cpu
logProfile.info("Executing processor '%s' took %fs (wall) %fs (CPU)( [--input-file-grp='%s' --output-file-grp='%s' --parameter='%s']" % (
logProfile.info("Executing processor '%s' took %fs (wall) %fs (CPU)( [--input-file-grp='%s' --output-file-grp='%s' --parameter='%s' --page-id='%s']" % (
ocrd_tool['executable'],
t1_wall,
t1_cpu,
input_file_grp if input_file_grp else '',
output_file_grp if output_file_grp else '',
json.dumps(parameter) if parameter else {}
input_file_grp or '',
output_file_grp or '',
json.dumps(parameter) or '',
page_id or ''
))
workspace.mets.add_agent(
name=name,
_type='OTHER',
othertype='SOFTWARE',
role='OTHER',
otherrole=otherrole
otherrole=otherrole,
notes=[({'option': 'input-file-grp'}, input_file_grp or ''),
({'option': 'output-file-grp'}, output_file_grp or ''),
({'option': 'parameter'}, json.dumps(parameter or '')),
({'option': 'page-id'}, page_id or '')]
)
workspace.save_mets()
return processor
Expand Down
3 changes: 3 additions & 0 deletions ocrd_models/ocrd_models/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
'TAG_METS_FLOCAT',
'TAG_METS_METSHDR',
'TAG_METS_NAME',
'TAG_METS_NOTE',
'TAG_METS_STRUCTMAP',
'TAG_MODS_IDENTIFIER',
'TAG_PAGE_ALTERNATIVEIMAGE',
Expand All @@ -39,6 +40,7 @@
'xlink': "http://www.w3.org/1999/xlink",
'page': "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15",
'xsl': 'http://www.w3.org/1999/XSL/Transform#',
'ocrd': 'https://ocr-d.de',
}

# pylint: disable=bad-whitespace
Expand All @@ -51,6 +53,7 @@
TAG_METS_FLOCAT = '{%s}FLocat' % NAMESPACES['mets']
TAG_METS_METSHDR = '{%s}metsHdr' % NAMESPACES['mets']
TAG_METS_NAME = '{%s}name' % NAMESPACES['mets']
TAG_METS_NOTE = '{%s}note' % NAMESPACES['mets']
TAG_METS_STRUCTMAP = '{%s}structMap' % NAMESPACES['mets']

TAG_MODS_IDENTIFIER = '{%s}identifier' % NAMESPACES['mods']
Expand Down
42 changes: 36 additions & 6 deletions ocrd_models/ocrd_models/ocrd_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
API to ``mets:agent``
"""
# import os
from .constants import NAMESPACES as NS, TAG_METS_AGENT, TAG_METS_NAME
from .constants import NAMESPACES as NS, TAG_METS_AGENT, TAG_METS_NAME, TAG_METS_NOTE
from .ocrd_xml_base import ET

class OcrdAgent():
Expand All @@ -20,7 +20,8 @@ class OcrdAgent():
# # version = name_parts[1][1:] # v0.0.1 => 0.0.1
# return OcrdAgent(el, name, role, _type, otherrole)

def __init__(self, el=None, name=None, _type=None, othertype=None, role=None, otherrole=None):
def __init__(self, el=None, name=None, _type=None, othertype=None, role=None, otherrole=None,
notes=None):
"""
Args:
el (LxmlElement):
Expand All @@ -29,6 +30,7 @@ def __init__(self, el=None, name=None, _type=None, othertype=None, role=None, ot
othertype (string):
role (string):
otherrole (string):
notes (list): tuples of (attribute dict, text), one per ``mets:note``
"""
if el is None:
el = ET.Element(TAG_METS_AGENT)
Expand All @@ -38,6 +40,7 @@ def __init__(self, el=None, name=None, _type=None, othertype=None, role=None, ot
self.othertype = othertype
self.role = role
self.otherrole = otherrole
self.notes = notes

def __str__(self):
"""
Expand Down Expand Up @@ -105,7 +108,7 @@ def otherrole(self):
@otherrole.setter
def otherrole(self, otherrole):
"""
Get the ``OTHERROLE`` attribute value.
Set the ``OTHERROLE`` attribute value.
"""
if otherrole is not None:
self._el.set('ROLE', 'OTHER')
Expand All @@ -116,17 +119,44 @@ def name(self):
"""
Get the ``mets:name`` element value.
"""
el_name = self._el.find('mets:name', NS)
el_name = self._el.find(TAG_METS_NAME)
if el_name is not None:
return el_name.text

@name.setter
def name(self, name):
"""
Get the ``mets:name`` element value.
Set the ``mets:name`` element value.
"""
if name is not None:
el_name = self._el.find('mets:name', NS)
el_name = self._el.find(TAG_METS_NAME)
if el_name is None:
el_name = ET.SubElement(self._el, TAG_METS_NAME)
el_name.text = name

@property
def notes(self):
    """
    Get the ``mets:note`` element values (as tuples of attributes and text).

    Returns:
        list: one ``(attrib, text)`` tuple per ``mets:note`` child of this
        agent element; an empty list if there are none.
    """
    # findall() always returns a list (possibly empty), never None, so the
    # result can be iterated directly without a None guard.
    return [(el_note.attrib, el_note.text)
            for el_note in self._el.findall(TAG_METS_NOTE)]

@notes.setter
def notes(self, notes):
    """
    Set the ``mets:note`` element values.

    All existing ``mets:note`` children are removed first; then one new
    child is appended per entry of ``notes``.

    Args:
        notes (list): tuples of ``(attrib, text)``. ``text`` becomes the
            element text and every key of the ``attrib`` dict becomes an
            attribute qualified with the ``ocrd`` namespace. A false value
            (``None`` or empty) only clears the existing notes.
    """
    # Drop whatever notes are currently attached to the agent element.
    for stale_note in self._el.findall(TAG_METS_NOTE):
        self._el.remove(stale_note)
    if not notes:
        return
    for entry in notes:
        el_note = ET.SubElement(self._el, TAG_METS_NOTE, nsmap={'ocrd': NS['ocrd']})
        attrib, text = entry
        el_note.text = text
        # Clark notation prefix, i.e. "{namespace-uri}" + local attribute name.
        qualifier = '{%s}' % NS["ocrd"]
        for key, value in attrib.items():
            el_note.set(qualifier + key, value)
2 changes: 1 addition & 1 deletion ocrd_utils/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

setup(
name='ocrd_utils',
version='2.27.0',
version='2.28.0',
description='OCR-D framework - shared code, helpers, constants',
long_description=open('README.md').read(),
long_description_content_type='text/markdown',
Expand Down
30 changes: 30 additions & 0 deletions tests/model/test_ocrd_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
PageType,
TextRegionType,
TextLineType,
TextEquivType,
TextStyleType,
OrderedGroupIndexedType,
UnorderedGroupIndexedType,
ReadingOrderType,
Expand Down Expand Up @@ -341,5 +343,33 @@ def test_id(self):
assert pcgts.id == 'PAGE_0017_PAGE'
assert pcgts.get_Page().id == 'OCR-D-IMG/INPUT_0017.tif'

def test_style_inheritance_from_scratch(self):
    """
    Build a PAGE document from scratch, style the region, and check that
    both the region and its line report the font family after a
    serialize/parse round trip.

    See https://github.com/OCR-D/core/pull/700
    """
    document = PcGtsType(pcGtsId="foo")
    page = PageType()
    text_region = TextRegionType()
    font_style = TextStyleType(fontFamily='Schwabacher')
    line = TextLineType(TextEquiv=[TextEquivType(Unicode='foo')])

    # Assemble the hierarchy: document -> page -> region -> line
    document.set_Page(page)
    page.add_TextRegion(text_region)
    text_region.add_TextLine(line)

    # Nothing carries a style before one is set explicitly
    assert not line.get_TextStyle()
    assert not text_region.get_TextStyle()

    text_region.set_TextStyle(font_style)

    # TODO doesn't work that way
    # assert text_region.get_TextStyle() == font_style
    # assert line.get_TextStyle() == font_style

    # Round-trip through XML and inspect the re-parsed objects
    reparsed = parseString(to_xml(document, skip_declaration=True))
    reparsed_region = reparsed.get_Page().get_TextRegion()[0]
    reparsed_line = reparsed_region.get_TextLine()[0]
    for element in (reparsed_region, reparsed_line):
        assert element.get_TextStyle().get_fontFamily() == 'Schwabacher'


if __name__ == '__main__':
main(__file__)