From 6dd2593812a2730a85c91488a411dcdcf4bfff50 Mon Sep 17 00:00:00 2001
From: Robert Sachunsky
Date: Wed, 9 Jul 2025 13:09:06 +0200
Subject: [PATCH 1/3] try to appease flake8

---
 .github/workflows/unit-test.yml | 2 +-
 src/ocrd/cli/__init__.py        | 4 +++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/unit-test.yml b/.github/workflows/unit-test.yml
index d9e960fde..2d127023a 100644
--- a/.github/workflows/unit-test.yml
+++ b/.github/workflows/unit-test.yml
@@ -63,4 +63,4 @@ jobs:
         # stop the build if there are Python syntax errors or undefined names
         flake8 src --count --exit-zero --select=E9,F63,F7,F82 --show-source --statistics --format=github
         # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
-        flake8 src --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics --format=github
+        flake8 src --count --exit-zero --per-file-ignores=src/ocrd/cli/__init__.py:E402 --extend-exclude=src/ocrd_models/ocrd_page_generateds.py,src/ocrd_page_user_methods/*.py --max-complexity=10 --max-line-length=127 --statistics --format=github

diff --git a/src/ocrd/cli/__init__.py b/src/ocrd/cli/__init__.py
index 2a08d9710..8f6cd04ac 100644
--- a/src/ocrd/cli/__init__.py
+++ b/src/ocrd/cli/__init__.py
@@ -10,13 +10,14 @@
 from ocrd_utils import config

+
 # pylint: disable=wrong-import-position
 def command_with_replaced_help(*replacements):

     class CommandWithReplacedHelp(click.Command):
         def get_help(self, ctx):
-            newhelp : str = super().get_help(ctx)
+            newhelp: str = super().get_help(ctx)
             for replacement in replacements:
                 newhelp = re.sub(*replacement, newhelp)
             # print(newhelp)
@@ -24,6 +25,7 @@ def get_help(self, ctx):

     return CommandWithReplacedHelp

+
 # pylint: enable=wrong-import-position

 from ..decorators import ocrd_loglevel

From 3458159351964df2ed2ae75f05b354b5ff1c03d2 Mon Sep 17 00:00:00 2001
From: Robert Sachunsky
Date: Wed, 9 Jul 2025 13:25:23 +0200
Subject: [PATCH 2/3] more flake8 appeasements

---
 src/ocrd/cli/__init__.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/ocrd/cli/__init__.py b/src/ocrd/cli/__init__.py
index 8f6cd04ac..8f02b01e0 100644
--- a/src/ocrd/cli/__init__.py
+++ b/src/ocrd/cli/__init__.py
@@ -104,14 +104,16 @@ def get_help(self, ctx):
 {config.describe('OCRD_LOGGING_DEBUG')}
 """

+
 @click.group(epilog=_epilog)
 @click.version_option(package_name='ocrd')
 @ocrd_loglevel
-def cli(**kwargs): # pylint: disable=unused-argument
+def cli(**kwargs):  # pylint: disable=unused-argument
     """
     Entry-point of multi-purpose CLI for OCR-D
     """

+
 cli.add_command(ocrd_tool_cli)
 cli.add_command(workspace_cli)
 cli.add_command(process_cli)

From f63c4ce56bb2622c1be108334d1cf9f15328f511 Mon Sep 17 00:00:00 2001
From: Robert Sachunsky
Date: Wed, 9 Jul 2025 23:13:48 +0200
Subject: [PATCH 3/3] more concessions to flake8

---
 .github/workflows/unit-test.yml | 2 +-
 src/ocrd/cli/bashlib.py | 9 +-
 src/ocrd/cli/log.py | 9 +-
 src/ocrd/cli/ocrd_tool.py | 30 ++-
 src/ocrd/cli/process.py | 1 +
 src/ocrd/cli/resmgr.py | 1 -
 src/ocrd/cli/validate.py | 45 +++--
 src/ocrd/cli/workspace.py | 177 +++++++++++++-----
 src/ocrd/cli/zip.py | 17 +-
 src/ocrd/decorators/__init__.py | 16 +-
 src/ocrd/decorators/loglevel_option.py | 4 +
 src/ocrd/decorators/mets_find_options.py | 3 +-
 src/ocrd/decorators/ocrd_cli_options.py | 2 +-
 src/ocrd/decorators/parameter_option.py | 23 +--
 src/ocrd/mets_server.py | 16 +-
 src/ocrd/processor/base.py | 139 ++++++++------
 src/ocrd/processor/builtin/dummy_processor.py | 11 +-
 src/ocrd/processor/builtin/filter_processor.py | 5 +-
 src/ocrd/processor/helpers.py | 8 +-
 src/ocrd/processor/ocrd_page_result.py | 12 +-
 src/ocrd/resolver.py | 74 ++++----
 src/ocrd/task_sequence.py | 15 +-
 src/ocrd/workspace.py | 118 ++++++------
 src/ocrd/workspace_backup.py | 3 +
 src/ocrd/workspace_bagger.py | 23 ++-
 src/ocrd_modelfactory/__init__.py | 6 +-
 src/ocrd_models/constants.py | 19 +-
 src/ocrd_models/ocrd_agent.py | 2 +-
 src/ocrd_models/ocrd_exif.py | 10 +-
 src/ocrd_models/ocrd_file.py | 43 +++--
 src/ocrd_models/ocrd_mets.py | 157 +++++++++-------
 src/ocrd_models/ocrd_page.py | 30 +--
 src/ocrd_models/ocrd_xml_base.py | 1 +
 src/ocrd_models/report.py | 3 +-
 src/ocrd_models/utils.py | 7 +-
 src/ocrd_models/xpath_functions.py | 4 +-
 src/ocrd_network/cli/client.py | 5 +-
 src/ocrd_network/client.py | 4 +-
 src/ocrd_network/client_utils.py | 4 +-
 src/ocrd_network/constants.py | 1 +
 src/ocrd_network/database.py | 6 +-
 src/ocrd_network/processing_server.py | 7 +-
 src/ocrd_network/processing_worker.py | 10 +-
 src/ocrd_network/processor_server.py | 4 +-
 src/ocrd_network/rabbitmq_utils/helpers.py | 2 +-
 src/ocrd_network/runtime_data/deployer.py | 15 +-
 src/ocrd_network/runtime_data/hosts.py | 16 +-
 src/ocrd_network/runtime_data/network_agents.py | 4 +-
 src/ocrd_network/runtime_data/network_services.py | 2 +-
 src/ocrd_network/server_cache.py | 2 +-
 src/ocrd_network/server_utils.py | 18 +-
 src/ocrd_network/utils.py | 1 +
 src/ocrd_page_user_methods.py | 20 +-
 src/ocrd_utils/__init__.py | 8 +-
 src/ocrd_utils/config.py | 163 ++++++++--------
 src/ocrd_utils/deprecate.py | 3 +
 src/ocrd_utils/image.py | 74 +++++---
 src/ocrd_utils/introspect.py | 11 +-
 src/ocrd_utils/logging.py | 19 +-
 src/ocrd_utils/os.py | 19 +-
 src/ocrd_utils/str.py | 48 +++--
 src/ocrd_validators/json_validator.py | 5 +-
 src/ocrd_validators/ocrd_tool_validator.py | 3 +-
 src/ocrd_validators/ocrd_zip_validator.py | 9 +-
 src/ocrd_validators/page_validator.py | 30 ++-
 src/ocrd_validators/parameter_validator.py | 5 +-
 src/ocrd_validators/resource_list_validator.py | 4 +-
 src/ocrd_validators/workspace_validator.py | 50 +++--
 src/ocrd_validators/xsd_mets_validator.py | 3 +-
 src/ocrd_validators/xsd_page_validator.py | 3 +-
 src/ocrd_validators/xsd_validator.py | 6 +-
 71 files changed, 1022 insertions(+), 607 deletions(-)

diff --git a/.github/workflows/unit-test.yml b/.github/workflows/unit-test.yml
index 2d127023a..ae5570cea 100644
--- a/.github/workflows/unit-test.yml
+++ b/.github/workflows/unit-test.yml
@@ -63,4 +63,4 @@ jobs:
         # stop the build if there are Python syntax errors or undefined names
         flake8 src --count --exit-zero --select=E9,F63,F7,F82 --show-source --statistics --format=github
         # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
-        flake8 src --count --exit-zero --per-file-ignores=src/ocrd/cli/__init__.py:E402 --extend-exclude=src/ocrd_models/ocrd_page_generateds.py,src/ocrd_page_user_methods/*.py --max-complexity=10 --max-line-length=127 --statistics --format=github
+        flake8 src --count --exit-zero --per-file-ignores=src/ocrd/cli/__init__.py:E402,src/ocrd/__init__.py:F401,src/ocrd/__init__.py:F403,src/ocrd/decorators/__init__.py:F401,src/ocrd/processor/__init__.py:F401,src/ocrd_modelfactory/__init__.py:F401,src/ocrd_models/__init__.py:F401,src/ocrd_network/__init__.py:F401,src/ocrd_utils/__init__.py:F401,src/ocrd_validators/__init__.py:F401,src/ocrd_models/constants.py:E221 --extend-exclude=src/ocrd_models/ocrd_page_generateds.py,src/ocrd_page_user_methods/*.py --max-complexity=10 --max-line-length=127 --statistics --format=github

diff --git a/src/ocrd/cli/bashlib.py b/src/ocrd/cli/bashlib.py
index b6817abe9..75edd8a0e 100644
--- a/src/ocrd/cli/bashlib.py
+++ b/src/ocrd/cli/bashlib.py
@@ -29,6 +29,7 @@
 # ocrd bashlib
 # ----------------------------------------------------------------------

+
 @click.group('bashlib')
 def bashlib_cli():
     """
@@ -39,6 +40,7 @@ def bashlib_cli():
 # ocrd bashlib filename
 # ----------------------------------------------------------------------

+
 @bashlib_cli.command('filename')
 def bashlib_filename():
     """
@@ -48,6 +50,7 @@ def bashlib_filename():
     """
     print(BASHLIB_FILENAME)

+
 @bashlib_cli.command('constants')
 @click.argument('name')
 def bashlib_constants(name):
@@ -72,6 +75,7 @@ def bashlib_constants(name):
     else:
         print(val)

+
 @bashlib_cli.command('input-files')
 @click.option('--ocrd-tool', help="path to ocrd-tool.json of processor to feed", default=None)
 @click.option('--executable', help="name of processor executable in ocrd-tool.json", default=None)
@@ -122,6 +126,7 @@ class FullBashlibProcessor(BashlibProcessor):
         def metadata_location(self):
             # needed for metadata loading and validation mechanism
             return ocrd_tool

+
         @property
         def executable(self):
             # needed for ocrd_tool lookup
@@ -137,8 +142,8 @@ def ocrd_tool(self):
                 # required now
                 'input_file_grp_cardinality': 1,
                 'output_file_grp_cardinality': 1,
-                'steps': ['']
-            }
+                'steps': ['']}
+
         @property
         def version(self):
             # needed to satisfy the validator and wrapper

diff --git a/src/ocrd/cli/log.py b/src/ocrd/cli/log.py
index 560254f79..68eca4463 100644
--- a/src/ocrd/cli/log.py
+++ b/src/ocrd/cli/log.py
@@ -7,7 +7,7 @@
 """
 import click
 from ocrd_utils import initLogging, getLogger, getLevelName
-import logging
+

 class LogCtx():
@@ -18,10 +18,13 @@ def log(self, lvl, *args, **kwargs):
         logger = getLogger(self.name)
         logger.log(getLevelName(lvl), *args, **kwargs)

+
 pass_log = click.make_pass_decorator(LogCtx)

+
 @click.group("log")
-@click.option('-n', '--name', envvar='OCRD_TOOL_NAME', default='log_cli', metavar='LOGGER_NAME', help='Name of the logger', show_default=True)
+@click.option('-n', '--name', envvar='OCRD_TOOL_NAME', default='log_cli', metavar='LOGGER_NAME',
+              help='Name of the logger', show_default=True)
 @click.pass_context
 def log_cli(ctx, name, *args, **kwargs):
     """
@@ -33,6 +36,7 @@ def log_cli(ctx, name, *args, **kwargs):
     initLogging()
     ctx.obj = LogCtx('ocrd.' + name)

+
 def _bind_log_command(lvl):
     @click.argument('msgs', nargs=-1)
     @pass_log
@@ -47,5 +51,6 @@ def _log_wrapper(ctx, msgs):
             ctx.log(lvl.upper(), msg)
     return _log_wrapper

+
 for _lvl in ['trace', 'debug', 'info', 'warning', 'error', 'critical']:
     log_cli.command(_lvl, help="Log a %s message" % _lvl.upper())(_bind_log_command(_lvl))

diff --git a/src/ocrd/cli/ocrd_tool.py b/src/ocrd/cli/ocrd_tool.py
index 09c25f914..b4310e5f9 100644
--- a/src/ocrd/cli/ocrd_tool.py
+++ b/src/ocrd/cli/ocrd_tool.py
@@ -23,6 +23,7 @@
 )
 from ocrd_validators import ParameterValidator, OcrdToolValidator

+
 class OcrdToolCtx():

     def __init__(self, filename):
@@ -36,25 +37,31 @@ def __init__(self, filename):

         class BashProcessor(Processor):
             @property
-            def metadata(inner_self): # pylint: disable=no-self-argument,arguments-renamed
+            def metadata(inner_self):  # pylint: disable=no-self-argument,arguments-renamed
                 return self.json
+
             @property
-            def executable(inner_self): # pylint: disable=no-self-argument,arguments-renamed
+            def executable(inner_self):  # pylint: disable=no-self-argument,arguments-renamed
                 return self.tool_name
+
             @property
-            def moduledir(inner_self): # pylint: disable=no-self-argument,arguments-renamed
+            def moduledir(inner_self):  # pylint: disable=no-self-argument,arguments-renamed
                 return os.path.dirname(self.filename)
+
             # set docstrings to empty
             __doc__ = None
             # HACK: override the module-level docstring, too
             getmodule(OcrdToolCtx).__doc__ = None
-            def process(inner_self): # pylint: disable=no-self-argument,arguments-renamed
+
+            def process(inner_self):  # pylint: disable=no-self-argument,arguments-renamed
                 return super()

         self.processor = BashProcessor

+
 pass_ocrd_tool = click.make_pass_decorator(OcrdToolCtx)

+
 # ----------------------------------------------------------------------
 # ocrd ocrd-tool
 # ----------------------------------------------------------------------
@@ -65,6 +72,7 @@
 def ocrd_tool_cli(ctx, json_file):
     ctx.obj = OcrdToolCtx(json_file)

+
 # ----------------------------------------------------------------------
 # ocrd ocrd-tool version
 # ----------------------------------------------------------------------
@@ -74,6 +82,7 @@ def ocrd_tool_cli(ctx, json_file):
 def ocrd_tool_version(ctx):
     print(ctx.json['version'])

+
 # ----------------------------------------------------------------------
 # ocrd ocrd-tool validate
 # ----------------------------------------------------------------------
@@ -86,6 +95,7 @@ def ocrd_tool_validate(ctx):
     if not report.is_valid:
         return 128

+
 # ----------------------------------------------------------------------
 # ocrd ocrd-tool list-tools
 # ----------------------------------------------------------------------
@@ -96,6 +106,7 @@ def ocrd_tool_list(ctx):
     for tool in ctx.json['tools']:
         print(tool)

+
 # ----------------------------------------------------------------------
 # ocrd ocrd-tool dump-tools
 # ----------------------------------------------------------------------
@@ -105,6 +116,7 @@ def ocrd_tool_list(ctx):
 def ocrd_tool_dump(ctx):
     print(dumps(ctx.json['tools'], indent=True))

+
 @ocrd_tool_cli.command('dump-module-dirs', help="Dump module directory of each tool")
 @pass_ocrd_tool
 def ocrd_tool_dump_module_dirs(ctx):
                  for tool_name in ctx.json['tools']},
                 indent=True))

+
 # ----------------------------------------------------------------------
 # ocrd ocrd-tool tool
 # ----------------------------------------------------------------------
@@ -124,6 +137,7 @@ def ocrd_tool_tool(ctx, tool_name):
         raise Exception("No such tool: %s" % tool_name)
     ctx.tool_name = tool_name

+
 # ----------------------------------------------------------------------
 # ocrd ocrd-tool tool description
 # ----------------------------------------------------------------------
@@ -133,29 +147,34 @@ def ocrd_tool_tool(ctx, tool_name):
 def ocrd_tool_tool_description(ctx):
     print(ctx.json['tools'][ctx.tool_name]['description'])

+
 @ocrd_tool_tool.command('list-resources', help="List tool's file resources")
 @pass_ocrd_tool
 def ocrd_tool_tool_list_resources(ctx):
     ctx.processor(None).list_resources()

+
 @ocrd_tool_tool.command('resolve-resource', help="Get a tool's file resource full path name")
 @click.argument('res_name')
 @pass_ocrd_tool
 def ocrd_tool_tool_resolve_resource(ctx, res_name):
     print(ctx.processor(None).resolve_resource(res_name))

+
 @ocrd_tool_tool.command('show-resource', help="Dump a tool's file resource")
 @click.argument('res_name')
 @pass_ocrd_tool
 def ocrd_tool_tool_show_resource(ctx, res_name):
     ctx.processor(None).show_resource(res_name)

+
 @ocrd_tool_tool.command('help', help="Generate help for processors")
 @click.argument('subcommand', required=False)
 @pass_ocrd_tool
 def ocrd_tool_tool_params_help(ctx, subcommand):
     ctx.processor(None).show_help(subcommand=subcommand)

+
 # ----------------------------------------------------------------------
 # ocrd ocrd-tool tool categories
 # ----------------------------------------------------------------------
@@ -165,6 +184,7 @@ def ocrd_tool_tool_params_help(ctx, subcommand):
 def ocrd_tool_tool_categories(ctx):
     print('\n'.join(ctx.json['tools'][ctx.tool_name]['categories']))

+
 # ----------------------------------------------------------------------
 # ocrd ocrd-tool tool steps
 # ----------------------------------------------------------------------
@@ -174,6 +194,7 @@ def ocrd_tool_tool_categories(ctx):
 def ocrd_tool_tool_steps(ctx):
     print('\n'.join(ctx.json['tools'][ctx.tool_name]['steps']))

+
 # ----------------------------------------------------------------------
 # ocrd ocrd-tool tool dump
 # ----------------------------------------------------------------------
@@ -183,6 +204,7 @@ def ocrd_tool_tool_steps(ctx):
 def ocrd_tool_tool_dump(ctx):
     print(dumps(ctx.json['tools'][ctx.tool_name], indent=True))

+
 # ----------------------------------------------------------------------
 # ocrd ocrd-tool tool parse-params
 # ----------------------------------------------------------------------

diff --git a/src/ocrd/cli/process.py b/src/ocrd/cli/process.py
index 9dcd56264..4674a78f5 100644
--- a/src/ocrd/cli/process.py
+++ b/src/ocrd/cli/process.py
@@ -13,6 +13,7 @@

 from ..decorators import ocrd_loglevel

+
 # ----------------------------------------------------------------------
 # ocrd process
 # ----------------------------------------------------------------------

diff --git a/src/ocrd/cli/resmgr.py b/src/ocrd/cli/resmgr.py
index 01b42b5e1..34f6b9191 100644
--- a/src/ocrd/cli/resmgr.py
+++ b/src/ocrd/cli/resmgr.py
@@ -18,7 +18,6 @@
     getLogger,
     get_moduledir,
     get_ocrd_tool_json,
-    resource_filename,
     initLogging,
     RESOURCE_LOCATIONS,
 )

diff --git a/src/ocrd/cli/validate.py b/src/ocrd/cli/validate.py
index a1ec8fafd..ab07096f3 100644
--- a/src/ocrd/cli/validate.py
+++ b/src/ocrd/cli/validate.py
@@ -17,12 +17,13 @@
 from ocrd_utils import initLogging, parse_json_string_or_file, DEFAULT_METS_BASENAME
 from ocrd_validators import (
     OcrdToolValidator,
-    OcrdZipValidator,
+    # OcrdZipValidator,
     PageValidator,
     ParameterValidator,
-    WorkspaceValidator,
+    # WorkspaceValidator,
 )

+
 def _inform_of_result(report):
     if not report.is_valid:
         print(report.to_xml())
@@ -36,6 +37,7 @@ def validate_cli():
     """
     initLogging()

+
 @validate_cli.command('tool-json')
 @click.argument('ocrd_tool', required=False, nargs=1)
 def validate_ocrd_tool(ocrd_tool):
@@ -48,6 +50,7 @@ def validate_ocrd_tool(ocrd_tool):
             ocrd_tool = loads(f.read())
     _inform_of_result(OcrdToolValidator.validate(ocrd_tool))

+
 @validate_cli.command('parameters')
 @click.argument('ocrd_tool')
 @click.argument('executable')
@@ -60,24 +63,31 @@ def validate_parameters(ocrd_tool, executable, param_json):
         ocrd_tool = loads(f.read())
     _inform_of_result(ParameterValidator(ocrd_tool['tools'][executable]).validate(parse_json_string_or_file(param_json)))

+
 @validate_cli.command('page')
 @click.argument('page', required=True, nargs=1)
-@click.option('--page-textequiv-consistency', help="How strict to check PAGE multi-level textequiv consistency", type=click.Choice(['strict', 'lax', 'fix', 'off']), default='strict')
-@click.option('--page-textequiv-strategy', help="Strategy to determine the correct textequiv", type=click.Choice(['first']), default='first')
+@click.option('--page-textequiv-consistency', help="How strict to check PAGE multi-level textequiv consistency",
+              type=click.Choice(['strict', 'lax', 'fix', 'off']), default='strict')
+@click.option('--page-textequiv-strategy', help="Strategy to determine the correct textequiv",
+              type=click.Choice(['first']), default='first')
 @click.option('--check-baseline', help="Whether Baseline must be fully within TextLine/Coords", is_flag=True, default=False)
-@click.option('--check-coords', help="Whether *Region/TextLine/Word/Glyph must each be fully contained within Border/*Region/TextLine/Word, resp.", is_flag=True, default=False)
+@click.option('--check-coords', help="Whether *Region/TextLine/Word/Glyph must each be fully contained "
+              "within Border/*Region/TextLine/Word, resp.", is_flag=True, default=False)
 def validate_page(page, **kwargs):
     '''
     Validate PAGE against OCR-D conventions
     '''
     _inform_of_result(PageValidator.validate(filename=page, **kwargs))

+
 # @validate_cli.command('zip')
 # @click.argument('src', type=click.Path(dir_okay=True, readable=True, resolve_path=True), required=True)
 # @click.option('-Z', '--skip-unzip', help="Treat SRC as a directory not a ZIP", is_flag=True, default=False)
 # @click.option('-B', '--skip-bag', help="Whether to skip all checks of manifests and files", is_flag=True, default=False)
-# @click.option('-C', '--skip-checksums', help="Whether to omit checksum checks but still check basic BagIt conformance", is_flag=True, default=False)
-# @click.option('-D', '--skip-delete', help="Whether to skip deleting the unpacked OCRD-ZIP dir after valdiation", is_flag=True, default=False)
+# @click.option('-C', '--skip-checksums', help="Whether to omit checksum checks but still check basic BagIt conformance",
+#               is_flag=True, default=False)
+# @click.option('-D', '--skip-delete', help="Whether to skip deleting the unpacked OCRD-ZIP dir after validation",
+#               is_flag=True, default=False)
 # @click.option('-j', '--processes', help="Number of parallel processes", type=int, default=1)
 # def validate(src, **kwargs):
 #     """
@@ -87,11 +97,16 @@ def validate_page(page, **kwargs):
 #     """
 #     _inform_of_result(OcrdZipValidator(Resolver(), src).validate(**kwargs))

+
 # @validate_cli.command('workspace')
 # @click.option('-a', '--download', is_flag=True, help="Download all files")
help="Download all files") -# @click.option('-s', '--skip', help="Tests to skip", default=[], multiple=True, type=click.Choice(['imagefilename', 'dimension', 'mets_unique_identifier', 'mets_file_group_names', 'mets_files', 'pixel_density', 'page', 'url'])) -# @click.option('--page-textequiv-consistency', '--page-strictness', help="How strict to check PAGE multi-level textequiv consistency", type=click.Choice(['strict', 'lax', 'fix', 'off']), default='strict') -# @click.option('--page-coordinate-consistency', help="How fierce to check PAGE multi-level coordinate consistency", type=click.Choice(['poly', 'baseline', 'both', 'off']), default='poly') +# @click.option('-s', '--skip', help="Tests to skip", default=[], multiple=True, type=click.Choice( +# ['imagefilename', 'dimension', 'mets_unique_identifier', 'mets_file_group_names', 'mets_files', +# 'pixel_density', 'page', 'url'])) +# @click.option('--page-textequiv-consistency', '--page-strictness', type=click.Choice(['strict', 'lax', 'fix', 'off']), +# help="How strict to check PAGE multi-level textequiv consistency", default='strict') +# @click.option('--page-coordinate-consistency', help="How fierce to check PAGE multi-level coordinate consistency", +# type=click.Choice(['poly', 'baseline', 'both', 'off']), default='poly') # @click.argument('mets_url') # def validate_workspace(mets_url, **kwargs): # ''' @@ -99,11 +114,15 @@ def validate_page(page, **kwargs): # ''' # _inform_of_result(WorkspaceValidator.validate(Resolver(), mets_url, **kwargs)) + @validate_cli.command('tasks') -@click.option('--workspace', nargs=1, required=False, help='Workspace directory these tasks are to be run. If omitted, only validate syntax') -@click.option('-M', '--mets-basename', nargs=1, default=DEFAULT_METS_BASENAME, help='Basename of the METS file, used in conjunction with --workspace') +@click.option('--workspace', nargs=1, required=False, help='Workspace directory these tasks are to be run. 
+                                                           'If omitted, only validate syntax')
+@click.option('-M', '--mets-basename', nargs=1, default=DEFAULT_METS_BASENAME,
+              help='Basename of the METS file, used in conjunction with --workspace')
 @click.option('-U', '--mets-server-url', help='TCP host URI or UDS path of METS server')
+@click.option('--overwrite', is_flag=True, default=False,
+              help='When checking against a concrete workspace, simulate overwriting output or page range.')
 @click.option('-g', '--page-id', help="ID(s) of the pages to process")
 @click.argument('tasks', nargs=-1, required=True)
 def validate_process(tasks, workspace, mets_basename, mets_server_url, overwrite, page_id):

diff --git a/src/ocrd/cli/workspace.py b/src/ocrd/cli/workspace.py
index c8aea0871..fc9a00afa 100644
--- a/src/ocrd/cli/workspace.py
+++ b/src/ocrd/cli/workspace.py
@@ -5,7 +5,6 @@
 :prog: ocrd workspace
 :nested: full
 """
-import os
 from os import rmdir, unlink
 from os.path import dirname, relpath, normpath, exists, join, isabs, isdir
 from pathlib import Path
@@ -19,7 +18,16 @@
 from ocrd import Resolver, Workspace, WorkspaceValidator, WorkspaceBackupManager
 from ocrd.mets_server import OcrdMetsServer
-from ocrd_utils import getLogger, initLogging, pushd_popd, EXT_TO_MIME, safe_filename, parse_json_string_or_file, partition_list, DEFAULT_METS_BASENAME
+from ocrd_utils import (
+    getLogger,
+    initLogging,
+    pushd_popd,
+    EXT_TO_MIME,
+    safe_filename,
+    parse_json_string_or_file,
+    partition_list,
+    DEFAULT_METS_BASENAME,
+)
 from ocrd.decorators import mets_find_options
 from . import command_with_replaced_help
 from ocrd_models.constants import METS_PAGE_DIV_ATTRIBUTE
@@ -32,8 +40,8 @@ def __init__(self, directory, mets_url, mets_basename=DEFAULT_METS_BASENAME, met
         if mets_basename:
             self.log.warning(DeprecationWarning('--mets-basename is deprecated. Use --mets/--directory instead.'))
         self.resolver = Resolver()
-        self.directory, self.mets_url, self.mets_basename, self.mets_server_url \
-            = self.resolver.resolve_mets_arguments(directory, mets_url, mets_basename, mets_server_url)
+        self.directory, self.mets_url, self.mets_basename, self.mets_server_url = \
+            self.resolver.resolve_mets_arguments(directory, mets_url, mets_basename, mets_server_url)
         self.automatic_backup = automatic_backup

     def workspace(self):
@@ -44,20 +52,24 @@ def workspace(self):
             automatic_backup=self.automatic_backup,
             mets_server_url=self.mets_server_url,
         )

+
     def backup_manager(self):
         return WorkspaceBackupManager(self.workspace())


 pass_workspace = click.make_pass_decorator(WorkspaceCtx)

+
 # ----------------------------------------------------------------------
 # ocrd workspace
 # ----------------------------------------------------------------------

 @click.group("workspace")
-@click.option('-d', '--directory', envvar='WORKSPACE_DIR', type=click.Path(file_okay=False), metavar='WORKSPACE_DIR', help='Changes the workspace folder location [default: METS_URL directory or .]"')
+@click.option('-d', '--directory', envvar='WORKSPACE_DIR', type=click.Path(file_okay=False), metavar='WORKSPACE_DIR',
+              help='Changes the workspace folder location [default: METS_URL directory or .]"')
 @click.option('-M', '--mets-basename', default=None, help='METS file basename. Deprecated, use --mets/--directory')
-@click.option('-m', '--mets', default=None, help='The path/URL of the METS file [default: WORKSPACE_DIR/mets.xml]', metavar="METS_URL")
+@click.option('-m', '--mets', default=None, metavar="METS_URL",
+              help='The path/URL of the METS file [default: WORKSPACE_DIR/mets.xml]')
 @click.option('-U', '--mets-server-url', 'mets_server_url', help="TCP host URI or UDS path of METS server")
 @click.option('--backup', default=False, help="Backup mets.xml whenever it is saved.", is_flag=True)
 @click.pass_context
 def workspace_cli(ctx, directory, mets, mets_basename, mets_server_url, backup):
     """
     A workspace comprises a METS file and a directory as point of reference.

-    Operates on the file system directly or via a METS server
+    Operates on the file system directly or via a METS server
+    (already running via some prior `server start` subcommand).
     """
     initLogging()
@@ -79,6 +91,7 @@ def workspace_cli(ctx, directory, mets, mets_basename, mets_server_url, backup):
         automatic_backup=backup
     )

+
 # ----------------------------------------------------------------------
 # ocrd workspace validate
 # ----------------------------------------------------------------------
@@ -88,10 +101,12 @@ def workspace_cli(ctx, directory, mets, mets_basename, mets_server_url, backup):
 @pass_workspace
 @click.option('-a', '--download', is_flag=True, help="Download all files")
 @click.option('-s', '--skip', help="Tests to skip", default=[], multiple=True, type=click.Choice(
-    ['imagefilename', 'alternativeimage_filename', 'alternativeimage_comments', 'dimension', 'pixel_density', 'page', 'page_xsd',
-     'url', 'mets_fileid_page_pcgtsid', 'mets_unique_identifier', 'mets_files', 'mets_xsd']))
-@click.option('--page-textequiv-consistency', '--page-strictness', help="How strict to check PAGE multi-level textequiv consistency", type=click.Choice(['strict', 'lax', 'fix', 'off']), default='strict')
-@click.option('--page-coordinate-consistency', help="How fierce to check PAGE multi-level coordinate consistency", type=click.Choice(['poly', 'baseline', 'both', 'off']), default='poly')
+    ['imagefilename', 'alternativeimage_filename', 'alternativeimage_comments', 'dimension', 'pixel_density',
+     'page', 'page_xsd', 'url', 'mets_fileid_page_pcgtsid', 'mets_unique_identifier', 'mets_files', 'mets_xsd']))
+@click.option('--page-textequiv-consistency', '--page-strictness', type=click.Choice(['strict', 'lax', 'fix', 'off']),
+              default='strict', help="How strict to check PAGE multi-level textequiv consistency")
+@click.option('--page-coordinate-consistency', help="How fierce to check PAGE multi-level coordinate consistency",
+              type=click.Choice(['poly', 'baseline', 'both', 'off']), default='poly')
 @click.argument('mets_url', default=None, required=False)
 def workspace_validate(ctx, mets_url, download, skip, page_textequiv_consistency, page_coordinate_consistency):
     """
@@ -105,7 +120,8 @@ def workspace_validate(ctx, mets_url, download, skip, page_textequiv_consistency
     """
     LOG = getLogger('ocrd.cli.workspace.validate')
     if mets_url:
-        LOG.warning(DeprecationWarning("Use 'ocrd workspace --mets METS init' instead of argument 'METS_URL' ('%s')" % mets_url))
+        LOG.warning(DeprecationWarning("Use 'ocrd workspace --mets METS init' instead of "
+                                       "argument 'METS_URL' ('%s')" % mets_url))
     else:
         mets_url = ctx.mets_url
     report = WorkspaceValidator.validate(
@@ -121,6 +137,7 @@ def workspace_validate(ctx, mets_url, download, skip, page_textequiv_consistency
     if not report.is_valid:
         sys.exit(128)

+
 # ----------------------------------------------------------------------
 # ocrd workspace clone
 # ----------------------------------------------------------------------

 @workspace_cli.command('clone', cls=command_with_replaced_help(
     (r' \[WORKSPACE_DIR\]', '')))  # XXX deprecated argument
 @click.option('-f', '--clobber-mets', help="Overwrite existing METS file", default=False, is_flag=True)
-@click.option('-a', '--download', is_flag=True, help="Download all selected files and add local path references in METS file afterwards")
+@click.option('-a', '--download', is_flag=True, help="Download all selected files and add local FLocat "
+              "path references in METS file afterwards")
 @click.argument('mets_url')
 @mets_find_options
 # XXX deprecated
 @click.argument('workspace_dir', default=None, required=False)
 @pass_workspace
-def workspace_clone(ctx, clobber_mets, download, file_grp, file_id, page_id, mimetype, include_fileGrp, exclude_fileGrp, mets_url, workspace_dir):
+def workspace_clone(ctx, clobber_mets, download, file_grp, file_id, page_id, mimetype,
+                    include_fileGrp, exclude_fileGrp, mets_url, workspace_dir):
     """
     Create a workspace from METS_URL and return the directory

@@ -146,7 +165,8 @@ def workspace_clone(ctx, clobber_mets, download, file_grp, file_id, page_id, mim
     """
     LOG = getLogger('ocrd.cli.workspace.clone')
     if workspace_dir:
-        LOG.warning(DeprecationWarning("Use 'ocrd workspace --directory DIR clone' instead of argument 'WORKSPACE_DIR' ('%s')" % workspace_dir))
+        LOG.warning(DeprecationWarning("Use 'ocrd workspace --directory DIR clone' instead of "
+                                       "argument 'WORKSPACE_DIR' ('%s')" % workspace_dir))
         ctx.directory = workspace_dir

     assert not ctx.mets_server_url, \
@@ -167,6 +187,7 @@ def workspace_clone(ctx, clobber_mets, download, file_grp, file_id, page_id, mim
     workspace.save_mets()
     print(workspace.directory)

+
 # ----------------------------------------------------------------------
 # ocrd workspace init
 # ----------------------------------------------------------------------
@@ -184,7 +205,8 @@ def workspace_init(ctx, clobber_mets, directory):
     """
     LOG = getLogger('ocrd.cli.workspace.init')
     if directory:
-        LOG.warning(DeprecationWarning("Use 'ocrd workspace --directory DIR init' instead of argument 'DIRECTORY' ('%s')" % directory))
+        LOG.warning(DeprecationWarning("Use 'ocrd workspace --directory DIR init' instead of "
+                                       "argument 'DIRECTORY' ('%s')" % directory))
         ctx.directory = directory
     assert not ctx.mets_server_url, \
         f"init cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}"
@@ -196,6 +218,7 @@ def workspace_init(ctx, clobber_mets, directory):
     workspace.save_mets()
     print(workspace.directory)

+
 # ----------------------------------------------------------------------
 # ocrd workspace add
 # ----------------------------------------------------------------------

 @workspace_cli.command('add')
 @click.option('-G', '--file-grp', help="fileGrp USE", required=True, metavar='FILE_GRP')
 @click.option('-i', '--file-id', help="ID for the file", required=True, metavar='FILE_ID')
-@click.option('-m', '--mimetype', help="Media type of the file. Guessed from extension if not provided", required=False, metavar='TYPE')
+@click.option('-m', '--mimetype', help="Media type of the file. Guessed from extension if not provided",
+              required=False, metavar='TYPE')
 @click.option('-g', '--page-id', help="ID of the physical page", metavar='PAGE_ID')
 @click.option('-C', '--check-file-exists', help="Whether to ensure FNAME exists", is_flag=True, default=False)
 @click.option('--ignore', help="Do not check whether file exists.", default=False, is_flag=True)
-@click.option('--force', help="If file with ID already exists, replace it. No effect if --ignore is set.", default=False, is_flag=True)
+@click.option('--force', help="If file with ID already exists, replace it. No effect if --ignore is set.",
+              default=False, is_flag=True)
 @click.argument('fname', required=True)
 @pass_workspace
 def workspace_add_file(ctx, file_grp, file_id, mimetype, page_id, ignore, check_file_exists, force, fname):
@@ -223,7 +248,8 @@ def workspace_add_file(ctx, file_grp, file_id, mimetype, page_id, ignore, check_
             mimetype = EXT_TO_MIME[Path(fname).suffix]
             log.info("Guessed mimetype to be %s" % mimetype)
         except KeyError:
-            log.error("Cannot guess mimetype from extension '%s' for '%s'. Set --mimetype explicitly" % (Path(fname).suffix, fname))
+            log.error("Cannot guess mimetype from extension '%s' for '%s'. "
+                      "Set --mimetype explicitly" % (Path(fname).suffix, fname))

     log.debug("Adding '%s'", fname)
     local_filename = None
@@ -260,27 +286,34 @@ def workspace_add_file(ctx, file_grp, file_id, mimetype, page_id, ignore, check_
     workspace.add_file(file_grp, **kwargs)
     workspace.save_mets()

+
 # ----------------------------------------------------------------------
 # ocrd workspace bulk-add
 # ----------------------------------------------------------------------

 # pylint: disable=broad-except
 @workspace_cli.command('bulk-add')
-@click.option('-r', '--regex', help="Regular expression matching the FILE_GLOB filesystem paths to define named captures usable in the other parameters", required=True)
+@click.option('-r', '--regex', help="Regular expression matching the FILE_GLOB filesystem paths "
+              "to define named captures usable in the other parameters", required=True)
 @click.option('-m', '--mimetype', help="Media type of the file. If not provided, guess from filename", required=False)
 @click.option('-g', '--page-id', help="physical page ID of the file", required=False)
 @click.option('-i', '--file-id', help="ID of the file. If not provided, derive from fileGrp and filename", required=False)
 @click.option('-u', '--url', help="Remote URL of the file", required=False)
-@click.option('-l', '--local-filename', help="Local filesystem path in the workspace directory (copied from source file if different)", required=False)
+@click.option('-l', '--local-filename', help="Local filesystem path in the workspace directory "
+              "(copied from source file if different)", required=False)
 @click.option('-G', '--file-grp', help="File group USE of the file", required=True)
-@click.option('-n', '--dry-run', help="Don't actually do anything to the METS or filesystem, just preview", default=False, is_flag=True)
-@click.option('-S', '--source-path', 'src_path_option', help="File path to copy from (if different from FILE_GLOB values)", required=False)
+@click.option('-n', '--dry-run', help="Don't actually do anything to the METS or filesystem, just preview",
+              default=False, is_flag=True)
+@click.option('-S', '--source-path', 'src_path_option', help="File path to copy from (if different from FILE_GLOB values)",
+              required=False)
 @click.option('-I', '--ignore', help="Disable checking for existing file entries (faster)", default=False, is_flag=True)
-@click.option('-f', '--force', help="Replace existing file entries with the same ID (no effect when --ignore is set, too)", default=False, is_flag=True)
+@click.option('-f', '--force', help="Replace existing file entries with the same ID (no effect when --ignore is set, too)",
+              default=False, is_flag=True)
 @click.option('-s', '--skip', help="Skip files not matching --regex (instead of failing)", default=False, is_flag=True)
 @click.argument('file_glob', nargs=-1, required=True)
 @pass_workspace
-def workspace_cli_bulk_add(ctx, regex, mimetype, page_id, file_id, url, local_filename, file_grp, dry_run, file_glob, src_path_option, ignore, force, skip):
+def workspace_cli_bulk_add(ctx, regex, mimetype, page_id, file_id, url, local_filename, file_grp, dry_run,
+                           file_glob, src_path_option, ignore, force, skip):
     """
     Add files in bulk to an OCR-D workspace.
@@ -321,7 +354,7 @@ def workspace_cli_bulk_add(ctx, regex, mimetype, page_id, file_id, url, local_fi
             -G '{{ filegrp }}' -g '{{ pageid }}' -i '{{ fileid }}' -S '{{ local_filename }}' -
     """
-    log = getLogger('ocrd.cli.workspace.bulk-add') # pylint: disable=redefined-outer-name
+    log = getLogger('ocrd.cli.workspace.bulk-add')  # pylint: disable=redefined-outer-name
     workspace = ctx.workspace()

     try:
@@ -355,7 +388,12 @@ def workspace_cli_bulk_add(ctx, regex, mimetype, page_id, file_id, url, local_fi
             group_dict = m.groupdict()

             # set up file info
-            file_dict = {'local_filename': local_filename, 'url': url, 'mimetype': mimetype, 'file_id': file_id, 'page_id': page_id, 'file_grp': file_grp}
+            file_dict = {'local_filename': local_filename,
+                         'url': url,
+                         'mimetype': mimetype,
+                         'file_id': file_id,
+                         'page_id': page_id,
+                         'file_grp': file_grp}

             # Flag to track whether 'local_filename' should be 'src'
             local_filename_is_src = False
@@ -394,7 +432,8 @@ def workspace_cli_bulk_add(ctx, regex, mimetype, page_id, file_id, url, local_fi
                 try:
                     file_dict['mimetype'] = EXT_TO_MIME[srcpath.suffix]
                 except KeyError:
-                    log.error("Cannot guess MIME type from extension '%s' for '%s'. Set --mimetype explicitly" % (srcpath.suffix, srcpath))
+                    log.error("Cannot guess MIME type from extension '%s' for '%s'. "
" + "Set --mimetype explicitly" % (srcpath.suffix, srcpath)) # copy files if src != url if local_filename_is_src: @@ -413,7 +452,7 @@ def workspace_cli_bulk_add(ctx, regex, mimetype, page_id, file_id, url, local_fi if dry_run: log.info('workspace.add_file(%s)' % file_dict) else: - workspace.add_file(fileGrp, ignore=ignore, force=force, **file_dict) # pylint: disable=redundant-keyword-arg + workspace.add_file(fileGrp, ignore=ignore, force=force, **file_dict) # pylint: disable=redundant-keyword-arg # save changes to disk workspace.save_mets() @@ -447,7 +486,8 @@ def workspace_cli_bulk_add(ctx, regex, mimetype, page_id, file_id, url, local_fi @click.option('--keep-files', is_flag=True, help="Do not remove downloaded files from the workspace with --undo-download") @click.option('--wait', type=int, default=0, help="Wait this many seconds between download requests") @pass_workspace -def workspace_find(ctx, file_grp, mimetype, page_id, file_id, output_field, include_fileGrp, exclude_fileGrp, download, undo_download, keep_files, wait): +def workspace_find(ctx, file_grp, mimetype, page_id, file_id, output_field, + include_fileGrp, exclude_fileGrp, download, undo_download, keep_files, wait): """ Find files. @@ -467,7 +507,7 @@ def workspace_find(ctx, file_grp, mimetype, page_id, file_id, output_field, incl page_id=page_id, include_fileGrp=include_fileGrp, exclude_fileGrp=exclude_fileGrp, - ): + ): if download and not f.local_filename: workspace.download_file(f) modified_mets = True @@ -492,13 +532,15 @@ def workspace_find(ctx, file_grp, mimetype, page_id, file_id, output_field, incl for fields in ret: print('\t'.join(fields)) + # ---------------------------------------------------------------------- # ocrd workspace remove # ---------------------------------------------------------------------- @workspace_cli.command('remove') @click.option('-k', '--keep-file', help="Do not delete file from file system", default=False, is_flag=True) -@click.option('-f', '--force', help="Continue even if mets:file or file on file system does not exist", default=False, is_flag=True) +@click.option('-f', '--force', help="Continue even if mets:file or file on file system does not exist", + default=False, is_flag=True) @click.argument('ID', nargs=-1) @pass_workspace def workspace_remove_file(ctx, id, force, keep_file): # pylint: disable=redefined-builtin @@ -534,13 +576,16 @@ def rename_group(ctx, old, new): workspace.rename_file_group(old, new) workspace.save_mets() + # ---------------------------------------------------------------------- # ocrd workspace remove-group # ---------------------------------------------------------------------- @workspace_cli.command('remove-group') -@click.option('-r', '--recursive', help="Delete any files in the group before the group itself", default=False, is_flag=True) -@click.option('-f', '--force', help="Continue removing even if group or containing files not found in METS", default=False, is_flag=True) +@click.option('-r', '--recursive', help="Delete any files in the group before the group itself", + default=False, is_flag=True) +@click.option('-f', '--force', help="Continue removing even if group or containing files not found in METS", + default=False, is_flag=True) @click.option('-k', '--keep-files', help="Do not delete files from file system", default=False, is_flag=True) @click.argument('GROUP', nargs=-1) @pass_workspace @@ -558,6 +603,7 @@ def remove_group(ctx, group, recursive, force, keep_files): workspace.remove_file_group(g, recursive=recursive, force=force, 
     workspace.save_mets()

+
 # ----------------------------------------------------------------------
 # ocrd workspace prune-files
 # ----------------------------------------------------------------------
@@ -590,16 +636,19 @@ def prune_files(ctx, file_grp, mimetype, page_id, file_id):
                 workspace.mets.remove_file(f.ID)
             except Exception as e:
                 ctx.log.exception("Error removing %f: %s", f, e)
-                raise(e)
+                raise e
     workspace.save_mets()

+
 # ----------------------------------------------------------------------
 # ocrd workspace clean
 # ----------------------------------------------------------------------

 @workspace_cli.command('clean')
-@click.option('-n', '--dry-run', help="Don't actually do anything to the filesystem, just preview", default=False, is_flag=True)
-@click.option('-d', '--directories', help="Remove untracked directories in addition to untracked files", default=False, is_flag=True)
+@click.option('-n', '--dry-run', help="Don't actually do anything to the filesystem, just preview",
+              default=False, is_flag=True)
+@click.option('-d', '--directories', help="Remove untracked directories in addition to untracked files",
+              default=False, is_flag=True)
 @click.argument('path_glob', nargs=-1, required=False)
 @pass_workspace
 def clean(ctx, dry_run, directories, path_glob):
@@ -646,6 +695,7 @@ def clean(ctx, dry_run, directories, path_glob):
         else:
             rmdir(path)

+
 # ----------------------------------------------------------------------
 # ocrd workspace list-group
 # ----------------------------------------------------------------------
@@ -659,6 +709,7 @@ def list_groups(ctx):
     workspace = ctx.workspace()
     print("\n".join(workspace.mets.file_groups))

+
 # ----------------------------------------------------------------------
 # ocrd workspace list-page
 # ----------------------------------------------------------------------
@@ -669,11 +720,15 @@ def list_groups(ctx):
               show_default=True,
               multiple=True,
               type=click.Choice(METS_PAGE_DIV_ATTRIBUTE.names()))
-@click.option('-f', '--output-format', help="Output format", type=click.Choice(['one-per-line', 'comma-separated', 'json']), default='one-per-line')
-@click.option('-D', '--chunk-number', help="Partition the return value into n roughly equally sized chunks", default=1, type=int)
+@click.option('-f', '--output-format', help="Output format", type=click.Choice(['one-per-line', 'comma-separated', 'json']),
+              default='one-per-line')
+@click.option('-D', '--chunk-number', help="Partition the return value into n roughly equally sized chunks",
+              default=1, type=int)
 @click.option('-C', '--chunk-index', help="Output the nth chunk of results, -1 for all of them.", default=None, type=int)
-@click.option('-r', '--page-id-range', help="Restrict the pages to those matching the provided range, based on the @ID attribute. Separate start/end with ..")
-@click.option('-R', '--numeric-range', help="Restrict the pages to those in the range, in numerical document order. Separate start/end with ..")
+@click.option('-r', '--page-id-range', help="Restrict the pages to those matching the provided range, "
+              "based on the @ID attribute. Separate start/end with ..")
+@click.option('-R', '--numeric-range', help="Restrict the pages to those in the range, in numerical document order. "
" + "Separate start/end with ..") @pass_workspace def list_pages(ctx, output_field, output_format, chunk_number, chunk_index, page_id_range, numeric_range): """ @@ -715,6 +770,7 @@ def list_pages(ctx, output_field, output_format, chunk_number, chunk_index, page lines.append(dumps(chunks)) print('\n'.join(lines)) + # ---------------------------------------------------------------------- # ocrd workspace get-id # ---------------------------------------------------------------------- @@ -730,6 +786,7 @@ def get_id(ctx): if ID: print(ID) + # ---------------------------------------------------------------------- # ocrd workspace set-id # ---------------------------------------------------------------------- @@ -749,8 +806,10 @@ def set_id(ctx, id): # pylint: disable=redefined-builtin workspace.mets.unique_identifier = id workspace.save_mets() + @workspace_cli.command('update-page') -@click.option('--set', 'attr_value_pairs', help=f"set mets:div ATTR to VALUE. possible keys: {METS_PAGE_DIV_ATTRIBUTE.names()}", metavar="ATTR VALUE", nargs=2, multiple=True) +@click.option('--set', 'attr_value_pairs', help="set mets:div ATTR to VALUE", metavar="ATTR VALUE", + type=(click.Choice(METS_PAGE_DIV_ATTRIBUTE.names()), str), nargs=2, multiple=True) @click.option('--order', help="[DEPRECATED - use --set ATTR VALUE", metavar='ORDER') @click.option('--orderlabel', help="DEPRECATED - use --set ATTR VALUE", metavar='ORDERLABEL') @click.option('--contentids', help="DEPRECATED - use --set ATTR VALUE", metavar='ORDERLABEL') @@ -777,6 +836,7 @@ def update_page(ctx, attr_value_pairs, order, orderlabel, contentids, page_id): print(f"Error: {err}") sys.exit(1) + # ---------------------------------------------------------------------- # ocrd workspace merge # ---------------------------------------------------------------------- @@ -784,17 +844,21 @@ def update_page(ctx, attr_value_pairs, order, orderlabel, contentids, page_id): def _handle_json_option(ctx, param, value): return parse_json_string_or_file(value) if value else None + @workspace_cli.command('merge') @click.argument('METS_PATH') -@click.option('--overwrite/--no-overwrite', is_flag=True, default=False, help="Overwrite on-disk file in case of file name conflicts with data from METS_PATH") -@click.option('--force/--no-force', is_flag=True, default=False, help="Overwrite mets:file from --mets with mets:file from METS_PATH if IDs clash") +@click.option('--overwrite/--no-overwrite', is_flag=True, default=False, + help="Overwrite on-disk file in case of file name conflicts with data from METS_PATH") +@click.option('--force/--no-force', is_flag=True, default=False, + help="Overwrite mets:file from --mets with mets:file from METS_PATH if IDs clash") @click.option('--copy-files/--no-copy-files', is_flag=True, help="Copy files as well", default=True, show_default=True) @click.option('--fileGrp-mapping', help="JSON object mapping src to dest fileGrp", callback=_handle_json_option) @click.option('--fileId-mapping', help="JSON object mapping src to dest file ID", callback=_handle_json_option) @click.option('--pageId-mapping', help="JSON object mapping src to dest page ID", callback=_handle_json_option) @mets_find_options @pass_workspace -def merge(ctx, overwrite, force, copy_files, filegrp_mapping, fileid_mapping, pageid_mapping, file_grp, file_id, page_id, mimetype, include_fileGrp, exclude_fileGrp, mets_path): # pylint: disable=redefined-builtin +def merge(ctx, overwrite, force, copy_files, filegrp_mapping, fileid_mapping, pageid_mapping, + file_grp, file_id, page_id, 
     """
     Merges this workspace with the workspace that contains ``METS_PATH``
@@ -829,18 +893,20 @@ def merge(ctx, overwrite, force, copy_files, filegrp_mapping, fileid_mapping, pa
     )
     workspace.save_mets()

+
 # ----------------------------------------------------------------------
 # ocrd workspace backup
 # ----------------------------------------------------------------------

 @workspace_cli.group('backup')
 @pass_workspace
-def workspace_backup_cli(ctx): # pylint: disable=unused-argument
+def workspace_backup_cli(ctx):  # pylint: disable=unused-argument
     """
     Backing and restoring workspaces - dev edition
     """
     assert not ctx.mets_server_url, "Workspace backups currently not interoperable with METS Server"

+
 @workspace_backup_cli.command('add')
 @pass_workspace
 def workspace_backup_add(ctx):
@@ -850,6 +916,7 @@ def workspace_backup_add(ctx):
     backup_manager = ctx.backup_manager()
     backup_manager.add()

+
 @workspace_backup_cli.command('list')
 @pass_workspace
 def workspace_backup_list(ctx):
@@ -860,9 +927,10 @@ def workspace_backup_list(ctx):
     for b in backup_manager.list():
         print(b)

+
 @workspace_backup_cli.command('restore')
 @click.option('-f', '--choose-first', help="Restore first matching version if more than one", is_flag=True)
-@click.argument('bak') #, type=click.Path(dir_okay=False, readable=True, resolve_path=True))
+@click.argument('bak')  # type=click.Path(dir_okay=False, readable=True, resolve_path=True))
 @pass_workspace
 def workspace_backup_restore(ctx, choose_first, bak):
     """
@@ -871,6 +939,7 @@ def workspace_backup_restore(ctx, choose_first, bak):
     backup_manager = ctx.backup_manager()
     backup_manager.restore(bak, choose_first)

+
 @workspace_backup_cli.command('undo')
 @pass_workspace
 def workspace_backup_undo(ctx):
@@ -887,34 +956,38 @@ def workspace_backup_undo(ctx):

 @workspace_cli.group('server')
 @pass_workspace
-def workspace_serve_cli(ctx): # pylint: disable=unused-argument
+def workspace_serve_cli(ctx):  # pylint: disable=unused-argument
     """Control a METS server for this workspace"""
     assert ctx.mets_server_url, "For METS server commands, you must provide '-U/--mets-server-url'"

+
 @workspace_serve_cli.command('stop')
 @pass_workspace
-def workspace_serve_stop(ctx): # pylint: disable=unused-argument
+def workspace_serve_stop(ctx):  # pylint: disable=unused-argument
     """Stop the METS server (saving changes to disk)"""
     workspace = ctx.workspace()
     workspace.mets.stop()

+
 @workspace_serve_cli.command('reload')
 @pass_workspace
-def workspace_serve_reload(ctx): # pylint: disable=unused-argument
+def workspace_serve_reload(ctx):  # pylint: disable=unused-argument
     """Reload the METS server from disk"""
     workspace = ctx.workspace()
     workspace.mets.reload()

+
 @workspace_serve_cli.command('save')
 @pass_workspace
-def workspace_serve_save(ctx): # pylint: disable=unused-argument
+def workspace_serve_save(ctx):  # pylint: disable=unused-argument
     """Save the METS changes to disk"""
     workspace = ctx.workspace()
     workspace.mets.save()

+
 @workspace_serve_cli.command('start')
 @pass_workspace
-def workspace_serve_start(ctx): # pylint: disable=unused-argument
+def workspace_serve_start(ctx):  # pylint: disable=unused-argument
     """
     Start a METS server

diff --git a/src/ocrd/cli/zip.py b/src/ocrd/cli/zip.py
index 7db18b09c..89bb7ff84 100644
--- a/src/ocrd/cli/zip.py
+++ b/src/ocrd/cli/zip.py
@@ -16,6 +16,7 @@
 from ..workspace import Workspace
 from ..workspace_bagger import WorkspaceBagger

+
 @click.group("zip")
 def zip_cli():
     """
@@ -23,6 +24,7 @@ def zip_cli():
     """

+
 # ----------------------------------------------------------------------
 # ocrd zip bag
 # ----------------------------------------------------------------------
@@ -43,10 +45,12 @@ def zip_cli():
 @click.option('-i', '--identifier', '--id', help="Ocrd-Identifier", required=True)
 @click.option('-m', '--mets', help="location of mets.xml in the bag's data dir", default=DEFAULT_METS_BASENAME)
 @click.option('-b', '--base-version-checksum', help="Ocrd-Base-Version-Checksum")
-@click.option('-t', '--tag-file', help="Add a non-payload file to bag", type=click.Path(file_okay=True, dir_okay=False, readable=True, resolve_path=True), multiple=True)
+@click.option('-t', '--tag-file', help="Add a non-payload file to bag", multiple=True,
+              type=click.Path(file_okay=True, dir_okay=False, readable=True, resolve_path=True))
 @click.option('-Z', '--skip-zip', help="Create a directory but do not ZIP it", is_flag=True, default=False)
 @click.option('-j', '--processes', help="Number of parallel processes", type=int, default=1)
-def bag(directory, mets_basename, dest, include_fileGrp, exclude_fileGrp, identifier, mets, base_version_checksum, tag_file, skip_zip, processes):
+def bag(directory, mets_basename, dest, include_fileGrp, exclude_fileGrp, identifier, mets,
+        base_version_checksum, tag_file, skip_zip, processes):
     """
     Bag workspace as OCRD-ZIP at DEST
     """
@@ -66,6 +70,7 @@ def bag(directory, mets_basename, dest, include_fileGrp, exclude_fileGrp, identi
         exclude_fileGrp=exclude_fileGrp,
     )

+
 # ----------------------------------------------------------------------
 # ocrd zip spill
 # ----------------------------------------------------------------------
@@ -89,6 +94,7 @@ def spill(dest, src):
     workspace = workspace_bagger.spill(src, dest)
     print(workspace)

+
 # ----------------------------------------------------------------------
 # ocrd zip validate
 # ----------------------------------------------------------------------
@@ -97,8 +103,10 @@ def spill(dest, src):
 @click.argument('src', type=click.Path(dir_okay=True, readable=True, resolve_path=True), required=True)
 @click.option('-Z', '--skip-unzip', help="Treat SRC as a directory not a ZIP", is_flag=True, default=False)
 @click.option('-B', '--skip-bag', help="Whether to skip all checks of manifests and files", is_flag=True, default=False)
-@click.option('-C', '--skip-checksums', help="Whether to omit checksum checks but still check basic BagIt conformance", is_flag=True, default=False)
-@click.option('-D', '--skip-delete', help="Whether to skip deleting the unpacked OCRD-ZIP dir after valdiation", is_flag=True, default=False)
+@click.option('-C', '--skip-checksums', help="Whether to omit checksum checks but still check basic BagIt conformance",
+              is_flag=True, default=False)
+@click.option('-D', '--skip-delete', help="Whether to skip deleting the unpacked OCRD-ZIP dir after validation",
+              is_flag=True, default=False)
 @click.option('-j', '--processes', help="Number of parallel processes", type=int, default=1)
 def validate(src, **kwargs):
     """
@@ -113,6 +121,7 @@ def validate(src, **kwargs):
     if not report.is_valid:
         sys.exit(1)

+
 # ----------------------------------------------------------------------
 # ocrd zip update
 # ----------------------------------------------------------------------

diff --git a/src/ocrd/decorators/__init__.py b/src/ocrd/decorators/__init__.py
index cc344a599..553b6fa57 100644
--- a/src/ocrd/decorators/__init__.py
+++ b/src/ocrd/decorators/__init__.py
@@ -30,7 +30,7 @@ def ocrd_cli_wrap_processor(
     working_dir=None,
     dump_json=False,
     dump_module_dir=False,
-    help=False, # pylint: disable=redefined-builtin
+    help=False,  # pylint: disable=redefined-builtin
     profile=False,
     profile_file=None,
     version=False,
@@ -110,10 +110,10 @@ def resolve(name):
         if not kwargs.get('input_file_grp', None):
             raise ValueError('-I/--input-file-grp is required')
         if 'output_file_grp' not in kwargs:
-            raise ValueError('-O/--output-file-grp is required') # actually, it may be None
+            raise ValueError('-O/--output-file-grp is required')  # actually, it may be None
         resolver = Resolver()
         working_dir, mets, _, mets_server_url = \
-            resolver.resolve_mets_arguments(working_dir, mets, None, mets_server_url)
+            resolver.resolve_mets_arguments(working_dir, mets, None, mets_server_url)
         workspace = resolver.workspace_from_url(mets, working_dir, mets_server_url=mets_server_url)
         page_id = kwargs.get('page_id')
     if debug:
@@ -122,7 +122,10 @@ def resolve(name):
             config.OCRD_EXISTING_OUTPUT = 'ABORT'
         if overwrite:
             config.OCRD_EXISTING_OUTPUT = 'OVERWRITE'
-        report = WorkspaceValidator.check_file_grp(workspace, kwargs['input_file_grp'], '' if overwrite else kwargs['output_file_grp'], page_id)
+        report = WorkspaceValidator.check_file_grp(workspace,
+                                                   kwargs['input_file_grp'],
+                                                   '' if overwrite else kwargs['output_file_grp'],
+                                                   page_id)
         if not report.is_valid:
             raise Exception("Invalid input/output file grps:\n\t%s" % '\n\t'.join(report.errors))
     # Set up profiling behavior from environment variables/flags
@@ -138,6 +141,7 @@ def resolve(name):
             print("Profiling...")
             pr = cProfile.Profile()
             pr.enable()

+
             def goexit():
                 pr.disable()
                 print("Profiling completed")
@@ -146,6 +150,7 @@ def goexit():
                 s = io.StringIO()
                 pstats.Stats(pr, stream=s).sort_stats("cumulative").print_stats()
                 print(s.getvalue())

+
             atexit.register(goexit)
         if log_filename:
             log_ctx = redirect_stderr_and_stdout_to_file(log_filename)
@@ -162,7 +167,8 @@ def check_and_run_network_agent(ProcessorClass, subcommand: str, address: str, d
     SUBCOMMANDS = [AgentType.PROCESSING_WORKER, AgentType.PROCESSOR_SERVER]

     if not subcommand:
-        raise ValueError(f"Subcommand options --address --queue and --database are only valid for subcommands: {SUBCOMMANDS}")
+        raise ValueError("Subcommand options --address --queue and --database "
+                         f"are only valid for subcommands: {SUBCOMMANDS}")
     if subcommand not in SUBCOMMANDS:
         raise ValueError(f"SUBCOMMAND can only be one of {SUBCOMMANDS}")

diff --git a/src/ocrd/decorators/loglevel_option.py b/src/ocrd/decorators/loglevel_option.py
index 9edd01345..86c3a01b7 100644
--- a/src/ocrd/decorators/loglevel_option.py
+++ b/src/ocrd/decorators/loglevel_option.py
@@ -1,14 +1,17 @@
 import click
 from ocrd_utils.logging import setOverrideLogLevel

+
 __all__ = ['ocrd_loglevel']

+
 def _setOverrideLogLevel(ctx, param, value):    # pylint: disable=unused-argument
     if value is None:
         # Explicitly test for None because logging.DEBUG == 0
         return
     setOverrideLogLevel(value)
     return value

+
 loglevel_option = click.option('-l', '--log-level', help="Log level", type=click.Choice([
     'OFF', 'ERROR', 'WARN',
@@ -16,6 +19,7 @@ def _setOverrideLogLevel(ctx, param, value):  # pylint: disable=unused-argumen
     ]), default=None, callback=_setOverrideLogLevel)

+
 def ocrd_loglevel(f):
     """
     Add an option '--log-level' to set the log level.
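
[Review note] The callback in the loglevel_option hunk above tests `value is None` explicitly,
because a 0-valued log level would be falsy and silently swallowed by a plain truthiness check.
A minimal, self-contained sketch of the same eager click-callback pattern (hypothetical
standalone names, not part of this patch):

    import logging
    import click

    def _set_level(ctx, param, value):  # pylint: disable=unused-argument
        # click option callback: runs at parse time, before the command body
        if value is None:  # explicit None test, as in _setOverrideLogLevel
            return None
        logging.getLogger().setLevel(value)  # setLevel() also accepts level names
        return value

    @click.command()
    @click.option('-l', '--log-level', default=None, callback=_set_level,
                  type=click.Choice(['ERROR', 'WARNING', 'INFO', 'DEBUG']))
    def cli(log_level):
        logging.getLogger(__name__).info("effective log level: %s", log_level)

    if __name__ == '__main__':
        cli()
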
diff --git a/src/ocrd/decorators/mets_find_options.py b/src/ocrd/decorators/mets_find_options.py index f604605d3..c4faa2df0 100644 --- a/src/ocrd/decorators/mets_find_options.py +++ b/src/ocrd/decorators/mets_find_options.py @@ -1,7 +1,8 @@ from click import option + def mets_find_options(f): - for opt in [ + for opt in [ option('-G', '--file-grp', help="fileGrp USE", metavar='FILTER'), option('-m', '--mimetype', help="Media type to look for", metavar='FILTER'), option('-g', '--page-id', help="Page ID", metavar='FILTER'), diff --git a/src/ocrd/decorators/ocrd_cli_options.py b/src/ocrd/decorators/ocrd_cli_options.py index c99dcdb6d..e8c3d8685 100644 --- a/src/ocrd/decorators/ocrd_cli_options.py +++ b/src/ocrd/decorators/ocrd_cli_options.py @@ -1,5 +1,5 @@ import click -from click import option, Path, group, command, argument +from click import option, Path, argument from ocrd_utils import DEFAULT_METS_BASENAME from ocrd_network import AgentType from .parameter_option import parameter_option, parameter_override_option diff --git a/src/ocrd/decorators/parameter_option.py b/src/ocrd/decorators/parameter_option.py index 2f8be3d86..7258c7e6a 100644 --- a/src/ocrd/decorators/parameter_option.py +++ b/src/ocrd/decorators/parameter_option.py @@ -7,17 +7,18 @@ def _handle_param_option(ctx, param, value): from ocrd_utils import parse_json_string_or_file return parse_json_string_or_file(*list(value)) + parameter_option = option('-p', '--parameter', - help="Parameters, either JSON string or path to JSON file", - multiple=True, - default=[], - # now handled in ocrd_cli_wrap_processor to resolve processor preset files - # callback=_handle_param_option - callback=lambda ctx, param, kv: list(kv)) + help="Parameters, either JSON string or path to JSON file", + multiple=True, + default=[], + # now handled in ocrd_cli_wrap_processor to resolve processor preset files + # callback=_handle_param_option + callback=lambda ctx, param, kv: list(kv)) parameter_override_option = option('-P', '--parameter-override', - help="Parameter override", - nargs=2, - multiple=True, - callback=lambda ctx, param, kv: kv) - # callback=lambda ctx, param, kv: {kv[0]: kv[1]}) + help="Parameter override", + nargs=2, + multiple=True, + # callback=lambda ctx, param, kv: {kv[0]: kv[1]}) + callback=lambda ctx, param, kv: kv) diff --git a/src/ocrd/mets_server.py b/src/ocrd/mets_server.py index e4eaa7b77..ed3b699ff 100644 --- a/src/ocrd/mets_server.py +++ b/src/ocrd/mets_server.py @@ -3,12 +3,12 @@ """ import os import re -from os import _exit, chmod +from os import chmod import signal from typing import Dict, Optional, Union, List, Tuple from time import sleep from pathlib import Path -from subprocess import Popen, run as subprocess_run +from subprocess import Popen from urllib.parse import urlparse import socket import atexit @@ -424,7 +424,7 @@ def create_process(mets_server_url: str, ws_dir_path: str, log_file: str) -> int # Wait for the mets server to start sleep(2) if sub_process.poll(): - raise RuntimeError(f"Mets server starting failed. See {log_file} for errors") + raise RuntimeError(f"Starting METS Server failed. 
See {log_file} for errors") return sub_process.pid @staticmethod @@ -433,12 +433,12 @@ def kill_process(mets_server_pid: int): sleep(3) try: os.kill(mets_server_pid, signal.SIGKILL) - except ProcessLookupError as e: + except ProcessLookupError: pass def shutdown(self): pid = os.getpid() - self.log.info(f"Shutdown method of mets server[{pid}] invoked, sending SIGTERM signal.") + self.log.info(f"Shutdown method of METS Server[{pid}] invoked, sending SIGTERM signal.") os.kill(pid, signal.SIGTERM) if self.is_uds: if Path(self.url).exists(): @@ -446,7 +446,7 @@ def shutdown(self): Path(self.url).unlink() def startup(self): - self.log.info(f"Configuring the Mets Server") + self.log.info("Configuring the METS Server") workspace = self.workspace @@ -516,10 +516,6 @@ async def physical_pages(): self.log.debug(f"GET /physical_pages -> {response}") return response - @app.get(path='/physical_pages', response_model=OcrdPageListModel) - async def physical_pages(): - return {'physical_pages': workspace.mets.physical_pages} - @app.get(path='/file_groups', response_model=OcrdFileGroupListModel) async def file_groups(): response = {'file_groups': workspace.mets.file_groups} diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 24d7e16cd..8b46b1f49 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -16,7 +16,7 @@ import os from os import getcwd from pathlib import Path -from typing import Any, Dict, List, Optional, Tuple, Union, get_args +from typing import Dict, List, Optional, Tuple, Union, get_args import sys import logging import logging.handlers @@ -68,7 +68,7 @@ from ocrd_validators.ocrd_tool_validator import OcrdToolValidator # XXX imports must remain for backwards-compatibility -from .helpers import run_cli, run_processor # pylint: disable=unused-import +from .helpers import run_cli, run_processor # pylint: disable=unused-import class ResourceNotFoundError(FileNotFoundError): @@ -83,6 +83,7 @@ def __init__(self, name, executable): f"Try 'ocrd resmgr download {executable} {name}' to download this resource.") super().__init__(self.message) + class NonUniqueInputFile(ValueError): """ An exception signifying the specified fileGrp / pageId / mimetype @@ -97,6 +98,7 @@ def __init__(self, fileGrp, pageId, mimetype): f"and pageId {pageId} under mimetype {mimetype or 'PAGE+image(s)'}") super().__init__(self.message) + class MissingInputFile(ValueError): """ An exception signifying the specified fileGrp / pageId / mimetype @@ -111,6 +113,7 @@ def __init__(self, fileGrp, pageId, mimetype): f"and pageId {pageId} under mimetype {mimetype or 'PAGE+image(s)'}") super().__init__(self.message) + class DummyFuture: """ Mimics some of `concurrent.futures.Future` but runs immediately. 
@@ -119,8 +122,11 @@ def __init__(self, fn, *args, **kwargs): self.fn = fn self.args = args self.kwargs = kwargs + def result(self): return self.fn(*self.args, **self.kwargs) + + class DummyExecutor: """ Mimics some of `concurrent.futures.ProcessPoolExecutor` but runs @@ -128,15 +134,19 @@ class DummyExecutor: """ def __init__(self, initializer=None, initargs=(), **kwargs): initializer(*initargs) + def shutdown(self, **kwargs): # allow gc to catch processor instance (unless cached) _page_worker_set_ctxt(None, None) + def submit(self, fn, *args, **kwargs) -> DummyFuture: return DummyFuture(fn, *args, **kwargs) + TFuture = Union[DummyFuture, Future] TExecutor = Union[DummyExecutor, ProcessPoolExecutor] + class Processor(): """ A processor is a tool that implements the uniform OCR-D @@ -149,7 +159,7 @@ class Processor(): parameters. """ - max_instances : int = -1 + max_instances: int = -1 """ maximum number of cached instances (ignored if negative), to be applied on top of :py:data:`~ocrd_utils.config.OCRD_MAX_PROCESSOR_CACHE` (i.e. whatever is smaller). @@ -157,7 +167,7 @@ class Processor(): (Override this if you know how many instances fit into memory - GPU / CPU RAM - at once.) """ - max_workers : int = -1 + max_workers: int = -1 """ maximum number of processor forks for page-parallel processing (ignored if negative), to be applied on top of :py:data:`~ocrd_utils.config.OCRD_MAX_PARALLEL_PAGES` (i.e. @@ -167,7 +177,7 @@ class Processor(): - at once, or if your class already creates threads prior to forking, e.g. during ``setup``.) """ - max_page_seconds : int = -1 + max_page_seconds: int = -1 """ maximum number of seconds may be spent processing a single page (ignored if negative), to be applied on top of :py:data:`~ocrd_utils.config.OCRD_PROCESSING_PAGE_TIMEOUT` @@ -284,7 +294,7 @@ def parameter(self) -> Optional[dict]: return None @parameter.setter - def parameter(self, parameter : dict) -> None: + def parameter(self, parameter: dict) -> None: if self.parameter is not None: self.shutdown() parameterValidator = ParameterValidator(self.ocrd_tool) @@ -299,7 +309,7 @@ def parameter(self, parameter : dict) -> None: def __init__( self, # FIXME: remove in favor of process_workspace(workspace) - workspace : Optional[Workspace], + workspace: Optional[Workspace], ocrd_tool=None, parameter=None, input_file_grp=None, @@ -365,8 +375,10 @@ def __init__( if parameter is not None: self.parameter = parameter # workaround for deprecated#72 (@deprecated decorator does not work for subclasses): - setattr(self, 'process', - deprecated(version='3.0', reason='process() should be replaced with process_page_pcgts() or process_page_file() or process_workspace()')(getattr(self, 'process'))) + setattr(self, 'process', deprecated( + version='3.0', reason='process() should be replaced ' + 'with process_page_pcgts() or process_page_file() or process_workspace()')( + getattr(self, 'process'))) def __del__(self): self._base_logger.debug("shutting down %s in %s", repr(self), mp.current_process().name) @@ -394,7 +406,8 @@ def verify(self): assert self.output_file_grp is not None input_file_grps = self.input_file_grp.split(',') output_file_grps = self.output_file_grp.split(',') - def assert_file_grp_cardinality(grps : List[str], spec : Union[int, List[int]], msg): + + def assert_file_grp_cardinality(grps: List[str], spec: Union[int, List[int]], msg): if isinstance(spec, int): if spec > 0: assert len(grps) == spec, msg % (len(grps), str(spec)) @@ -418,10 +431,10 @@ def assert_file_grp_cardinality(grps : List[str], spec : 
Union[int, List[int]], assert input_file_grp in self.workspace.mets.file_groups, \ f"input fileGrp {input_file_grp} does not exist in workspace {self.workspace}" for output_file_grp in output_file_grps: - assert output_file_grp not in self.workspace.mets.file_groups \ - or config.OCRD_EXISTING_OUTPUT in ['OVERWRITE', 'SKIP'] \ - or not any(self.workspace.mets.find_files( - pageId=self.page_id, fileGrp=output_file_grp)), \ + assert (output_file_grp not in self.workspace.mets.file_groups + or config.OCRD_EXISTING_OUTPUT in ['OVERWRITE', 'SKIP'] + or not any(self.workspace.mets.find_files( + pageId=self.page_id, fileGrp=output_file_grp))), \ f"output fileGrp {output_file_grp} already exists in workspace {self.workspace}" # keep this for backwards compatibility: return True @@ -465,7 +478,8 @@ def shutdown(self) -> None: """ pass - @deprecated(version='3.0', reason='process() should be replaced with process_page_pcgts() or process_page_file() or process_workspace()') + @deprecated(version='3.0', reason='process() should be replaced ' + 'with process_page_pcgts() or process_page_file() or process_workspace()') def process(self) -> None: """ Process all files of the :py:data:`workspace` @@ -528,7 +542,8 @@ def process_workspace(self, workspace: Workspace) -> None: ) if max_workers > 1: # forward messages from log queue (in subprocesses) to all root handlers - log_listener = logging.handlers.QueueListener(log_queue, *logging.root.handlers, respect_handler_level=True) + log_listener = logging.handlers.QueueListener(log_queue, *logging.root.handlers, + respect_handler_level=True) log_listener.start() tasks = None try: @@ -553,7 +568,8 @@ def process_workspace(self, workspace: Workspace) -> None: # suppress the NotImplementedError context raise err from None - def process_workspace_submit_tasks(self, executor : TExecutor, max_seconds : int) -> Dict[TFuture, Tuple[str, List[Optional[OcrdFileType]]]]: + def process_workspace_submit_tasks(self, executor: TExecutor, max_seconds: int) -> Dict[ + TFuture, Tuple[str, List[Optional[OcrdFileType]]]]: """ Look up all input files of the given ``workspace`` from the given :py:data:`input_file_grp` @@ -571,7 +587,7 @@ def process_workspace_submit_tasks(self, executor : TExecutor, max_seconds : int Otherwise, tasks are run sequentially in the current process. - Delegates to :py:meth:`.zip_input_files` to get + Delegates to :py:meth:`.zip_input_files` to get the input files for each page, and then calls :py:meth:`.process_workspace_submit_page_task`. @@ -586,7 +602,9 @@ def process_workspace_submit_tasks(self, executor : TExecutor, max_seconds : int self._base_logger.debug("submitted %d processing tasks", len(tasks)) return tasks - def process_workspace_submit_page_task(self, executor : TExecutor, max_seconds : int, input_file_tuple : List[Optional[OcrdFileType]]) -> Tuple[TFuture, str, List[Optional[OcrdFileType]]]: + def process_workspace_submit_page_task(self, executor: TExecutor, max_seconds: int, + input_file_tuple: List[Optional[OcrdFileType]]) -> Tuple[ + TFuture, str, List[Optional[OcrdFileType]]]: """ Ensure all input files for a single page are downloaded to the workspace, then schedule @@ -604,7 +622,7 @@ def process_workspace_submit_page_task(self, executor : TExecutor, max_seconds : - the corresponding pageId, - the corresponding input files. 
""" - input_files : List[Optional[OcrdFileType]] = [None] * len(input_file_tuple) + input_files: List[Optional[OcrdFileType]] = [None] * len(input_file_tuple) page_id = next(input_file.pageId for input_file in input_file_tuple if input_file) @@ -625,7 +643,8 @@ def process_workspace_submit_page_task(self, executor : TExecutor, max_seconds : #executor.submit(self.process_page_file, *input_files) return executor.submit(_page_worker, max_seconds, *input_files), page_id, input_files - def process_workspace_handle_tasks(self, tasks : Dict[TFuture, Tuple[str, List[Optional[OcrdFileType]]]]) -> Tuple[int, int, Dict[str, int], int]: + def process_workspace_handle_tasks(self, tasks: Dict[TFuture, Tuple[str, List[Optional[OcrdFileType]]]]) -> Tuple[ + int, int, Dict[str, int], int]: """ Look up scheduled per-page futures one by one, handle errors (exceptions) and gather results. @@ -650,7 +669,7 @@ def process_workspace_handle_tasks(self, tasks : Dict[TFuture, Tuple[str, List[O # aggregate info for logging: nr_succeeded = 0 nr_failed = 0 - nr_errors = defaultdict(int) # count causes + nr_errors = defaultdict(int) # count causes if config.OCRD_MISSING_OUTPUT == 'SKIP': reason = "skipped" elif config.OCRD_MISSING_OUTPUT == 'COPY': @@ -666,7 +685,8 @@ def process_workspace_handle_tasks(self, tasks : Dict[TFuture, Tuple[str, List[O if config.OCRD_MAX_MISSING_OUTPUTS > 0 and nr_failed / len(tasks) > config.OCRD_MAX_MISSING_OUTPUTS: # already irredeemably many failures, stop short nr_errors = dict(nr_errors) - raise Exception(f"too many failures with {reason} output ({nr_failed} of {nr_failed+nr_succeeded}, {str(nr_errors)})") + raise Exception(f"too many failures with {reason} output ({nr_failed} of {nr_failed+nr_succeeded}, " + f"{str(nr_errors)})") elif result: nr_succeeded += 1 # else skipped - already exists @@ -676,13 +696,15 @@ def process_workspace_handle_tasks(self, tasks : Dict[TFuture, Tuple[str, List[O if config.OCRD_MAX_MISSING_OUTPUTS > 0 and nr_failed / nr_all > config.OCRD_MAX_MISSING_OUTPUTS: raise Exception(f"too many failures with {reason} output ({nr_failed} of {nr_all}, {str(nr_errors)})") self._base_logger.warning("%s %d of %d pages due to %s", reason, nr_failed, nr_all, str(nr_errors)) - self._base_logger.debug("succeeded %d, missed %d of %d pages due to %s", nr_succeeded, nr_failed, nr_all, str(nr_errors)) + self._base_logger.debug("succeeded %d, missed %d of %d pages due to %s", + nr_succeeded, nr_failed, nr_all, str(nr_errors)) return nr_succeeded, nr_failed, nr_errors, len(tasks) - def process_workspace_handle_page_task(self, page_id : str, input_files : List[Optional[OcrdFileType]], task : TFuture) -> Union[bool, Exception]: + def process_workspace_handle_page_task(self, page_id: str, input_files: List[Optional[OcrdFileType]], + task: TFuture) -> Union[bool, Exception]: """ \b - Await a single page result and handle errors (exceptions), + Await a single page result and handle errors (exceptions), enforcing policies configured by the following environment variables: - `OCRD_EXISTING_OUTPUT` (abort/skip/overwrite) @@ -738,14 +760,14 @@ def process_workspace_handle_page_task(self, page_id : str, input_files : List[O raise ValueError(f"unknown configuration value {config.OCRD_MISSING_OUTPUT} - {desc}") return err - def _copy_page_file(self, input_file : OcrdFileType) -> None: + def _copy_page_file(self, input_file: OcrdFileType) -> None: """ Copy the given ``input_file`` of the :py:data:`workspace`, representing one physical page (passed as one opened 
:py:class:`~ocrd_models.OcrdFile` per input fileGrp) and add it as if it was a processing result. """ - input_pcgts : OcrdPage + input_pcgts: OcrdPage assert isinstance(input_file, get_args(OcrdFileType)) self._base_logger.debug(f"parsing file {input_file.ID} for page {input_file.pageId}") try: @@ -766,7 +788,7 @@ def _copy_page_file(self, input_file : OcrdFileType) -> None: content=to_xml(input_pcgts), ) - def process_page_file(self, *input_files : Optional[OcrdFileType]) -> None: + def process_page_file(self, *input_files: Optional[OcrdFileType]) -> None: """ Process the given ``input_files`` of the :py:data:`workspace`, representing one physical page (passed as one opened @@ -777,7 +799,7 @@ def process_page_file(self, *input_files : Optional[OcrdFileType]) -> None: (This uses :py:meth:`.process_page_pcgts`, but should be overridden by subclasses to handle cases like multiple output fileGrps, non-PAGE input etc.) """ - input_pcgts : List[Optional[OcrdPage]] = [None] * len(input_files) + input_pcgts: List[Optional[OcrdPage]] = [None] * len(input_files) input_pos = next(i for i, input_file in enumerate(input_files) if input_file is not None) page_id = input_files[input_pos].pageId self._base_logger.info("processing page %s", page_id) @@ -827,7 +849,7 @@ def process_page_file(self, *input_files : Optional[OcrdFileType]) -> None: elif isinstance(image_result.alternative_image, AlternativeImageType): image_result.alternative_image.set_filename(image_file_path) elif image_result.alternative_image is None: - pass # do not reference in PAGE result + pass # do not reference in PAGE result else: raise ValueError(f"process_page_pcgts returned an OcrdPageResultImage of unknown type " f"{type(image_result.alternative_image)}") @@ -849,7 +871,7 @@ def process_page_file(self, *input_files : Optional[OcrdFileType]) -> None: content=to_xml(result.pcgts), ) - def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult: + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """ Process the given ``input_pcgts`` of the :py:data:`.workspace`, representing one physical page (passed as one parsed @@ -876,24 +898,25 @@ def add_metadata(self, pcgts: OcrdPage) -> None: """ metadata_obj = pcgts.get_Metadata() assert metadata_obj is not None - metadata_obj.add_MetadataItem( - MetadataItemType(type_="processingStep", - name=self.ocrd_tool['steps'][0], - value=self.ocrd_tool['executable'], - Labels=[LabelsType( - externalModel="ocrd-tool", - externalId="parameters", - Label=[LabelType(type_=name, - value=self.parameter[name]) - for name in self.parameter.keys()]), - LabelsType( + metadata_item = MetadataItemType( + type_="processingStep", + name=self.ocrd_tool['steps'][0], + value=self.ocrd_tool['executable'], + Labels=[LabelsType( + externalModel="ocrd-tool", + externalId="parameters", + Label=[LabelType(type_=name, + value=self.parameter[name]) + for name in self.parameter.keys()]), + LabelsType( externalModel="ocrd-tool", externalId="version", Label=[LabelType(type_=self.ocrd_tool['executable'], value=self.version), LabelType(type_='ocrd/core', value=OCRD_VERSION)]) - ])) + ]) + metadata_obj.add_MetadataItem(metadata_item) def resolve_resource(self, val): """ @@ -948,8 +971,8 @@ def list_all_resources(self): mimetypes = get_processor_resource_types(None, self.ocrd_tool) for res in list_all_resources(self.ocrd_tool['executable'], moduled=self.moduledir): res = Path(res) - if not '*/*' in mimetypes: - 
if res.is_dir() and not 'text/directory' in mimetypes: + if '*/*' not in mimetypes: + if res.is_dir() and 'text/directory' not in mimetypes: continue # if we do not know all MIME types, then keep the file, otherwise require suffix match if res.is_file() and not any(res.suffix == MIME_TO_EXT.get(mime, res.suffix) @@ -1070,16 +1093,18 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): continue ift = pages.setdefault(file_.pageId, [None]*len(ifgs)) if ift[i]: - self._base_logger.debug(f"another file {file_.ID} for page {file_.pageId} in input file group {ifg}") + self._base_logger.debug( + f"another file {file_.ID} for page {file_.pageId} in input file group {ifg}") # fileGrp has multiple files for this page ID if mimetype: # filter was active, this must not happen - self._base_logger.warning(f"added file {file_.ID} for page {file_.pageId} in input file group {ifg} " - f"conflicts with file {ift[i].ID} of same MIME type {mimetype} - on_error={on_error}") + self._base_logger.warning( + f"added file {file_.ID} for page {file_.pageId} in input file group {ifg} " + f"conflicts with file {ift[i].ID} of same MIME type {mimetype} - on_error={on_error}") if on_error == 'skip': ift[i] = None elif on_error == 'first': - pass # keep first match + pass # keep first match elif on_error == 'last': ift[i] = file_ elif on_error == 'abort': @@ -1088,18 +1113,19 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): raise Exception("Unknown 'on_error' strategy '%s'" % on_error) elif (ift[i].mimetype == MIMETYPE_PAGE and file_.mimetype != MIMETYPE_PAGE): - pass # keep PAGE match + pass # keep PAGE match elif (ift[i].mimetype == MIMETYPE_PAGE and file_.mimetype == MIMETYPE_PAGE): raise NonUniqueInputFile(ifg, file_.pageId, None) else: # filter was inactive but no PAGE is in control, this must not happen - self._base_logger.warning(f"added file {file_.ID} for page {file_.pageId} in input file group {ifg} " - f"conflicts with file {ift[i].ID} but no PAGE available - on_error={on_error}") + self._base_logger.warning( + f"added file {file_.ID} for page {file_.pageId} in input file group {ifg} " + f"conflicts with file {ift[i].ID} but no PAGE available - on_error={on_error}") if on_error == 'skip': ift[i] = None elif on_error == 'first': - pass # keep first match + pass # keep first match elif on_error == 'last': ift[i] = file_ elif on_error == 'abort': @@ -1133,6 +1159,7 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): ifts.append(tuple(ifiles)) return ifts + _page_worker_processor = None """ This global binding for the processor is required to avoid @@ -1143,6 +1170,8 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): objects, and with the METS Server we do not mutate the local processor instance anyway. 
""" + + def _page_worker_set_ctxt(processor, log_queue): """ Overwrites `ocrd.processor.base._page_worker_processor` instance @@ -1154,6 +1183,7 @@ def _page_worker_set_ctxt(processor, log_queue): # replace all log handlers with just one queue handler logging.root.handlers = [logging.handlers.QueueHandler(log_queue)] + def _page_worker(timeout, *input_files): """ Wraps a `Processor.process_page_file` call as payload (call target) @@ -1171,6 +1201,7 @@ def _page_worker(timeout, *input_files): _page_worker_processor.logger.debug("page worker timed out for page %s", page_id) raise + def generate_processor_help(ocrd_tool, processor_instance=None, subcommand=None): """Generate a string describing the full CLI of this processor including params. diff --git a/src/ocrd/processor/builtin/dummy_processor.py b/src/ocrd/processor/builtin/dummy_processor.py index bf7e2940b..a2fec5fd2 100644 --- a/src/ocrd/processor/builtin/dummy_processor.py +++ b/src/ocrd/processor/builtin/dummy_processor.py @@ -16,6 +16,7 @@ ) from ocrd_modelfactory import page_from_file + class DummyProcessor(Processor): """ Bare-bones processor creates PAGE-XML and optionally copies file from input group to output group @@ -57,13 +58,14 @@ def process_page_file(self, *input_files: Optional[OcrdFileType]) -> None: page_id=input_file.pageId, local_filename=join(self.output_file_grp, file_id + '.xml'), mimetype=MIMETYPE_PAGE, - content=to_xml(pcgts), - ) + content=to_xml(pcgts)) else: if self.parameter['copy_files']: - self.logger.info("Not copying %s because it is a PAGE-XML file, which gets identity-transformed", input_file.local_filename) + self.logger.info("Not copying %s because it is a PAGE-XML file, which gets identity-transformed", + input_file.local_filename) else: - self.logger.info("Not copying %s because it is not a PAGE-XML file and copy_files was false", input_file.local_filename) + self.logger.info("Not copying %s because it is not a PAGE-XML file and copy_files was false", + input_file.local_filename) # we can rely on base implementation verbatim super().process_page_file(input_file) @@ -75,6 +77,7 @@ def metadata_filename(self): def executable(self): return 'ocrd-dummy' + @click.command() @ocrd_cli_options def cli(*args, **kwargs): diff --git a/src/ocrd/processor/builtin/filter_processor.py b/src/ocrd/processor/builtin/filter_processor.py index c81517b0e..ae79702aa 100644 --- a/src/ocrd/processor/builtin/filter_processor.py +++ b/src/ocrd/processor/builtin/filter_processor.py @@ -1,7 +1,6 @@ # pylint: disable=missing-module-docstring,invalid-name from typing import Optional -from lxml import etree import click from ocrd import Processor, OcrdPageResult, OcrdPageResultImage @@ -29,6 +28,7 @@ "Glyph" ] + class FilterProcessor(Processor): def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """ @@ -57,7 +57,7 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional # but allow only hierarchy segments segments = [segment for segment in map(pcgts.revmap.get, nodes) if segment.__class__.__name__.replace('Type', '') in _SEGTYPES] - if not(len(segments)): + if not len(segments): self.logger.info("no matches") return result rodict = pcgts.get_Page().get_ReadingOrderGroups() @@ -102,6 +102,7 @@ def metadata_filename(self): def executable(self): return 'ocrd-filter' + @click.command() @ocrd_cli_options def cli(*args, **kwargs): diff --git a/src/ocrd/processor/helpers.py b/src/ocrd/processor/helpers.py index eb191e72c..188e627e4 100644 --- 
a/src/ocrd/processor/helpers.py
+++ b/src/ocrd/processor/helpers.py
@@ -5,7 +5,6 @@
 from os import times
 from functools import lru_cache
 import json
-import inspect
 from subprocess import run
 from typing import List, Optional
 
@@ -28,6 +27,7 @@ def _get_workspace(workspace=None, resolver=None, mets_url=None, working_dir=Non
         workspace = resolver.workspace_from_url(mets_url, dst_dir=working_dir, mets_server_url=mets_server_url)
     return workspace
 
+
 def run_processor(
         processorClass,
         mets_url=None,
@@ -41,7 +41,7 @@
         working_dir=None,
         mets_server_url=None,
         instance_caching=False
-): # pylint: disable=too-many-locals
+):  # pylint: disable=too-many-locals
     """
     Instantiate a Pythonic processor, open a workspace, run the processor and save the workspace.
@@ -104,7 +104,7 @@ def run_processor(
     t0_os = times()
     if any(x in config.OCRD_PROFILE for x in ['RSS', 'PSS']):
         backend = 'psutil_pss' if 'PSS' in config.OCRD_PROFILE else 'psutil'
-        from memory_profiler import memory_usage # pylint: disable=import-outside-toplevel
+        from memory_profiler import memory_usage  # pylint: disable=import-outside-toplevel
         try:
             mem_usage = memory_usage(proc=(processor.process_workspace, [workspace], {}),
                 # only run process once
@@ -225,7 +225,6 @@ def run_cli(
     return result.returncode
 
-
 # not decorated here but at runtime (on first use)
 #@freeze_args
 #@lru_cache(maxsize=config.OCRD_MAX_PROCESSOR_CACHE)
@@ -245,6 +244,7 @@ def get_cached_processor(parameter: dict, processor_class):
         return processor
     return None
 
+
 def get_processor(
         processor_class,
         parameter: Optional[dict] = None,
diff --git a/src/ocrd/processor/ocrd_page_result.py b/src/ocrd/processor/ocrd_page_result.py
index 5f21a72f5..fd1f82f5b 100644
--- a/src/ocrd/processor/ocrd_page_result.py
+++ b/src/ocrd/processor/ocrd_page_result.py
@@ -5,13 +5,15 @@
 
 from ocrd_models.ocrd_page_generateds import AlternativeImageType, PageType
 
+
 @dataclass
 class OcrdPageResultImage():
-    pil : Image
-    file_id_suffix : str
-    alternative_image : Optional[Union[AlternativeImageType, PageType]]
+    pil: Image
+    file_id_suffix: str
+    alternative_image: Optional[Union[AlternativeImageType, PageType]]
+
 
 @dataclass
 class OcrdPageResult():
-    pcgts : OcrdPage
-    images : List[OcrdPageResultImage] = field(default_factory=list)
+    pcgts: OcrdPage
+    images: List[OcrdPageResultImage] = field(default_factory=list)
diff --git a/src/ocrd/resolver.py b/src/ocrd/resolver.py
index 4ca0c266e..9276c318f 100644
--- a/src/ocrd/resolver.py
+++ b/src/ocrd/resolver.py
@@ -20,6 +20,7 @@
 from ocrd_models import OcrdMets
 from ocrd_models.utils import handle_oai_response
 
+
 class Resolver():
     """
     Handle uploads, downloads, repository access, and manage temporary directories
@@ -31,11 +32,13 @@ def download_to_directory(self, directory, url, basename=None, if_exists='skip',
         If ``url`` looks like a file path, check whether that exists.
         If it does exist and is within ``directory` already, return early.
-        If it does exist but is outside of ``directory``. copy it.
-        If ``url` does not appear to be a file path, try downloading via HTTP, retrying ``retries`` times with timeout ``timeout`` between calls.
+        If it does exist but is outside of ``directory``, copy it.
+        If ``url`` does not appear to be a file path, try downloading via HTTP,
+        retrying ``retries`` times with timeout ``timeout`` between calls.
         If ``basename`` is not given but ``subdir`` is, set ``basename`` to the last path segment of ``url``.
+ \b If the target file already exists within ``directory``, behavior depends on ``if_exists``: - ``skip`` (default): do nothing and return early. Note that this - ``overwrite``: overwrite the existing file @@ -56,11 +59,12 @@ def download_to_directory(self, directory, url, basename=None, if_exists='skip', Returns: Local filename string, *relative* to directory """ - log = getLogger('ocrd.resolver.download_to_directory') # pylint: disable=redefined-outer-name - log.debug("directory=|%s| url=|%s| basename=|%s| if_exists=|%s| subdir=|%s|", directory, url, basename, if_exists, subdir) + log = getLogger('ocrd.resolver.download_to_directory') # pylint: disable=redefined-outer-name + log.debug("directory=|%s| url=|%s| basename=|%s| if_exists=|%s| subdir=|%s|", + directory, url, basename, if_exists, subdir) if not url: - raise ValueError(f"'url' must be a non-empty string, not '{url}'") # actually Path also ok + raise ValueError(f"'url' must be a non-empty string, not '{url}'") # actually Path also ok if not directory: raise ValueError(f"'directory' must be a non-empty string, not '{url}'") # actually Path would also work @@ -123,25 +127,25 @@ def download_to_directory(self, directory, url, basename=None, if_exists='skip', retries = Retry(total=retries or 0, status_forcelist=[ # probably too wide (only transient failures): - 408, # Request Timeout - 409, # Conflict - 412, # Precondition Failed - 417, # Expectation Failed - 423, # Locked - 424, # Fail - 425, # Too Early - 426, # Upgrade Required - 428, # Precondition Required - 429, # Too Many Requests - 440, # Login Timeout - 500, # Internal Server Error - 503, # Service Unavailable - 504, # Gateway Timeout - 509, # Bandwidth Limit Exceeded - 529, # Site Overloaded - 598, # Proxy Read Timeout - 599, # Proxy Connect Timeout - ]) + 408, # Request Timeout + 409, # Conflict + 412, # Precondition Failed + 417, # Expectation Failed + 423, # Locked + 424, # Fail + 425, # Too Early + 426, # Upgrade Required + 428, # Precondition Required + 429, # Too Many Requests + 440, # Login Timeout + 500, # Internal Server Error + 503, # Service Unavailable + 504, # Gateway Timeout + 509, # Bandwidth Limit Exceeded + 529, # Site Overloaded + 598, # Proxy Read Timeout + 599, # Proxy Connect Timeout + ]) adapter = HTTPAdapter(max_retries=retries) session.mount('http://', adapter) session.mount('https://', adapter) @@ -181,7 +185,7 @@ def workspace_from_url( the filesystem directly. **kwargs (): Passed on to ``OcrdMets.find_files`` if download == True - Download (clone) :py:attr:`mets_url` to ``mets.xml`` in :py:attr:`dst_dir`, unless + Download (clone) :py:attr:`mets_url` to ``mets.xml`` in :py:attr:`dst_dir`, unless the former is already local and the latter is ``none`` or already identical to its directory name. 
Returns: @@ -218,11 +222,13 @@ def workspace_from_url( Path(dst_dir).mkdir(parents=True, exist_ok=False) dst_dir = str(Path(dst_dir).resolve()) - log.debug("workspace_from_url\nmets_basename='%s'\nmets_url='%s'\nsrc_baseurl='%s'\ndst_dir='%s'", - mets_basename, mets_url, src_baseurl, dst_dir) - self.download_to_directory(dst_dir, mets_url, basename=mets_basename, if_exists='overwrite' if clobber_mets else 'raise') + log.debug("mets_basename='%s' mets_url='%s' src_baseurl='%s' dst_dir='%s'", + mets_basename, mets_url, src_baseurl, dst_dir) + self.download_to_directory(dst_dir, mets_url, basename=mets_basename, + if_exists='overwrite' if clobber_mets else 'raise') - workspace = Workspace(self, dst_dir, mets_basename=mets_basename, baseurl=src_baseurl, mets_server_url=mets_server_url) + workspace = Workspace(self, dst_dir, + mets_basename=mets_basename, baseurl=src_baseurl, mets_server_url=mets_server_url) if download: for f in workspace.mets.find_files(**kwargs): @@ -273,7 +279,8 @@ def resolve_mets_arguments(self, directory, mets_url, mets_basename=DEFAULT_METS # if directory and mets_url and not mets_is_remote: # raise ValueError("Use either --mets or --directory, not both") - # If --mets is a URL, a directory must be explicitly provided (not strictly necessary, but retained for legacy behavior) + # If --mets is a URL, a directory must be explicitly provided + # (not strictly necessary, but retained for legacy behavior) if not directory and mets_is_remote: raise ValueError("--mets is an http(s) URL but no --directory was given") @@ -297,7 +304,7 @@ def resolve_mets_arguments(self, directory, mets_url, mets_basename=DEFAULT_METS elif not directory and mets_url: mets_url = Path(mets_url).resolve() directory = mets_url.parent - else: # == directory and mets_url: + else: # == directory and mets_url: directory = Path(directory).resolve() if not mets_is_remote: # --mets is just a basename and --directory is set, so treat --mets as --mets-basename @@ -306,10 +313,13 @@ def resolve_mets_arguments(self, directory, mets_url, mets_basename=DEFAULT_METS else: mets_url = Path(mets_url).resolve() if not is_file_in_directory(directory, mets_url): - raise ValueError("--mets '%s' has a directory part inconsistent with --directory '%s'" % (mets_url, directory)) + raise ValueError("--mets '%s' has a directory part inconsistent with --directory '%s'" % ( + mets_url, directory)) if mets_server_url and not mets_server_url.startswith('http://'): # UDS socket mets_server_url = str(Path(mets_server_url).resolve()) + log.debug("directory='%s' mets_url='%s', mets_basename='%s', mets_server_url='%s'" % ( + directory, str(mets_url), str(mets_basename), mets_server_url)) return str(Path(directory).resolve()), str(mets_url), str(mets_basename), mets_server_url diff --git a/src/ocrd/task_sequence.py b/src/ocrd/task_sequence.py index 85e30b177..340534ea4 100644 --- a/src/ocrd/task_sequence.py +++ b/src/ocrd/task_sequence.py @@ -9,6 +9,7 @@ from ocrd_validators import ParameterValidator, WorkspaceValidator from ocrd_models import ValidationReport + class ProcessorTask(): @classmethod @@ -85,6 +86,7 @@ def __str__(self): ret += " -p '%s'" % json.dumps(self.parameters) return ret + def validate_tasks(tasks, workspace, page_id=None, overwrite=False): report = ValidationReport() prev_output_file_grps = workspace.mets.file_groups @@ -93,14 +95,18 @@ def validate_tasks(tasks, workspace, page_id=None, overwrite=False): first_task.validate() # first task: check input/output file groups from METS - 
WorkspaceValidator.check_file_grp(workspace, first_task.input_file_grps, '' if overwrite else first_task.output_file_grps, page_id, report) + WorkspaceValidator.check_file_grp(workspace, + first_task.input_file_grps, + '' if overwrite else first_task.output_file_grps, + page_id, + report) prev_output_file_grps += first_task.output_file_grps for task in tasks[1:]: task.validate() # check either existing fileGrp or output-file group of previous task matches current input_file_group for input_file_grp in task.input_file_grps: - if not input_file_grp in prev_output_file_grps: + if input_file_grp not in prev_output_file_grps: report.add_error("Input file group not contained in METS or produced by previous steps: %s" % input_file_grp) if not overwrite: WorkspaceValidator.check_file_grp(workspace, [], task.output_file_grps, page_id, report) @@ -157,5 +163,6 @@ def run_tasks(mets, log_level, page_id, task_strs, overwrite=False, mets_server_ # check output file groups are in mets for output_file_grp in task.output_file_grps: - if not output_file_grp in workspace.mets.file_groups: - raise Exception("Invalid state: expected output file group '%s' not in METS (despite processor success)" % output_file_grp) + if output_file_grp not in workspace.mets.file_groups: + raise Exception("Invalid state: expected output file group '%s' not in METS " + "(despite processor success)" % output_file_grp) diff --git a/src/ocrd/workspace.py b/src/ocrd/workspace.py index c00c795b9..266e04c12 100644 --- a/src/ocrd/workspace.py +++ b/src/ocrd/workspace.py @@ -28,16 +28,13 @@ scale_coordinates, shift_coordinates, rotate_coordinates, - transform_coordinates, transpose_coordinates, crop_image, rotate_image, transpose_image, bbox_from_polygon, - polygon_from_points, xywh_from_bbox, pushd_popd, - is_local_filename, deprecated_alias, DEFAULT_METS_BASENAME, MIME_TO_EXT, @@ -51,6 +48,7 @@ __all__ = ['Workspace'] + @contextmanager def download_temporary_file(url): with NamedTemporaryFile(prefix='ocrd-download-') as f: @@ -82,7 +80,7 @@ def __init__( self, resolver, directory, - mets : Optional[Union[OcrdMets, ClientSideOcrdMets]] = None, + mets: Optional[Union[OcrdMets, ClientSideOcrdMets]] = None, mets_basename=DEFAULT_METS_BASENAME, automatic_backup=False, baseurl=None, @@ -96,8 +94,9 @@ def __init__( if self.is_remote: mets = ClientSideOcrdMets(mets_server_url, self.directory) if mets.workspace_path != self.directory: - raise ValueError(f"METS server {mets_server_url} workspace directory '{mets.workspace_path}' differs " - f"from local workspace directory '{self.directory}'. These are not the same workspaces.") + raise ValueError( + f"METS server {mets_server_url} workspace directory '{mets.workspace_path}' differs " + f"from local workspace directory '{self.directory}'. 
These are not the same workspaces.") else: mets = OcrdMets(filename=self.mets_target) self.mets = mets @@ -148,7 +147,7 @@ def after_add_cb(f): if not copy_files: fpath_src = Path(other_workspace.directory).resolve() fpath_dst = Path(self.directory).resolve() - dstprefix = fpath_src.relative_to(fpath_dst) # raises ValueError if not a subpath + dstprefix = fpath_src.relative_to(fpath_dst) # raises ValueError if not a subpath f.local_filename = dstprefix / f.local_filename return fpath_src = Path(other_workspace.directory, f.local_filename) @@ -171,7 +170,6 @@ def after_add_cb(f): self.mets.merge(other_workspace.mets, after_add_cb=after_add_cb, **kwargs) - @deprecated(version='1.0.0', reason="Use workspace.download_file") def download_url(self, url, **kwargs): """ @@ -199,19 +197,23 @@ def download_file(self, f, _recursion_count=0): file_path = Path(f.local_filename).absolute() if file_path.exists(): try: - file_path.relative_to(Path(self.directory).resolve()) # raises ValueError if not relative + file_path.relative_to(Path(self.directory).resolve()) # raises ValueError if not relative # If the f.local_filename exists and is within self.directory, nothing to do log.debug(f"'local_filename' {f.local_filename} already within {self.directory} - nothing to do") except ValueError: # f.local_filename exists, but not within self.directory, copy it - log.debug("Copying 'local_filename' %s to workspace directory %s" % (f.local_filename, self.directory)) - f.local_filename = self.resolver.download_to_directory(self.directory, f.local_filename, subdir=f.fileGrp) + log.debug("Copying 'local_filename' %s to workspace directory %s" % ( + f.local_filename, self.directory)) + f.local_filename = self.resolver.download_to_directory(self.directory, f.local_filename, + subdir=f.fileGrp) return f if f.url: - log.debug("OcrdFile has 'local_filename' but it doesn't resolve - trying to download from 'url' %s", f.url) + log.debug("OcrdFile has 'local_filename' but it doesn't resolve - " + "trying to download from 'url' %s", f.url) url = f.url elif self.baseurl: - log.debug("OcrdFile has 'local_filename' but it doesn't resolve, and no 'url' - trying 'baseurl' %s with 'local_filename' %s", + log.debug("OcrdFile has 'local_filename' but it doesn't resolve, and no 'url' - " + "trying 'baseurl' %s with 'local_filename' %s", self.baseurl, f.local_filename) url = '%s/%s' % (self.baseurl, f.local_filename) else: @@ -223,7 +225,8 @@ def download_file(self, f, _recursion_count=0): if f.url: # If f.url is set, download the file to the workspace basename = '%s%s' % (f.ID, MIME_TO_EXT.get(f.mimetype, '')) if f.ID else f.basename - f.local_filename = self.resolver.download_to_directory(self.directory, f.url, subdir=f.fileGrp, basename=basename) + f.local_filename = self.resolver.download_to_directory(self.directory, f.url, + subdir=f.fileGrp, basename=basename) return f # If neither f.local_filename nor f.url is set, fail raise ValueError(f"OcrdFile {f} has neither 'url' nor 'local_filename', so cannot be downloaded") @@ -281,7 +284,8 @@ def remove_file(self, file_id, force=False, keep_file=False, page_recursive=Fals if not force: raise e - def remove_file_group(self, USE, recursive=False, force=False, keep_files=False, page_recursive=False, page_same_group=False): + def remove_file_group(self, USE, recursive=False, force=False, keep_files=False, + page_recursive=False, page_same_group=False): """ Remove a METS `fileGrp`. 
@@ -302,7 +306,8 @@ def remove_file_group(self, USE, recursive=False, force=False, keep_files=False, file_dirs = [] if recursive: for f in self.mets.find_files(fileGrp=USE): - self.remove_file(f, force=force, keep_file=keep_files, page_recursive=page_recursive, page_same_group=page_same_group) + self.remove_file( + f, force=force, keep_file=keep_files, page_recursive=page_recursive, page_same_group=page_same_group) if f.local_filename: f_dir = path.dirname(f.local_filename) if f_dir: @@ -320,7 +325,6 @@ def remove_file_group(self, USE, recursive=False, force=False, keep_files=False, if Path(file_dir).is_dir() and not listdir(file_dir): Path(file_dir).rmdir() - def rename_file_group(self, old, new): """ Rename a METS `fileGrp`. @@ -361,7 +365,8 @@ def rename_file_group(self, old, new): new_id = sub(r'^%s' % old, r'%s' % new, mets_file.ID) try: next(self.mets.find_files(ID=new_id)) - log.warning("ID %s already exists, not changing ID while renaming %s -> %s" % (new_id, old_local_filename, new_local_filename)) + log.warning("ID %s already exists, not changing ID while renaming %s -> %s" % ( + new_id, old_local_filename, new_local_filename)) except StopIteration: mets_file.ID = new_id # change file paths in PAGE-XML imageFilename and filename attributes @@ -378,7 +383,8 @@ def rename_file_group(self, old, new): for old_local_filename, new_local_filename in local_filename_replacements.items(): if ai.filename == old_local_filename: changed = True - log.debug("Rename pc:Page/../AlternativeImage: %s -> %s" % (old_local_filename, new_local_filename)) + log.debug("Rename pc:Page/../AlternativeImage: %s -> %s" % ( + old_local_filename, new_local_filename)) ai.filename = new_local_filename if changed: log.debug("PAGE-XML changed, writing %s" % (page_file.local_filename)) @@ -502,7 +508,7 @@ def resolve_image_as_pil(self, image_url, coords=None): def _resolve_image_as_pil(self, image_url, coords=None): log = getLogger('ocrd.workspace._resolve_image_as_pil') pil_image = self._apply_mets_file(image_url, Image.open) - pil_image.load() # alloc and give up the FD + pil_image.load() # alloc and give up the FD # Pillow does not properly support higher color depths # (e.g. 16-bit or 32-bit or floating point grayscale), @@ -544,7 +550,7 @@ def _resolve_image_as_pil(self, image_url, coords=None): # FIXME: remove or replace this by (image_from_polygon+) crop_image ... 
log.debug("Converting PIL to OpenCV: %s", image_url) - color_conversion = COLOR_GRAY2BGR if pil_image.mode in ('1', 'L') else COLOR_RGB2BGR + color_conversion = COLOR_GRAY2BGR if pil_image.mode in ('1', 'L') else COLOR_RGB2BGR pil_as_np_array = np.array(pil_image).astype('uint8') if pil_image.mode == '1' else np.array(pil_image) cv2_image = cvtColor(pil_as_np_array, color_conversion) @@ -659,8 +665,8 @@ def image_from_page(self, page, page_id, orientation = (page_coords['angle'] + 45) % 360 orientation = orientation - (orientation % 90) skew = (page_coords['angle'] % 360) - orientation - skew = 180 - (180 - skew) % 360 # map to [-45,45] - page_coords['angle'] = 0 # nothing applied yet (depends on filters) + skew = 180 - (180 - skew) % 360 # map to [-45,45] + page_coords['angle'] = 0 # nothing applied yet (depends on filters) log.debug("page '%s' has %s orientation=%d skew=%.2f", page_id, "border," if border else "", orientation, skew) if page_image_info.resolution != 1: @@ -696,7 +702,7 @@ def image_from_page(self, page, page_id, for feature in feature_selector.split(',') if feature) and not any(feature in featureset for feature in feature_filter.split(',') if feature) and - len(featureset.difference(auto_features)) >= \ + len(featureset.difference(auto_features)) >= len(best_features.difference(auto_features))): best_features = featureset best_image = alternative_image @@ -705,7 +711,7 @@ def image_from_page(self, page, page_id, alternative_images.index(best_image) + 1, best_features, page_id) page_image = self._resolve_image_as_pil(best_image.get_filename()) - page_coords['features'] = best_image.get_comments() # including duplicates + page_coords['features'] = best_image.get_comments() # including duplicates # adjust the coord transformation to the steps applied on the image, # and apply steps on the existing image in case it is missing there, @@ -727,18 +733,18 @@ def image_from_page(self, page, page_id, for i, feature in enumerate(alternative_image_features + (['cropped'] if (border and - not 'cropped' in alternative_image_features and - not 'cropped' in feature_filter.split(',')) + 'cropped' not in alternative_image_features and + 'cropped' not in feature_filter.split(',')) else []) + (['rotated-%d' % orientation] if (orientation and - not 'rotated-%d' % orientation in alternative_image_features and - not 'rotated-%d' % orientation in feature_filter.split(',')) + 'rotated-%d' % orientation not in alternative_image_features and + 'rotated-%d' % orientation not in feature_filter.split(',')) else []) + (['deskewed'] if (skew and - not 'deskewed' in alternative_image_features and - not 'deskewed' in feature_filter.split(',')) + 'deskewed' not in alternative_image_features and + 'deskewed' not in feature_filter.split(',')) else []) + # not a feature to be added, but merely as a fallback position # to always enter loop at i == len(alternative_image_features) @@ -931,15 +937,15 @@ def image_from_segment(self, segment, parent_image, parent_coords, orientation = (angle + 45) % 360 orientation = orientation - (orientation % 90) skew = (angle % 360) - orientation - skew = 180 - (180 - skew) % 360 # map to [-45,45] + skew = 180 - (180 - skew) % 360 # map to [-45,45] log.debug("segment '%s' has orientation=%d skew=%.2f", segment.id, orientation, skew) else: orientation = 0 skew = 0 - segment_coords['angle'] = parent_coords['angle'] # nothing applied yet (depends on filters) + segment_coords['angle'] = parent_coords['angle'] # nothing applied yet (depends on filters) if 'DPI' in parent_coords: - 
segment_coords['DPI'] = parent_coords['DPI'] # not rescaled yet + segment_coords['DPI'] = parent_coords['DPI'] # not rescaled yet # initialize AlternativeImage@comments classes from parent, except # for those operations that can apply on multiple hierarchy levels: @@ -971,7 +977,7 @@ def image_from_segment(self, segment, parent_image, parent_coords, for feature in feature_selector.split(',') if feature) and not any(feature in featureset for feature in feature_filter.split(',') if feature) and - len(featureset.difference(auto_features)) >= \ + len(featureset.difference(auto_features)) >= len(best_features.difference(auto_features))): best_features = featureset best_image = alternative_image @@ -980,7 +986,7 @@ def image_from_segment(self, segment, parent_image, parent_coords, alternative_images.index(best_image) + 1, best_features, segment.id) segment_image = self._resolve_image_as_pil(alternative_image.get_filename()) - segment_coords['features'] = best_image.get_comments() # including duplicates + segment_coords['features'] = best_image.get_comments() # including duplicates alternative_image_features = segment_coords['features'].split(',') for duplicate_feature in set([feature for feature in alternative_image_features @@ -993,13 +999,13 @@ def image_from_segment(self, segment, parent_image, parent_coords, for i, feature in enumerate(alternative_image_features + (['rotated-%d' % orientation] if (orientation and - not 'rotated-%d' % orientation in alternative_image_features and - not 'rotated-%d' % orientation in feature_filter.split(',')) + 'rotated-%d' % orientation not in alternative_image_features and + 'rotated-%d' % orientation not in feature_filter.split(',')) else []) + (['deskewed'] if (skew and - not 'deskewed' in alternative_image_features and - not 'deskewed' in feature_filter.split(',')) + 'deskewed' not in alternative_image_features and + 'deskewed' not in feature_filter.split(',')) else []) + # not a feature to be added, but merely as a fallback position # to always enter loop at i == len(alternative_image_features) @@ -1052,13 +1058,13 @@ def image_from_segment(self, segment, parent_image, parent_coords, return segment_image, segment_coords # pylint: disable=redefined-builtin - def save_image_file(self, image : Image.Image, - file_id : str, - file_grp : str, - file_path : Optional[str] = None, - page_id : Optional[str] = None, - mimetype : str = 'image/png', - force : bool = False) -> str: + def save_image_file(self, image: Image.Image, + file_id: str, + file_grp: str, + file_path: Optional[str] = None, + page_id: Optional[str] = None, + mimetype: str = 'image/png', + force: bool = False) -> str: """Store an image in the filesystem and reference it as new file in the METS. 
Args: @@ -1120,6 +1126,7 @@ def find_files(self, *args, **kwargs): with pushd_popd(self.directory): return self.mets.find_files(*args, **kwargs) + def _crop(log, name, segment, parent_image, parent_coords, op='cropped', **kwargs): segment_coords = parent_coords.copy() # get polygon outline of segment relative to parent image: @@ -1131,8 +1138,8 @@ def _crop(log, name, segment, parent_image, parent_coords, op='cropped', **kwarg # also possibly different from size after rotation below/AlternativeImage): segment_xywh = xywh_from_bbox(*segment_bbox) # crop, if (still) necessary: - if (not isinstance(segment, BorderType) or # always crop below page level - not op in parent_coords['features']): + if (not isinstance(segment, BorderType) or # always crop below page level + op not in parent_coords['features']): if op == 'recropped': log.debug("Recropping %s", name) elif isinstance(segment, BorderType): @@ -1152,6 +1159,7 @@ def _crop(log, name, segment, parent_image, parent_coords, op='cropped', **kwarg -segment_bbox[1]])) return segment_image, segment_coords, segment_xywh + def _reflect(log, name, orientation, segment_image, segment_coords, segment_xywh): # Transpose in affine coordinate transform: # (consistent with image transposition or AlternativeImage below) @@ -1159,7 +1167,7 @@ def _reflect(log, name, orientation, segment_image, segment_coords, segment_xywh 90: Image.Transpose.ROTATE_90, 180: Image.Transpose.ROTATE_180, 270: Image.Transpose.ROTATE_270 - }.get(orientation) # no default + }.get(orientation) # no default segment_coords['transform'] = transpose_coordinates( segment_coords['transform'], transposition, np.array([0.5 * segment_xywh['w'], @@ -1174,6 +1182,7 @@ def _reflect(log, name, orientation, segment_image, segment_coords, segment_xywh segment_coords['features'] += ',rotated-%d' % orientation return segment_image, segment_coords, segment_xywh + def _rotate(log, name, skew, segment, segment_image, segment_coords, segment_xywh, **kwargs): # Rotate around center in affine coordinate transform: # (consistent with image rotation or AlternativeImage below) @@ -1185,12 +1194,12 @@ def _rotate(log, name, skew, segment, segment_image, segment_coords, segment_xyw [segment_xywh['w'], segment_xywh['h']], skew) segment_coords['angle'] += skew # deskew, if (still) necessary: - if not 'deskewed' in segment_coords['features']: + if 'deskewed' not in segment_coords['features']: log.debug("Rotating %s by %.2f°", name, skew) segment_image = rotate_image(segment_image, skew, **kwargs) segment_coords['features'] += ',deskewed' if (segment and - (not isinstance(segment, BorderType) or # always crop below page level + (not isinstance(segment, BorderType) or # always crop below page level 'cropped' in segment_coords['features'])): # re-crop to new bbox (which may deviate # if segment polygon was not a rectangle) @@ -1198,7 +1207,7 @@ def _rotate(log, name, skew, segment, segment_image, segment_coords, segment_xyw log, name, segment, segment_image, segment_coords, op='recropped', **kwargs) elif (segment and - (not isinstance(segment, BorderType) or # always crop below page level + (not isinstance(segment, BorderType) or # always crop below page level 'cropped' in segment_coords['features'])): # only shift coordinates as if re-cropping segment_polygon = coordinates_of_segment(segment, segment_image, segment_coords) @@ -1210,6 +1219,7 @@ def _rotate(log, name, skew, segment, segment_image, segment_coords, segment_xyw -segment_bbox[1]])) return segment_image, segment_coords, segment_xywh + def _scale(log, 
name, factor, segment_image, segment_coords, segment_xywh, **kwargs): # Resize linearly segment_coords['transform'] = scale_coordinates( @@ -1218,7 +1228,7 @@ def _scale(log, name, factor, segment_image, segment_coords, segment_xywh, **kwa segment_xywh['w'] *= factor segment_xywh['h'] *= factor # resize, if (still) necessary - if not 'scaled' in segment_coords['features']: + if 'scaled' not in segment_coords['features']: log.debug("Scaling %s by %.2f", name, factor) segment_coords['features'] += ',scaled' # FIXME: validate factor against PAGE-XML attributes diff --git a/src/ocrd/workspace_backup.py b/src/ocrd/workspace_backup.py index 87ee884bd..a960aadb8 100644 --- a/src/ocrd/workspace_backup.py +++ b/src/ocrd/workspace_backup.py @@ -10,9 +10,11 @@ from .constants import BACKUP_DIR + def _chksum(s): return hashlib.sha256(s).hexdigest() + class WorkspaceBackup(): @classmethod @@ -37,6 +39,7 @@ def __str__(self): self.mets_xml.file_groups ) + class WorkspaceBackupManager(): """ Manages backups of a workspace in a directory BACKUP_DIR diff --git a/src/ocrd/workspace_bagger.py b/src/ocrd/workspace_bagger.py index a30dbfb02..37b1e3fc1 100644 --- a/src/ocrd/workspace_bagger.py +++ b/src/ocrd/workspace_bagger.py @@ -1,5 +1,5 @@ from datetime import datetime -from os import makedirs, chdir, walk +from os import makedirs, walk from os.path import join, isdir, basename as os_path_basename, exists, relpath from pathlib import Path from shutil import make_archive, rmtree, copyfile, move, copytree @@ -7,7 +7,11 @@ import re import tempfile import sys -from bagit import Bag, make_manifests, _load_tag_file, _make_tag_file, _make_tagmanifest_file # pylint: disable=no-name-in-module +from bagit import ( + Bag, + make_manifests, + _load_tag_file, _make_tag_file, _make_tagmanifest_file, # pylint: disable=no-name-in-module +) from ocrd_utils import ( pushd_popd, @@ -25,10 +29,11 @@ from .workspace import Workspace -tempfile.tempdir = '/tmp' # TODO hard-coded +tempfile.tempdir = '/tmp' # TODO hard-coded BACKUPDIR = join('/tmp', TMP_BAGIT_PREFIX + 'backup') + class WorkspaceBagger(): """ Serialize/De-serialize from OCRD-ZIP to workspace and back. @@ -50,7 +55,7 @@ def _serialize_bag(self, workspace, bagdir, dest, skip_zip): def _log_or_raise(self, msg): log = getLogger('ocrd.workspace_bagger') if self.strict: - raise(Exception(msg)) + raise Exception(msg) else: log.info(msg) @@ -112,10 +117,11 @@ def _bag_mets_files( log.info("New vs. 
old: %s" % changed_local_filenames) return total_bytes, total_files - def _set_bag_info(self, bag, total_bytes, total_files, ocrd_identifier, ocrd_base_version_checksum, ocrd_mets=DEFAULT_METS_BASENAME): + def _set_bag_info(self, bag, total_bytes, total_files, ocrd_identifier, ocrd_base_version_checksum, + ocrd_mets=DEFAULT_METS_BASENAME): bag.info['BagIt-Profile-Identifier'] = OCRD_BAGIT_PROFILE_URL bag.info['Bag-Software-Agent'] = 'ocrd/core %s (bagit.py %s, bagit_profile %s) [cmdline: "%s"]' % ( - VERSION, # TODO + VERSION, # TODO dist_version('ocrd-fork-bagit'), dist_version('ocrd-fork-bagit_profile'), ' '.join(sys.argv)) @@ -139,7 +145,7 @@ def bag(self, tag_files=None, include_fileGrp=None, exclude_fileGrp=None, - ): + ): """ Bag a workspace @@ -178,7 +184,8 @@ def bag(self, f.write(BAGIT_TXT.encode('utf-8')) # create manifests - total_bytes, total_files = self._bag_mets_files(workspace, bagdir, ocrd_mets, processes, include_fileGrp, exclude_fileGrp) + total_bytes, total_files = self._bag_mets_files(workspace, bagdir, ocrd_mets, processes, + include_fileGrp, exclude_fileGrp) # create bag-info.txt bag = Bag(bagdir) diff --git a/src/ocrd_modelfactory/__init__.py b/src/ocrd_modelfactory/__init__.py index 3f7d675f8..fa4ccea08 100644 --- a/src/ocrd_modelfactory/__init__.py +++ b/src/ocrd_modelfactory/__init__.py @@ -40,7 +40,8 @@ def exif_from_filename(image_filename): ocrd_exif = OcrdExif(pil_img) return ocrd_exif -def page_from_image(input_file : Union[OcrdFile, ClientSideOcrdFile], **kwargs) -> OcrdPage: + +def page_from_image(input_file: Union[OcrdFile, ClientSideOcrdFile], **kwargs) -> OcrdPage: """ Create :py:class:`~ocrd_models.ocrd_page.OcrdPage` from an :py:class:`~ocrd_models.ocrd_file.OcrdFile` @@ -73,10 +74,11 @@ def page_from_image(input_file : Union[OcrdFile, ClientSideOcrdFile], **kwargs) pcGtsId=input_file.ID ) mapping = {} - etree : ET._Element = pcgts.to_etree(mapping_=mapping) + etree: ET._Element = pcgts.to_etree(mapping_=mapping) revmap = dict(((node, element) for element, node in mapping.items())) return OcrdPage(pcgts, etree, mapping, revmap) + def page_from_file(input_file, **kwargs) -> OcrdPage: """ Create :py:class:`~ocrd_models.ocrd_page.OcrdPage` diff --git a/src/ocrd_models/constants.py b/src/ocrd_models/constants.py index f7f71f8f8..f2914dcc9 100644 --- a/src/ocrd_models/constants.py +++ b/src/ocrd_models/constants.py @@ -5,7 +5,7 @@ from enum import Enum, auto from dataclasses import dataclass, field from abc import ABC, abstractmethod -from typing import Any, List, Optional, Union +from typing import Any, List, Union from ocrd_utils import resource_string __all__ = [ @@ -107,14 +107,17 @@ class METS_PAGE_DIV_ATTRIBUTE(Enum): @classmethod def names(cls): return [x.name for x in cls] + @classmethod def type_prefix(cls): """disambiguation prefix to use for all subtypes""" return "physical:" + def prefix(self): """disambiguation prefix to use for this attribute type""" return self.type_prefix() + self.name.lower() + ":" + class METS_STRUCT_DIV_ATTRIBUTE(Enum): """page selection attributes of LOGICAL mets:structMap//mets:div""" ID = auto() @@ -125,14 +128,17 @@ class METS_STRUCT_DIV_ATTRIBUTE(Enum): @classmethod def names(cls): return [x.name for x in cls] + @classmethod def type_prefix(cls): """disambiguation prefix to use for all subtypes""" return "logical:" + def prefix(self): """disambiguation prefix to use for this attribute type""" return self.type_prefix() + self.name.lower() + ":" + @dataclass class METS_DIV_ATTRIBUTE_PATTERN(ABC): """page selection 
pattern (abstract supertype)""" @@ -161,22 +167,27 @@ def attr_prefix(self): @abstractmethod def _matches(self, input) -> bool: return + def matches(self, input) -> bool: """does the selection pattern match on the given attribute value?""" if (matched := self._matches(input)): self.has_matched = True return matched + @dataclass class METS_DIV_ATTRIBUTE_ATOM_PATTERN(METS_DIV_ATTRIBUTE_PATTERN): """page selection pattern for literal (single value) matching""" expr: str + def __repr__(self): return "%s%s" % (self.attr_prefix(), self.expr) + def _matches(self, input): return input == self.expr + @dataclass class METS_DIV_ATTRIBUTE_RANGE_PATTERN(METS_DIV_ATTRIBUTE_PATTERN): """page selection pattern for interval (list expansion) matching""" @@ -186,20 +197,26 @@ class METS_DIV_ATTRIBUTE_RANGE_PATTERN(METS_DIV_ATTRIBUTE_PATTERN): """first value of the range after expansion, before matching-exhausting""" stop: str = field(init=False) """last value of the range after expansion, before matching-exhausting""" + def __post_init__(self): self.start = self.expr[0] self.stop = self.expr[-1] + def __repr__(self): return "%s%s..%s" % (self.attr_prefix(), self.start, self.stop) + def _matches(self, input): return input in self.expr + @dataclass class METS_DIV_ATTRIBUTE_REGEX_PATTERN(METS_DIV_ATTRIBUTE_PATTERN): """page selection pattern for regular expression matching""" expr: Pattern + def __repr__(self): return "%s//%s" % (self.attr_prefix(), self.expr.pattern) + def _matches(self, input): return bool(self.expr.fullmatch(input)) diff --git a/src/ocrd_models/ocrd_agent.py b/src/ocrd_models/ocrd_agent.py index 5a70e5b76..f530c3291 100644 --- a/src/ocrd_models/ocrd_agent.py +++ b/src/ocrd_models/ocrd_agent.py @@ -5,6 +5,7 @@ from .constants import NAMESPACES as NS, TAG_METS_AGENT, TAG_METS_NAME, TAG_METS_NOTE from .ocrd_xml_base import ET + class OcrdAgent(): """ Represents a @@ -195,4 +196,3 @@ def __str__(self): for k in ['type', 'othertype', 'role', 'otherrole', 'name'] ]) return '' - diff --git a/src/ocrd_models/ocrd_exif.py b/src/ocrd_models/ocrd_exif.py index 937416f5e..07023c8a6 100644 --- a/src/ocrd_models/ocrd_exif.py +++ b/src/ocrd_models/ocrd_exif.py @@ -8,6 +8,7 @@ from shutil import which from ocrd_utils import getLogger + class OcrdExif(): """Represents technical image metadata. 
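
The next hunk only rewraps the overlong `identify` invocations in `run_identify` for line length; the probing technique itself is unchanged. For reference, a minimal, self-contained sketch of that technique (the `probe_density` helper is hypothetical, not part of `OcrdExif`; it assumes ImageMagick's `identify` is on PATH and mirrors the `-format` string used below):

    # Probe pixel density via ImageMagick; return None so the caller can
    # fall back to PIL metadata instead (as run_pil() does).
    from shutil import which
    from subprocess import run, PIPE

    def probe_density(filename):
        """Return (xres, yres, unit) as reported by `identify`, or None."""
        if not which('identify'):
            return None
        ret = run(['identify', '-format', r'%[resolution.x] %[resolution.y] %U ', filename],
                  check=False, stdout=PIPE, stderr=PIPE)
        parts = ret.stdout.decode('utf-8').split()
        if ret.returncode or len(parts) < 3:
            return None
        return float(parts[0]), float(parts[1]), parts[2]
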
@@ -42,18 +43,21 @@ def __init__(self, img): if which('identify'): self.run_identify(img) else: - getLogger('ocrd.exif').warning("ImageMagick 'identify' not available, Consider installing ImageMagick for more robust pixel density estimation") + getLogger('ocrd.exif').warning("ImageMagick 'identify' not available, " + "Consider installing ImageMagick for more robust pixel density estimation") self.run_pil(img) def run_identify(self, img): for prop in ['compression', 'photometric_interpretation']: setattr(self, prop, img.info[prop] if prop in img.info else None) if img.filename: - ret = run(['identify', '-format', r'%[resolution.x] %[resolution.y] %U ', img.filename], check=False, stderr=PIPE, stdout=PIPE) + ret = run(['identify', '-format', r'%[resolution.x] %[resolution.y] %U ', img.filename], + check=False, stderr=PIPE, stdout=PIPE) else: with BytesIO() as bio: img.save(bio, format=img.format) - ret = run(['identify', '-format', r'%[resolution.x] %[resolution.y] %U ', '/dev/stdin'], check=False, stderr=PIPE, stdout=PIPE, input=bio.getvalue()) + ret = run(['identify', '-format', r'%[resolution.x] %[resolution.y] %U ', '/dev/stdin'], + check=False, stderr=PIPE, stdout=PIPE, input=bio.getvalue()) if ret.returncode: stderr = ret.stderr.decode('utf-8') if 'no decode delegate for this image format' in stderr: diff --git a/src/ocrd_models/ocrd_file.py b/src/ocrd_models/ocrd_file.py index 91eac8d8e..bfd4b2f2d 100644 --- a/src/ocrd_models/ocrd_file.py +++ b/src/ocrd_models/ocrd_file.py @@ -6,9 +6,10 @@ from ocrd_utils import deprecation_warning -from .ocrd_xml_base import ET # type: ignore +from .ocrd_xml_base import ET # type: ignore from .constants import NAMESPACES as NS, TAG_METS_FLOCAT + class OcrdFile(): """ Represents a single ``mets:file/mets:FLocat`` (METS file entry). @@ -62,11 +63,12 @@ def __str__(self): return ' ' % (fileGrp, props) def __eq__(self, other): - return self.ID == other.ID \ - and self.url == other.url \ - and self.local_filename == other.local_filename - # EXT_TO_MIME[MIME_TO_EXT[self.mimetype]] == EXT_TO_MIME[MIME_TO_EXT[other.mimetype]] and \ - # self.fileGrp == other.fileGrp + return (self.ID == other.ID and + self.url == other.url and + self.local_filename == other.local_filename # and + # EXT_TO_MIME[MIME_TO_EXT[self.mimetype]] == EXT_TO_MIME[MIME_TO_EXT[other.mimetype]] and + # self.fileGrp == other.fileGrp + ) @property def basename(self) -> str: @@ -100,7 +102,7 @@ def ID(self) -> str: return self._el.get('ID') @ID.setter - def ID(self, ID : Optional[str]) -> None: + def ID(self, ID: Optional[str]) -> None: """ Set the ``@ID`` of the ``mets:file`` to :py:attr:`ID`. """ @@ -117,16 +119,18 @@ def ID(self, ID : Optional[str]) -> None: @property def pageId(self) -> str: """ - Get the ``@ID`` of the physical ``mets:structMap`` entry corresponding to this ``mets:file`` (physical page manifestation). + Get the ``@ID`` of the physical ``mets:structMap`` entry corresponding to this ``mets:file`` + (physical page manifestation). """ if self.mets is None: raise Exception("OcrdFile %s has no member 'mets' pointing to parent OcrdMets" % self) return self.mets.get_physical_page_for_file(self) @pageId.setter - def pageId(self, pageId : Optional[str]) -> None: + def pageId(self, pageId: Optional[str]) -> None: """ - Get the ``@ID`` of the physical ``mets:structMap`` entry corresponding to this ``mets:file`` (physical page manifestation) to :py:attr:`pageId`. 
+ Get the ``@ID`` of the physical ``mets:structMap`` entry corresponding to this ``mets:file`` + (physical page manifestation) to :py:attr:`pageId`. """ if pageId is None: return @@ -139,7 +143,7 @@ def loctypes(self) -> List[str]: """ Get the ``@LOCTYPE``s of the ``mets:file``. """ - return [x.get('LOCTYPE') for x in self._el.findall('mets:FLocat', NS)] + return [x.get('LOCTYPE') for x in self._el.findall('mets:FLocat', NS)] @property def mimetype(self) -> str: @@ -149,7 +153,7 @@ def mimetype(self) -> str: return self._el.get('MIMETYPE') @mimetype.setter - def mimetype(self, mimetype : Optional[str]) -> None: + def mimetype(self, mimetype: Optional[str]) -> None: """ Set the ``@MIMETYPE`` of the ``mets:file`` to :py:attr:`mimetype`. """ @@ -178,7 +182,7 @@ def url(self) -> str: return '' @url.setter - def url(self, url : Optional[str]) -> None: + def url(self, url: Optional[str]) -> None: """ Set the remote/original URL ``@xlink:href`` of this ``mets:file`` to :py:attr:`url`. """ @@ -203,7 +207,7 @@ def local_filename(self) -> Optional[str]: return None @local_filename.setter - def local_filename(self, fname : Optional[Union[Path, str]]): + def local_filename(self, fname: Optional[Union[Path, str]]): """ Set the local/cached ``@xlink:href`` of this ``mets:file`` to :py:attr:`local_filename`. """ @@ -230,12 +234,12 @@ class ClientSideOcrdFile: def __init__( self, - el, # pylint: disable=unused-argument + el, # pylint: disable=unused-argument mimetype: str = '', pageId: str = '', - loctype: str ='OTHER', + loctype: str = 'OTHER', local_filename: Optional[str] = None, - mets : Any = None, # pylint: disable=unused-argument + mets: Any = None, # pylint: disable=unused-argument url: str = '', ID: str = '', fileGrp: str = '' @@ -248,8 +252,8 @@ def __init__( mimetype (string): ``@MIMETYPE`` of this ``mets:file`` pageId (string): ``@ID`` of the physical ``mets:structMap`` entry corresponding to this ``mets:file`` loctype (string): ``@LOCTYPE`` of this ``mets:file`` - url (string): ignored XXX the remote/original file once we have proper mets:FLocat bookkeeping - local_filename (): ``@xlink:href`` of this ``mets:file`` - XXX the local file once we have proper mets:FLocat bookkeeping + url (string): ``@xlink:href`` of this ``mets:file`` (if ``@LOCTYPE==URL``) + local_filename (): ``@xlink:href`` of this ``mets:file`` (if ``@LOCTYPE==FILE @OTHERLOCTYPE==FILE``) ID (string): ``@ID`` of this ``mets:file`` """ self.ID = ID @@ -267,4 +271,5 @@ def __str__(self): ]) return '' % (props) + OcrdFileType = Union[OcrdFile, ClientSideOcrdFile] diff --git a/src/ocrd_models/ocrd_mets.py b/src/ocrd_models/ocrd_mets.py index 152ab49de..1dc1b2c1e 100644 --- a/src/ocrd_models/ocrd_mets.py +++ b/src/ocrd_models/ocrd_mets.py @@ -3,7 +3,6 @@ """ from datetime import datetime import re -from lxml import etree as ET from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, Union from ocrd_utils import ( @@ -37,44 +36,45 @@ METS_DIV_ATTRIBUTE_REGEX_PATTERN, ) -from .ocrd_xml_base import OcrdXmlDocument, ET # type: ignore +from .ocrd_xml_base import OcrdXmlDocument, ET # type: ignore from .ocrd_file import OcrdFile from .ocrd_agent import OcrdAgent REGEX_PREFIX_LEN = len(REGEX_PREFIX) + class OcrdMets(OcrdXmlDocument): """ API to a single METS file """ - _cache_flag : bool + _cache_flag: bool # Cache for the physical pages (mets:div) - two nested dictionaries # The outer dictionary's key: attribute type # The outer dictionary's value: inner dictionary # The inner dictionary's key: attribute value (str) # 
The inner dictionary's value: a 'div' object at some memory location - _page_cache : Dict[METS_PAGE_DIV_ATTRIBUTE, Dict[str, ET._Element]] + _page_cache: Dict[METS_PAGE_DIV_ATTRIBUTE, Dict[str, ET._Element]] # Cache for the files (mets:file) - two nested dictionaries # The outer dictionary's Key: 'fileGrp.USE' # The outer dictionary's Value: Inner dictionary # The inner dictionary's Key: 'file.ID' # The inner dictionary's Value: a 'file' object at some memory location - _file_cache : Dict[str, Dict[str, ET._Element]] + _file_cache: Dict[str, Dict[str, ET._Element]] # Cache for the file pointers (mets:fptr) - two nested dictionaries # The outer dictionary's Key: 'div.ID' # The outer dictionary's Value: Inner dictionary # The inner dictionary's Key: 'fptr.FILEID' # The inner dictionary's Value: a 'fptr' object at some memory location - _fptr_cache : Dict[str, Dict[str, ET._Element]] + _fptr_cache: Dict[str, Dict[str, ET._Element]] # Cache for the logical structural divs (mets:div) - two nested dictionaries # The outer dictionary's key: attribute type # The outer dictionary's value: inner dictionary # The inner dictionary's key: attribute value (str) # The inner dictionary's value: a list of corresponding physical div.ID - _struct_cache : Dict[METS_STRUCT_DIV_ATTRIBUTE, Dict[str, List[str]]] + _struct_cache: Dict[METS_STRUCT_DIV_ATTRIBUTE, Dict[str, List[str]]] @staticmethod - def empty_mets(now : Optional[str] = None, cache_flag : bool = False): + def empty_mets(now: Optional[str] = None, cache_flag: bool = False): """ Create an empty METS file from bundled template. """ @@ -94,11 +94,11 @@ def __init__(self, **kwargs) -> None: # then enable caching, if "false", disable caching, overriding the # kwarg to the constructor if config.is_set('OCRD_METS_CACHING'): - getLogger('ocrd.models.ocrd_mets').debug('METS Caching %s because OCRD_METS_CACHING is %s', - 'enabled' if config.OCRD_METS_CACHING else 'disabled', config.raw_value('OCRD_METS_CACHING')) + getLogger('ocrd.models.ocrd_mets').debug( + 'METS Caching %s because OCRD_METS_CACHING is %s', + 'enabled' if config.OCRD_METS_CACHING else 'disabled', config.raw_value('OCRD_METS_CACHING')) self._cache_flag = config.OCRD_METS_CACHING - # If cache is enabled if self._cache_flag: self._initialize_caches() @@ -109,7 +109,7 @@ def __str__(self) -> str: String representation """ return 'OcrdMets[cached=%s,fileGrps=%s,files=%s]' % ( - self._cache_flag, self.file_groups, list(self.find_files())) + self._cache_flag, self.file_groups, list(self.find_files())) def _fill_caches(self) -> None: """ @@ -181,9 +181,9 @@ def _fill_caches(self) -> None: def _initialize_caches(self) -> None: self._file_cache = {} # NOTE we can only guarantee uniqueness for @ID and @ORDER - self._page_cache = {k : {} for k in METS_PAGE_DIV_ATTRIBUTE} + self._page_cache = {k: {} for k in METS_PAGE_DIV_ATTRIBUTE} self._fptr_cache = {} - self._struct_cache = {k : {} for k in METS_STRUCT_DIV_ATTRIBUTE} + self._struct_cache = {k: {} for k in METS_STRUCT_DIV_ATTRIBUTE} def _refresh_caches(self) -> None: if self._cache_flag: @@ -205,7 +205,7 @@ def unique_identifier(self) -> Optional[str]: return found.text @unique_identifier.setter - def unique_identifier(self, purl : str) -> None: + def unique_identifier(self, purl: str) -> None: """ Set the unique identifier by looking through ``mods:identifier`` See `specs `_ for details. 
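
A note on the cache layout annotated above before the next hunk: every cache is a two-level dictionary, keyed first by attribute kind and then by attribute value, so page/file/fptr lookups become O(1) dictionary reads instead of repeated XPath scans. A toy illustration with stand-in names (not the class itself; caching is only active when `OCRD_METS_CACHING` is set to `true`):

    from enum import Enum, auto

    class PageAttr(Enum):  # stand-in for METS_PAGE_DIV_ATTRIBUTE
        ID = auto()
        ORDER = auto()

    # _initialize_caches() shape: one inner dict per attribute kind
    page_cache = {attr: {} for attr in PageAttr}
    # _fill_caches() then maps attribute values to their mets:div elements
    page_cache[PageAttr.ID]['PHYS_0001'] = '<div element>'

    def cached_div(attr, value):
        """O(1) lookup instead of scanning the physical structMap."""
        return page_cache[attr].get(value)

    assert cached_div(PageAttr.ID, 'PHYS_0001') == '<div element>'
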
@@ -268,15 +268,15 @@ def find_all_files(self, *args, **kwargs) -> List[OcrdFile]: # pylint: disable=multiple-statements def find_files( self, - ID : Optional[str] = None, - fileGrp : Optional[str] = None, - pageId : Optional[str] = None, - mimetype : Optional[str] = None, - url : Optional[str] = None, - local_filename : Optional[str] = None, - local_only : bool = False, - include_fileGrp : Optional[List[str]] = None, - exclude_fileGrp : Optional[List[str]] = None, + ID: Optional[str] = None, + fileGrp: Optional[str] = None, + pageId: Optional[str] = None, + mimetype: Optional[str] = None, + url: Optional[str] = None, + local_filename: Optional[str] = None, + local_only: bool = False, + include_fileGrp: Optional[List[str]] = None, + exclude_fileGrp: Optional[List[str]] = None, ) -> Iterator[OcrdFile]: """ Search ``mets:file`` entries in this METS document and yield results. @@ -346,24 +346,30 @@ def find_files( for cand in candidates: if ID: if isinstance(ID, str): - if not ID == cand.get('ID'): continue + if not ID == cand.get('ID'): + continue else: - if not ID.fullmatch(cand.get('ID')): continue + if not ID.fullmatch(cand.get('ID')): + continue if pageId is not None and cand.get('ID') not in pageId_list: continue if not self._cache_flag and fileGrp: if isinstance(fileGrp, str): - if cand.getparent().get('USE') != fileGrp: continue + if cand.getparent().get('USE') != fileGrp: + continue else: - if not fileGrp.fullmatch(cand.getparent().get('USE')): continue + if not fileGrp.fullmatch(cand.getparent().get('USE')): + continue if mimetype: if isinstance(mimetype, str): - if cand.get('MIMETYPE') != mimetype: continue + if cand.get('MIMETYPE') != mimetype: + continue else: - if not mimetype.fullmatch(cand.get('MIMETYPE') or ''): continue + if not mimetype.fullmatch(cand.get('MIMETYPE') or ''): + continue if url: cand_locat = cand.find('mets:FLocat[@LOCTYPE="URL"]', namespaces=NS) @@ -371,9 +377,11 @@ def find_files( continue cand_url = cand_locat.get('{%s}href' % NS['xlink']) if isinstance(url, str): - if cand_url != url: continue + if cand_url != url: + continue else: - if not url.fullmatch(cand_url): continue + if not url.fullmatch(cand_url): + continue if local_filename: cand_locat = cand.find('mets:FLocat[@LOCTYPE="OTHER"][@OTHERLOCTYPE="FILE"]', namespaces=NS) @@ -381,9 +389,11 @@ def find_files( continue cand_local_filename = cand_locat.get('{%s}href' % NS['xlink']) if isinstance(local_filename, str): - if cand_local_filename != local_filename: continue + if cand_local_filename != local_filename: + continue else: - if not local_filename.fullmatch(cand_local_filename): continue + if not local_filename.fullmatch(cand_local_filename): + continue if local_only: # deprecation_warning("'local_only' is deprecated, use 'local_filename=\"//.+\"' instead") @@ -435,7 +445,7 @@ def rename_file_group(self, old: str, new: str) -> None: if self._cache_flag: self._file_cache[new] = self._file_cache.pop(old) - def remove_file_group(self, USE: str, recursive : bool = False, force : bool = False) -> None: + def remove_file_group(self, USE: str, recursive: bool = False, force: bool = False) -> None: """ Remove a ``mets:fileGrp`` (single fixed ``@USE`` or multiple regex ``@USE``) Arguments: @@ -479,16 +489,16 @@ def remove_file_group(self, USE: str, recursive : bool = False, force : bool = F if self._cache_flag: # Note: Since the files inside the group are removed - # with the 'remove_one_file' method above, + # with the 'remove_one_file' method above, # we should not take care of that again. 
# We just remove the fileGrp. del self._file_cache[el_fileGrp.get('USE')] el_fileGrp.getparent().remove(el_fileGrp) - def add_file(self, fileGrp : str, mimetype : Optional[str] = None, url : Optional[str] = None, - ID : Optional[str] = None, pageId : Optional[str] = None, force : bool = False, - local_filename : Optional[str] = None, ignore : bool = False, **kwargs) -> OcrdFile: + def add_file(self, fileGrp: str, mimetype: Optional[str] = None, url: Optional[str] = None, + ID: Optional[str] = None, pageId: Optional[str] = None, force: bool = False, + local_filename: Optional[str] = None, ignore: bool = False, **kwargs) -> OcrdFile: """ Instantiate and add a new :py:class:`ocrd_models.ocrd_file.OcrdFile`. Arguments: @@ -499,7 +509,8 @@ def add_file(self, fileGrp : str, mimetype : Optional[str] = None, url : Optiona ID (string): ``@ID`` of the ``mets:file`` to use pageId (string): ``@ID`` in the physical ``mets:structMap`` to link to force (boolean): Whether to add the file even if a ``mets:file`` with the same ``@ID`` already exists. - ignore (boolean): Do not look for existing files at all. Shift responsibility for preventing errors from duplicate ID to the user. + ignore (boolean): Do not look for existing files at all. + (Shifts responsibility for preventing errors from duplicate ID to the user.) local_filename (string): """ if not ID: @@ -541,7 +552,7 @@ def add_file(self, fileGrp : str, mimetype : Optional[str] = None, url : Optiona return mets_file - def remove_file(self, *args, **kwargs) -> Union[List[OcrdFile],OcrdFile]: + def remove_file(self, *args, **kwargs) -> Union[List[OcrdFile], OcrdFile]: """ Delete each ``ocrd:file`` matching the query. Same arguments as :py:meth:`find_files` """ @@ -559,12 +570,14 @@ def remove_file(self, *args, **kwargs) -> Union[List[OcrdFile],OcrdFile]: return [] raise FileNotFoundError("File not found: %s %s" % (args, kwargs)) - def remove_one_file(self, ID : Union[str, OcrdFile], fileGrp : str = None) -> OcrdFile: + def remove_one_file(self, ID: Union[str, OcrdFile], fileGrp: str = None) -> OcrdFile: """ Delete an existing :py:class:`ocrd_models.ocrd_file.OcrdFile`. Arguments: - ID (string|OcrdFile): ``@ID`` of the ``mets:file`` to delete Can also be an :py:class:`ocrd_models.ocrd_file.OcrdFile` to avoid search via ``ID``. - fileGrp (string): ``@USE`` of the ``mets:fileGrp`` containing the ``mets:file``. Used only for optimization. + ID (string|OcrdFile): ``@ID`` of the ``mets:file`` to delete. + (Can also be an :py:class:`ocrd_models.ocrd_file.OcrdFile` to avoid search via ``ID``.) + fileGrp (string): ``@USE`` of the ``mets:fileGrp`` containing the ``mets:file``. + (Used only for optimization.) Returns: The old :py:class:`ocrd_models.ocrd_file.OcrdFile` reference. 
""" @@ -629,8 +642,8 @@ def physical_pages(self) -> List[str]: 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]/@ID', namespaces=NS)] - def get_physical_pages(self, for_fileIds : Optional[List[str]] = None, for_pageIds : Optional[str] = None, - return_divs : bool = False) -> List[Union[str, ET._Element]]: + def get_physical_pages(self, for_fileIds: Optional[List[str]] = None, for_pageIds: Optional[str] = None, + return_divs: bool = False) -> List[Union[str, ET._Element]]: """ List all page IDs (the ``@ID`` of each physical ``mets:structMap`` ``mets:div``), optionally for a subset of ``mets:file`` ``@ID`` :py:attr:`for_fileIds`, @@ -718,7 +731,7 @@ def get_physical_pages(self, for_fileIds : Optional[List[str]] = None, for_pageI if for_fileIds == []: return [] - assert for_fileIds # at this point we know for_fileIds is set, assert to convince pyright + assert for_fileIds # at this point we know for_fileIds is set, assert to convince pyright ret = [None] * len(for_fileIds) if self._cache_flag: for pageId, fptrdict in self._fptr_cache.items(): @@ -793,7 +806,6 @@ def get_physical_page_patterns(self, page_attr_patterns: List[METS_DIV_ATTRIBUTE val = struct_cache[attr].setdefault(str(el_div.get(attr.name)), list()) val.extend(smlink_map.get(el_div.get('ID'), [])) log.debug("found %d smLink entries for %d logical divs", len(el_smlink_list), len(el_struct_list)) - page_attr_patterns_matched = [] for page in self._tree.getroot().xpath( 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]', namespaces=NS): @@ -811,7 +823,7 @@ def get_physical_page_patterns(self, page_attr_patterns: List[METS_DIV_ATTRIBUTE METS_STRUCT_DIV_ATTRIBUTE.LABEL]): continue if cache_keys := [v for v in cache if pat.matches(v)]: - pat.attr = [attr] # disambiguate next + pat.attr = [attr] # disambiguate next if isinstance(attr, METS_PAGE_DIV_ATTRIBUTE): ret.append(page) log.debug('physical match for %s on page %s', pat, page.get('ID')) @@ -835,7 +847,7 @@ def get_physical_page_patterns(self, page_attr_patterns: List[METS_DIV_ATTRIBUTE pat.expr.remove(cache_key) if not pat.expr: patterns_exhausted.append(pat) - break # no more attributes for this pattern + break # no more attributes for this pattern # keep matching in order to exhaust and consume pattern list #if page in ret: # break # no more patterns for this page @@ -847,7 +859,7 @@ def get_physical_page_patterns(self, page_attr_patterns: List[METS_DIV_ATTRIBUTE raise ValueError(f"Patterns {unmatched} match none of the pages") ranges_without_start_match = [] - ranges_without_stop_match = [] + # ranges_without_stop_match = [] for pat in page_attr_patterns_copy: if isinstance(pat, METS_DIV_ATTRIBUTE_RANGE_PATTERN): # range expression, expanded to pattern list @@ -865,8 +877,8 @@ def get_physical_page_patterns(self, page_attr_patterns: List[METS_DIV_ATTRIBUTE # raise ValueError(f"End of range patterns {ranges_without_stop_match} not matched - invalid range") return ret - def set_physical_page_for_file(self, pageId : str, ocrd_file : OcrdFile, - order : Optional[str] = None, orderlabel : Optional[str] = None) -> None: + def set_physical_page_for_file(self, pageId: str, ocrd_file: OcrdFile, + order: Optional[str] = None, orderlabel: Optional[str] = None) -> None: """ Set the physical page ID (``@ID`` of the physical ``mets:structMap`` ``mets:div`` entry) corresponding to the ``mets:file`` :py:attr:`ocrd_file`, creating all structures if necessary. 
@@ -887,7 +899,10 @@ def set_physical_page_for_file(self, pageId : str, ocrd_file : OcrdFile, fptrs.append(fptrdict[ocrd_file.ID]) else: fptrs = self._tree.getroot().findall( - 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]/mets:fptr[@FILEID="%s"]' % + 'mets:structMap[@TYPE="PHYSICAL"]/' + 'mets:div[@TYPE="physSequence"]/' + 'mets:div[@TYPE="page"]/' + 'mets:fptr[@FILEID="%s"]' % ocrd_file.ID, namespaces=NS) for el_fptr in fptrs: @@ -923,7 +938,7 @@ def set_physical_page_for_file(self, pageId : str, ocrd_file : OcrdFile, if self._cache_flag: # Create a new entry in the page cache self._page_cache[METS_PAGE_DIV_ATTRIBUTE.ID][pageId] = el_pagediv - # Create a new entry in the fptr cache and + # Create a new entry in the fptr cache and # assign an empty dictionary to hold the fileids self._fptr_cache.setdefault(pageId, {}) @@ -934,7 +949,7 @@ def set_physical_page_for_file(self, pageId : str, ocrd_file : OcrdFile, # Assign the ocrd fileID to the pageId in the cache self._fptr_cache[pageId].update({ocrd_file.ID: el_fptr}) - def update_physical_page_attributes(self, page_id : str, **kwargs) -> None: + def update_physical_page_attributes(self, page_id: str, **kwargs) -> None: invalid_keys = list(k for k in kwargs if k not in METS_PAGE_DIV_ATTRIBUTE.names()) if invalid_keys: raise ValueError(f"Invalid attribute {invalid_keys}. Allowed values: {METS_PAGE_DIV_ATTRIBUTE.names()}") @@ -950,7 +965,7 @@ def update_physical_page_attributes(self, page_id : str, **kwargs) -> None: else: page_div.attrib[k] = v - def get_physical_page_for_file(self, ocrd_file : OcrdFile) -> Optional[str]: + def get_physical_page_for_file(self, ocrd_file: OcrdFile) -> Optional[str]: """ Get the physical page ID (``@ID`` of the physical ``mets:structMap`` ``mets:div`` entry) corresponding to the ``mets:file`` :py:attr:`ocrd_file`. @@ -961,12 +976,15 @@ def get_physical_page_for_file(self, ocrd_file : OcrdFile) -> Optional[str]: return pageId else: ret = self._tree.getroot().find( - 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]/mets:fptr[@FILEID="%s"]' % + 'mets:structMap[@TYPE="PHYSICAL"]/' + 'mets:div[@TYPE="physSequence"]/' + 'mets:div[@TYPE="page"]/' + 'mets:fptr[@FILEID="%s"]' % ocrd_file.ID, namespaces=NS) if ret is not None: return ret.getparent().get('ID') - def remove_physical_page(self, ID : str) -> None: + def remove_physical_page(self, ID: str) -> None: """ Delete page (physical ``mets:structMap`` ``mets:div`` entry ``@ID``) :py:attr:`ID`. """ @@ -987,9 +1005,11 @@ def remove_physical_page(self, ID : str) -> None: del self._page_cache[attr][mets_div_attrib[attr.name]] del self._fptr_cache[ID] - def remove_physical_page_fptr(self, fileId : str) -> List[str]: + def remove_physical_page_fptr(self, fileId: str) -> List[str]: """ - Delete all ``mets:fptr[@FILEID = fileId]`` to ``mets:file[@ID == fileId]`` for :py:attr:`fileId` from all ``mets:div`` entries in the physical ``mets:structMap``. + Delete all ``mets:fptr[@FILEID = fileId]`` to ``mets:file[@ID == fileId]`` + for :py:attr:`fileId` from all ``mets:div`` entries in the physical ``mets:structMap``. 
+ Returns: List of pageIds that mets:fptrs were deleted from """ @@ -1006,7 +1026,10 @@ def remove_physical_page_fptr(self, fileId : str) -> List[str]: mets_fptrs.append(fptrdict[fileId]) else: mets_fptrs = self._tree.getroot().xpath( - 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]/mets:fptr[@FILEID="%s"]' % fileId, + 'mets:structMap[@TYPE="PHYSICAL"]/' + 'mets:div[@TYPE="physSequence"]/' + 'mets:div[@TYPE="page"]/' + 'mets:fptr[@FILEID="%s"]' % fileId, namespaces=NS) ret = [] for mets_fptr in mets_fptrs: @@ -1029,11 +1052,11 @@ def physical_pages_labels(self) -> Dict[str, Tuple[Optional[str], Optional[str], return {div.get('ID'): (div.get('ORDER', None), div.get('ORDERLABEL', None), div.get('LABEL', None)) for div in divs} - def merge(self, other_mets, force : bool = False, - fileGrp_mapping : Optional[Dict[str, str]] = None, - fileId_mapping : Optional[Dict[str, str]] = None, - pageId_mapping : Optional[Dict[str, str]] = None, - after_add_cb : Optional[Callable[[OcrdFile], Any]] = None, **kwargs) -> None: + def merge(self, other_mets, force: bool = False, + fileGrp_mapping: Optional[Dict[str, str]] = None, + fileId_mapping: Optional[Dict[str, str]] = None, + pageId_mapping: Optional[Dict[str, str]] = None, + after_add_cb: Optional[Callable[[OcrdFile], Any]] = None, **kwargs) -> None: """ Add all files from other_mets. Accepts the same kwargs as :py:func:`find_files` diff --git a/src/ocrd_models/ocrd_page.py b/src/ocrd_models/ocrd_page.py index 046606100..f6283acb2 100644 --- a/src/ocrd_models/ocrd_page.py +++ b/src/ocrd_models/ocrd_page.py @@ -179,6 +179,7 @@ """ ) + class OcrdPage(): """ Proxy object for :py:class:`ocrd_models.PcGtsType` (i.e. PRImA PAGE-XML @@ -188,10 +189,10 @@ class OcrdPage(): """ def __init__( self, - pcgts : PcGtsType, - etree : ET._Element, - mapping : Dict[str, ET._Element], - revmap : Dict[ET._Element, Any], + pcgts: PcGtsType, + etree: ET._Element, + mapping: Dict[str, ET._Element], + revmap: Dict[ET._Element, Any], ): self._pcgts = pcgts self.etree = etree @@ -214,8 +215,10 @@ def __init__( def __getattr__(self, name): return getattr(self._pcgts, name) + OcrdPageType = Union[OcrdPage, PcGtsType] + def to_xml(el, skip_declaration=False) -> str: """ Serialize ``pc:PcGts`` document as string. @@ -229,15 +232,16 @@ def to_xml(el, skip_declaration=False) -> str: name = 'PcGts' sio = StringIO() el.export( - outfile=sio, - level=0, - name_=name, - namespaceprefix_='pc:', - namespacedef_='xmlns:pc="%s" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="%s %s/pagecontent.xsd"' % ( - NAMESPACES['page'], - NAMESPACES['page'], - NAMESPACES['page'] - )) + outfile=sio, + level=0, + name_=name, + namespaceprefix_='pc:', + namespacedef_='xmlns:pc="%s" ' % NAMESPACES['page'] + + 'xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" ' + + 'xsi:schemaLocation="%s %s/pagecontent.xsd"' % ( + NAMESPACES['page'], + NAMESPACES['page'] + )) ret = sio.getvalue() if not skip_declaration: ret = '\n' + ret diff --git a/src/ocrd_models/ocrd_xml_base.py b/src/ocrd_models/ocrd_xml_base.py index ea4798c5b..446209486 100644 --- a/src/ocrd_models/ocrd_xml_base.py +++ b/src/ocrd_models/ocrd_xml_base.py @@ -11,6 +11,7 @@ for curie, url in NAMESPACES.items(): ET.register_namespace(curie, url) + class OcrdXmlDocument(): """ Base class for XML documents loaded from either content or filename. 
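
A note on the two string-wrapping idioms this patch leans on (pure Python semantics, shown here in isolation). Adjacent string literals are concatenated at compile time and bind tighter than `%`, so the long XPath expressions above can be split across lines without changing behaviour; and since `%` also binds tighter than `+`, `to_xml` can interpolate each fragment separately:

    # Split XPath: the % applies to the whole concatenated literal
    xpath = ('mets:structMap[@TYPE="PHYSICAL"]/'
             'mets:div[@TYPE="physSequence"]/'
             'mets:div[@TYPE="page"]/'
             'mets:fptr[@FILEID="%s"]' % 'FILE_0001')
    assert xpath.endswith('mets:fptr[@FILEID="FILE_0001"]')

    # Per-fragment interpolation: each % binds to its own literal before +
    ns = 'http://example.org/page-ns'  # placeholder, not the real PAGE namespace
    namespacedef = ('xmlns:pc="%s" ' % ns
                    + 'xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" '
                    + 'xsi:schemaLocation="%s %s/pagecontent.xsd"' % (ns, ns))
    assert namespacedef.count(ns) == 3
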
diff --git a/src/ocrd_models/report.py b/src/ocrd_models/report.py index 1b89d8d7e..0f058d9d9 100644 --- a/src/ocrd_models/report.py +++ b/src/ocrd_models/report.py @@ -7,7 +7,8 @@ # ------------------------------------------------- # -class ValidationReport(object): + +class ValidationReport(): """ Container of notices, warnings and errors about a workspace. """ diff --git a/src/ocrd_models/utils.py b/src/ocrd_models/utils.py index 4fa0653c8..8f7912687 100644 --- a/src/ocrd_models/utils.py +++ b/src/ocrd_models/utils.py @@ -13,6 +13,7 @@ 'extract_mets_from_oai_content' ] + def xmllint_format(xml): """ Pretty-print XML like ``xmllint`` does. @@ -25,6 +26,7 @@ def xmllint_format(xml): return ('%s\n%s' % ('', ET.tostring(document, pretty_print=True, encoding='UTF-8').decode('utf-8'))).encode('utf-8') + def handle_oai_response(response): """ In case of a valid OAI-Response, extract first METS-Entry-Data @@ -62,9 +64,8 @@ def extract_mets_from_oai_content(data, preamble=' JobState: +def _poll_endpoint_status(ps_server_host: str, job_id: str, job_type: str, tries: int, wait: int, + print_state: bool = False) -> JobState: if job_type not in ["workflow", "processor"]: raise ValueError(f"Unknown job type '{job_type}', expected 'workflow' or 'processor'") job_state = JobState.unset @@ -60,6 +61,7 @@ def get_ps_processing_job_status(ps_server_host: str, processing_job_id: str) -> assert job_state return getattr(JobState, job_state.lower()) + def get_ps_workflow_job_status(ps_server_host: str, workflow_job_id: str) -> JobState: request_url = f"{ps_server_host}/workflow/job-simple/{workflow_job_id}" response = request_get(url=request_url, headers={"accept": "application/json; charset=utf-8"}) diff --git a/src/ocrd_network/constants.py b/src/ocrd_network/constants.py index 1a71e53e1..089e321df 100644 --- a/src/ocrd_network/constants.py +++ b/src/ocrd_network/constants.py @@ -15,6 +15,7 @@ class StrEnum(str, Enum): def __str__(self): return self.value + class AgentType(StrEnum): PROCESSING_WORKER = "worker" PROCESSOR_SERVER = "server" diff --git a/src/ocrd_network/database.py b/src/ocrd_network/database.py index 8b0b48925..a29e5c39d 100644 --- a/src/ocrd_network/database.py +++ b/src/ocrd_network/database.py @@ -65,7 +65,7 @@ async def sync_db_create_workspace(mets_path: str) -> DBWorkspace: async def db_get_workspace(workspace_id: str = None, workspace_mets_path: str = None) -> DBWorkspace: workspace = None if not workspace_id and not workspace_mets_path: - raise ValueError(f'Either `workspace_id` or `workspace_mets_path` field must be used as a search key') + raise ValueError('Either `workspace_id` or `workspace_mets_path` field must be used as a search key') if workspace_id: workspace = await DBWorkspace.find_one( DBWorkspace.workspace_id == workspace_id @@ -89,7 +89,7 @@ async def sync_db_get_workspace(workspace_id: str = None, workspace_mets_path: s async def db_update_workspace(workspace_id: str = None, workspace_mets_path: str = None, **kwargs) -> DBWorkspace: workspace = None if not workspace_id and not workspace_mets_path: - raise ValueError(f'Either `workspace_id` or `workspace_mets_path` field must be used as a search key') + raise ValueError('Either `workspace_id` or `workspace_mets_path` field must be used as a search key') if workspace_id: workspace = await DBWorkspace.find_one(DBWorkspace.workspace_id == workspace_id) if not workspace: @@ -274,4 +274,4 @@ def verify_mongodb_available(mongo_url: str) -> None: client = MongoClient(mongo_url, serverSelectionTimeoutMS=60000.0) 
client.admin.command("ismaster") except Exception: - raise RuntimeError(f'Cannot connect to MongoDB: {re_sub(r":[^@]+@", ":****@", mongo_url)}') \ No newline at end of file + raise RuntimeError(f'Cannot connect to MongoDB: {re_sub(r":[^@]+@", ":****@", mongo_url)}') diff --git a/src/ocrd_network/processing_server.py b/src/ocrd_network/processing_server.py index b4552d78d..ae364f1b2 100644 --- a/src/ocrd_network/processing_server.py +++ b/src/ocrd_network/processing_server.py @@ -91,7 +91,7 @@ def __init__(self, config_path: str, host: str, port: int) -> None: log_file = get_processing_server_logging_file_path(pid=getpid()) configure_file_handler_with_formatter(self.log, log_file=log_file, mode="a") - self.log.info(f"Loading ocrd all tool json") + self.log.info("Loading ocrd-all-tool.json") self.ocrd_all_tool_json = load_ocrd_all_tool_json() self.hostname = host self.port = port @@ -208,7 +208,8 @@ def add_api_routes_others(self): methods=["DELETE"], tags=[ServerApiTags.WORKFLOW, ServerApiTags.PROCESSING], status_code=status.HTTP_200_OK, - summary="!! Workaround Do Not Use Unless You Have A Reason !! Kill all METS servers on this machine that have been created more than 60 minutes ago." + summary="!! Workaround Do Not Use Unless You Have A Reason " + "!! Kill all METS servers on this machine that have been created more than 60 minutes ago." ) self.include_router(others_router) @@ -825,7 +826,7 @@ async def get_workflow_info(self, workflow_job_id) -> Dict: response = self._produce_workflow_status_response(processing_jobs=jobs) return response - async def kill_mets_server_zombies(self, minutes_ago : Optional[int] = None, dry_run : Optional[bool] = None) -> List[int]: + async def kill_mets_server_zombies(self, minutes_ago: Optional[int] = None, dry_run: Optional[bool] = None) -> List[int]: pids_killed = kill_mets_server_zombies(minutes_ago=minutes_ago, dry_run=dry_run) return pids_killed diff --git a/src/ocrd_network/processing_worker.py b/src/ocrd_network/processing_worker.py index 9e26f1332..1f858d939 100644 --- a/src/ocrd_network/processing_worker.py +++ b/src/ocrd_network/processing_worker.py @@ -9,7 +9,7 @@ """ from datetime import datetime -from os import getpid, getppid +from os import getpid from pika import BasicProperties from pika.adapters.blocking_connection import BlockingChannel from pika.spec import Basic @@ -36,7 +36,7 @@ class ProcessingWorker: def __init__(self, rabbitmq_addr, mongodb_addr, processor_name, ocrd_tool: dict, processor_class=None) -> None: initLogging() - self.log = getLogger(f'ocrd_network.processing_worker') + self.log = getLogger('ocrd_network.processing_worker') log_file = get_processing_worker_logging_file_path(processor_name=processor_name, pid=getpid()) configure_file_handler_with_formatter(self.log, log_file=log_file, mode="a") @@ -120,7 +120,7 @@ def on_consumed_message( channel.basic_nack(delivery_tag=delivery_tag, multiple=False, requeue=False) raise Exception(message) - self.log.info(f"Successfully processed RabbitMQ message") + self.log.info("Successfully processed RabbitMQ message") self.log.debug(ack_message) channel.basic_ack(delivery_tag=delivery_tag, multiple=False) @@ -136,7 +136,7 @@ def start_consuming(self) -> None: self.rmq_consumer.start_consuming() self.log.info(f"Consuming stopped for queue: {self.processor_name}") else: - msg = f"The RMQConsumer is not connected/configured properly." + msg = "The RMQConsumer is not connected/configured properly." 
self.log.exception(msg) raise Exception(msg) @@ -166,7 +166,7 @@ def process_message(self, processing_message: OcrdProcessingMessage) -> None: parameters = processing_message.parameters if processing_message.parameters else {} if not path_to_mets and not workspace_id: - msg = f"Both 'path_to_mets' and 'workspace_id' are missing in the OcrdProcessingMessage." + msg = "Both 'path_to_mets' and 'workspace_id' are missing in the OcrdProcessingMessage." self.log.exception(msg) raise ValueError(msg) diff --git a/src/ocrd_network/processor_server.py b/src/ocrd_network/processor_server.py index 60674afbf..f873d2857 100644 --- a/src/ocrd_network/processor_server.py +++ b/src/ocrd_network/processor_server.py @@ -45,7 +45,7 @@ def __init__(self, mongodb_addr: str, processor_name: str = "", processor_class= super().__init__( on_startup=[self.on_startup], on_shutdown=[self.on_shutdown], - title=f"Network agent - Processor Server", + title="Network agent - Processor Server", description="Network agent - Processor Server" ) initLogging() @@ -63,7 +63,7 @@ def __init__(self, mongodb_addr: str, processor_name: str = "", processor_class= self.ocrd_tool = self.get_ocrd_tool() if not self.ocrd_tool: - raise Exception(f"The ocrd_tool is empty or missing") + raise Exception("The ocrd_tool is empty or missing") if not self.processor_name: self.processor_name = self.ocrd_tool["executable"] diff --git a/src/ocrd_network/rabbitmq_utils/helpers.py b/src/ocrd_network/rabbitmq_utils/helpers.py index 122658d76..5dc6dae77 100644 --- a/src/ocrd_network/rabbitmq_utils/helpers.py +++ b/src/ocrd_network/rabbitmq_utils/helpers.py @@ -42,7 +42,7 @@ def __connect_rabbitmq_client( def connect_rabbitmq_consumer(logger: Logger, rmq_data: Dict) -> RMQConsumer: rmq_consumer = __connect_rabbitmq_client(logger=logger, client_type="consumer", rmq_data=rmq_data) - logger.info(f"Successfully connected RMQConsumer") + logger.info("Successfully connected RMQConsumer") return rmq_consumer diff --git a/src/ocrd_network/runtime_data/deployer.py b/src/ocrd_network/runtime_data/deployer.py index 919d5b97c..4578e7eb8 100644 --- a/src/ocrd_network/runtime_data/deployer.py +++ b/src/ocrd_network/runtime_data/deployer.py @@ -9,11 +9,10 @@ from __future__ import annotations from pathlib import Path import psutil -from time import sleep from typing import Dict, List, Union from ocrd import OcrdMetsServer -from ocrd_utils import config, getLogger, safe_filename +from ocrd_utils import getLogger from ..logging_utils import get_mets_server_logging_file_path from ..utils import get_uds_path, is_mets_server_running, stop_mets_server from .config_parser import parse_hosts_data, parse_mongodb_data, parse_rabbitmq_data, validate_and_load_config @@ -56,19 +55,19 @@ def find_matching_network_agents( """ if worker_only and server_only: - msg = f"Only 'worker_only' or 'server_only' is allowed, not both." + msg = "Only 'worker_only' or 'server_only' is allowed, not both." self.log.exception(msg) raise ValueError(msg) if docker_only and native_only: - msg = f"Only 'docker_only' or 'native_only' is allowed, not both." + msg = "Only 'docker_only' or 'native_only' is allowed, not both." 
self.log.exception(msg) raise ValueError(msg) if not str_names_only and unique_only: - msg = f"Value 'unique_only' is allowed only together with 'str_names_only'" + msg = "Value 'unique_only' is allowed only together with 'str_names_only'" self.log.exception(msg) raise ValueError(msg) if sort and not str_names_only: - msg = f"Value 'sort' is allowed only together with 'str_names_only'" + msg = "Value 'sort' is allowed only together with 'str_names_only'" self.log.exception(msg) raise ValueError(msg) @@ -154,7 +153,9 @@ def start_uds_mets_server(self, ws_dir_path: str) -> Path: "Removing to avoid any weird behavior before starting the server.") Path(mets_server_url).unlink() self.log.info(f"Starting UDS mets server: {mets_server_url}") - pid = OcrdMetsServer.create_process(mets_server_url=str(mets_server_url), ws_dir_path=str(ws_dir_path), log_file=str(log_file)) + pid = OcrdMetsServer.create_process(mets_server_url=str(mets_server_url), + ws_dir_path=str(ws_dir_path), + log_file=str(log_file)) self.mets_servers[str(mets_server_url)] = pid self.mets_servers_paths[str(ws_dir_path)] = str(mets_server_url) return mets_server_url diff --git a/src/ocrd_network/runtime_data/hosts.py b/src/ocrd_network/runtime_data/hosts.py index f46a871f6..176afb991 100644 --- a/src/ocrd_network/runtime_data/hosts.py +++ b/src/ocrd_network/runtime_data/hosts.py @@ -109,10 +109,10 @@ def __deploy_network_agent( connection_client = None if deploy_type == DeployType.NATIVE: - assert self.ssh_client, f"SSH client connection missing." + assert self.ssh_client, "SSH client connection missing." connection_client = self.ssh_client if deploy_type == DeployType.DOCKER: - assert self.docker_client, f"Docker client connection missing." + assert self.docker_client, "Docker client connection missing." 
connection_client = self.docker_client if agent_type == AgentType.PROCESSING_WORKER: @@ -126,7 +126,7 @@ def __deploy_network_agents_workers(self, logger: Logger, mongodb_url: str, rabb logger.info(f"Deploying processing workers on host: {self.host}") amount_workers = len(self.network_agents_worker_native) + len(self.network_agents_worker_docker) if not amount_workers: - logger.info(f"No processing workers found to be deployed") + logger.info("No processing workers found to be deployed") for data_worker in self.network_agents_worker_native: self.__deploy_network_agent(logger, data_worker, mongodb_url, rabbitmq_url) for data_worker in self.network_agents_worker_docker: @@ -136,7 +136,7 @@ def __deploy_network_agents_servers(self, logger: Logger, mongodb_url: str, rabb logger.info(f"Deploying processor servers on host: {self.host}") amount_servers = len(self.network_agents_server_native) + len(self.network_agents_server_docker) if not amount_servers: - logger.info(f"No processor servers found to be deployed") + logger.info("No processor servers found to be deployed") for data_server in self.network_agents_server_native: self.__deploy_network_agent(logger, data_server, mongodb_url, rabbitmq_url) self.__add_deployed_agent_server_port_to_cache(data_server.processor_name, data_server.port) @@ -168,17 +168,17 @@ def __stop_network_agent(self, logger: Logger, name: str, deploy_type: DeployTyp agent_info += f", pid: {pid}" logger.info(f"Stopping {agent_info}") if deploy_type == DeployType.NATIVE: - assert self.ssh_client, f"SSH client connection missing" + assert self.ssh_client, "SSH client connection missing" self.ssh_client.exec_command(f"kill {pid}") if deploy_type == DeployType.DOCKER: - assert self.docker_client, f"Docker client connection missing" + assert self.docker_client, "Docker client connection missing" self.docker_client.containers.get(pid).stop() def __stop_network_agents_workers(self, logger: Logger): logger.info(f"Stopping processing workers on host: {self.host}") amount_workers = len(self.network_agents_worker_native) + len(self.network_agents_worker_docker) if not amount_workers: - logger.warning(f"No active processing workers to be stopped.") + logger.warning("No active processing workers to be stopped.") for worker in self.network_agents_worker_native: self.__stop_network_agent(logger, worker.processor_name, worker.deploy_type, worker.agent_type, worker.pid) self.network_agents_worker_native = [] @@ -190,7 +190,7 @@ def __stop_network_agents_servers(self, logger: Logger): logger.info(f"Stopping processor servers on host: {self.host}") amount_servers = len(self.network_agents_server_native) + len(self.network_agents_server_docker) if not amount_servers: - logger.warning(f"No active processor servers to be stopped.") + logger.warning("No active processor servers to be stopped.") for server in self.network_agents_server_native: self.__stop_network_agent(logger, server.processor_name, server.deploy_type, server.agent_type, server.pid) self.network_agents_server_native = [] diff --git a/src/ocrd_network/runtime_data/network_agents.py b/src/ocrd_network/runtime_data/network_agents.py index b52476826..742f30309 100644 --- a/src/ocrd_network/runtime_data/network_agents.py +++ b/src/ocrd_network/runtime_data/network_agents.py @@ -80,7 +80,7 @@ def deploy_network_agent(self, logger: Logger, connector_client, database_url: s return self.pid if self.deploy_type == DeployType.DOCKER: # TODO: add real command to start processing worker in docker here - start_cmd = f"" + start_cmd = "" 
self.pid = self._start_docker_instance(logger, connector_client, start_cmd) return self.pid raise RuntimeError(f"Unknown deploy type of {self.__dict__}") @@ -104,7 +104,7 @@ def deploy_network_agent(self, logger: Logger, connector_client, database_url: s return self.pid if self.deploy_type == DeployType.DOCKER: # TODO: add real command to start processor server in docker here - start_cmd = f"" + start_cmd = "" self.pid = self._start_docker_instance(logger, connector_client, start_cmd) return self.pid raise RuntimeError(f"Unknown deploy type of {self.__dict__}") diff --git a/src/ocrd_network/runtime_data/network_services.py b/src/ocrd_network/runtime_data/network_services.py index 3b4c52a0b..e8e930725 100644 --- a/src/ocrd_network/runtime_data/network_services.py +++ b/src/ocrd_network/runtime_data/network_services.py @@ -129,7 +129,7 @@ def deploy_rabbitmq( rmq_host, rmq_port, rmq_vhost = self.host, int(self.port), self.vhost rmq_user, rmq_password = self.cred_username, self.cred_password if self.skip_deployment: - logger.debug(f"RabbitMQ is managed externally. Skipping deployment.") + logger.debug("RabbitMQ is managed externally. Skipping deployment.") verify_rabbitmq_available(logger=logger, rabbitmq_address=self.service_url) return self.service_url if not env: diff --git a/src/ocrd_network/server_cache.py b/src/ocrd_network/server_cache.py index 179a76139..4367db771 100644 --- a/src/ocrd_network/server_cache.py +++ b/src/ocrd_network/server_cache.py @@ -33,7 +33,7 @@ def check_if_locked_pages_for_output_file_grps( if not self.locked_pages.get(workspace_key, None): self.log.info(f"No entry found in the locked pages cache for workspace key: {workspace_key}") return False - debug_message = f"Caching the received request due to locked output file grp pages." + debug_message = "Caching the received request due to locked output file grp pages." for file_group in output_file_grps: if file_group in self.locked_pages[workspace_key]: if self.placeholder_all_pages in self.locked_pages[workspace_key][file_group]: diff --git a/src/ocrd_network/server_utils.py b/src/ocrd_network/server_utils.py index 6e485f261..1cc8c0dd3 100644 --- a/src/ocrd_network/server_utils.py +++ b/src/ocrd_network/server_utils.py @@ -55,7 +55,7 @@ def create_processing_message(logger: Logger, job: DBProcessorJob) -> OcrdProces ) return processing_message except ValueError as error: - message = f"Failed to create OcrdProcessingMessage from DBProcessorJob" + message = "Failed to create OcrdProcessingMessage from DBProcessorJob" raise_http_exception(logger, status.HTTP_422_UNPROCESSABLE_ENTITY, message, error) @@ -139,6 +139,7 @@ def request_processor_server_tool_json(logger: Logger, processor_server_base_url raise_http_exception(logger, status.HTTP_404_NOT_FOUND, message) return response.json() + async def forward_job_to_processor_server( logger: Logger, job_input: PYJobInput, processor_server_base_url: str ) -> PYJobOutput: @@ -193,14 +194,14 @@ def parse_workflow_tasks(logger: Logger, workflow_content: str) -> List[Processo tasks_list = workflow_content.splitlines() return [ProcessorTask.parse(task_str) for task_str in tasks_list if task_str.strip()] except ValueError as error: - message = f"Failed parsing processing tasks from a workflow." + message = "Failed parsing processing tasks from a workflow." 
raise_http_exception(logger, status.HTTP_422_UNPROCESSABLE_ENTITY, message, error) def raise_http_exception(logger: Logger, status_code: int, message: str, error: Exception = None) -> None: if error: message = f"{message} {error}" - logger.exception(f"{message}") + logger.exception(message) raise HTTPException(status_code=status_code, detail=message) @@ -214,7 +215,7 @@ def validate_job_input(logger: Logger, processor_name: str, ocrd_tool: dict, job ) raise_http_exception(logger, status.HTTP_422_UNPROCESSABLE_ENTITY, message) if not ocrd_tool: - message = f"Failed parsing processing tasks from a workflow." + message = "Failed parsing processing tasks from a workflow." raise_http_exception(logger, status.HTTP_404_NOT_FOUND, message) try: report = ParameterValidator(ocrd_tool).validate(dict(job_input.parameters)) @@ -249,10 +250,10 @@ def validate_first_task_input_file_groups_existence(logger: Logger, mets_path: s raise_http_exception(logger, status.HTTP_422_UNPROCESSABLE_ENTITY, message) -def kill_mets_server_zombies(minutes_ago : Optional[int], dry_run : Optional[bool]) -> List[int]: - if minutes_ago == None: +def kill_mets_server_zombies(minutes_ago: Optional[int], dry_run: Optional[bool]) -> List[int]: + if minutes_ago is None: minutes_ago = 90 - if dry_run == None: + if dry_run is None: dry_run = False now = time() @@ -271,7 +272,8 @@ def kill_mets_server_zombies(minutes_ago : Optional[int], dry_run : Optional[boo if re.match(cmdline_pat, cmdline): pid = int(procdir.name) ret.append(pid) - print(f'METS Server with PID {pid} was created {ctime_ago} minutes ago, more than {minutes_ago}, so killing (cmdline="{cmdline})', file=sys.stderr) + print(f'METS Server with PID {pid} was created {ctime_ago} minutes ago, more than {minutes_ago}, ' + f'so killing (cmdline="{cmdline})', file=sys.stderr) if dry_run: print(f'[dry_run is active] kill {pid}') else: diff --git a/src/ocrd_network/utils.py b/src/ocrd_network/utils.py index 5abe2104f..89df06133 100644 --- a/src/ocrd_network/utils.py +++ b/src/ocrd_network/utils.py @@ -172,5 +172,6 @@ def stop_mets_server(logger: Logger, mets_server_url: str, ws_dir_path: str) -> else: ValueError(f"Unexpected protocol type: {protocol}") + def get_uds_path(ws_dir_path: str) -> Path: return Path(config.OCRD_NETWORK_SOCKETS_ROOT_DIR, f"{safe_filename(ws_dir_path)}.sock") diff --git a/src/ocrd_page_user_methods.py b/src/ocrd_page_user_methods.py index 9cec0b30a..ec5d26ecc 100644 --- a/src/ocrd_page_user_methods.py +++ b/src/ocrd_page_user_methods.py @@ -6,13 +6,14 @@ import codecs from os.path import dirname, join + # # You must include the following class definition at the top of # your method specification file. # class MethodSpec(): def __init__(self, name='', source='', class_names='', - class_names_compiled=None): + class_names_compiled=None): """MethodSpec -- A specification of a method. Member variables: name -- The method name @@ -33,23 +34,32 @@ class names in which the method is to be inserted. 
self.class_names_compiled = re.compile(self.class_names) else: self.class_names_compiled = class_names_compiled + def get_name(self): return self.name + def set_name(self, name): self.name = name + def get_source(self): return self.source + def set_source(self, source): self.source = source + def get_class_names(self): return self.class_names + def set_class_names(self, class_names): self.class_names = class_names self.class_names_compiled = re.compile(class_names) + def get_class_names_compiled(self): return self.class_names_compiled + def set_class_names_compiled(self, class_names_compiled): self.class_names_compiled = class_names_compiled + def match_name(self, class_name): """Match against the name of the class currently being generated. If this method returns True, the method will be inserted in @@ -58,6 +68,7 @@ def match_name(self, class_name): if self.class_names_compiled.search(class_name): return True return False + def get_interpolated_source(self, values_dict): """Get the method source code, interpolating values from values_dict into it. The source returned by this method is inserted into @@ -65,6 +76,7 @@ def get_interpolated_source(self, values_dict): """ source = self.source % values_dict return source + def show(self): print('specification:') print(' name: %s' % (self.name, )) @@ -94,6 +106,7 @@ def _add_method(class_re, method_name, file_name=None): source.append(' %s' % line.replace('%', '%%') if line else line) return MethodSpec(name=method_name, class_names=class_re, source=''.join(source)) + # # Provide a list of your method specifications. # This list of specifications must be named METHOD_SPECS. @@ -118,7 +131,9 @@ def _add_method(class_re, method_name, file_name=None): _add_method(r'^(PageType)$', 'get_AllTextLines'), _add_method(r'^(PageType)$', 'get_ReadingOrderGroups'), # for some reason, pagecontent.xsd does not declare @orientation at the abstract/base RegionType: - _add_method(r'^(PageType|AdvertRegionType|MusicRegionType|MapRegionType|ChemRegionType|MathsRegionType|SeparatorRegionType|ChartRegionType|TableRegionType|GraphicRegionType|LineDrawingRegionType|ImageRegionType|TextRegionType)$', 'set_orientation'), + _add_method(r'^(PageType|AdvertRegionType|MusicRegionType|MapRegionType|ChemRegionType|MathsRegionType' + r'|SeparatorRegionType|ChartRegionType|TableRegionType|GraphicRegionType|LineDrawingRegionType' + r'|ImageRegionType|TextRegionType)$', 'set_orientation'), ) @@ -126,6 +141,7 @@ def test(): for spec in METHOD_SPECS: spec.show() + def main(): test() diff --git a/src/ocrd_utils/__init__.py b/src/ocrd_utils/__init__.py index c853a34bd..852400f69 100644 --- a/src/ocrd_utils/__init__.py +++ b/src/ocrd_utils/__init__.py @@ -8,11 +8,11 @@ levels below page (i.e. region, line, word, glyph) between relative coordinates w.r.t. a corresponding image and absolute coordinates w.r.t. the top-level image. This includes rotation and offset correction, based on affine transformations. - (Used by :py:class:`ocrd.workspace.Workspace` methods - :py:meth:`ocrd.workspace.Workspace.image_from_page` and + (Used by :py:class:`ocrd.workspace.Workspace` methods + :py:meth:`ocrd.workspace.Workspace.image_from_page` and :py:meth:`ocrd.workspace.Workspace.image_from_segment`.) 
-* :py:func:`rotate_coordinates`, +* :py:func:`rotate_coordinates`, :py:func:`scale_coordinates`, :py:func:`shift_coordinates`, :py:func:`transpose_coordinates`, @@ -23,7 +23,7 @@ used to pass down the coordinate system along with images (both invariably sharing the same operations context) when traversing the element hierarchy top to bottom. (Used by :py:class:`ocrd.workspace.Workspace` methods - :py:meth:`ocrd.workspace.Workspace.image_from_page` and + :py:meth:`ocrd.workspace.Workspace.image_from_page` and :py:meth:`ocrd.workspace.Workspace.image_from_segment`.) * :py:func:`rotate_image`, diff --git a/src/ocrd_utils/config.py b/src/ocrd_utils/config.py index d8096027a..402644af4 100644 --- a/src/ocrd_utils/config.py +++ b/src/ocrd_utils/config.py @@ -16,9 +16,11 @@ def _validator_boolean(val): return isinstance(val, bool) or str.lower(val) in ('true', 'false', '0', '1') + def _parser_boolean(val): return bool(val) if isinstance(val, (int, bool)) else str.lower(val) in ('true', '1') + class OcrdEnvVariable(): def __init__(self, name, description, parser=str, validator=lambda _: True, default=[False, None]): @@ -61,7 +63,7 @@ def describe(self, wrap_text=True, indent_text=True): desc += ' ' desc += f'(Default: "{default}")' ret = '' - ret = f'{self.name}\n' + ret = f'{self.name}\n' if wrap_text: desc = fill(desc, width=50) if indent_text: @@ -69,6 +71,7 @@ def describe(self, wrap_text=True, indent_text=True): desc = indent(desc, ' ') return ret + desc + class OcrdEnvConfig(): def __init__(self): @@ -83,7 +86,7 @@ def add(self, name, *args, **kwargs): return self._variables[name] def has_default(self, name): - if not name in self._variables: + if name not in self._variables: raise ValueError(f"Unregistered env variable {name}") return self._variables[name].has_default @@ -99,13 +102,13 @@ def reset_defaults(self): pass def describe(self, name, *args, **kwargs): - if not name in self._variables: + if name not in self._variables: raise ValueError(f"Unregistered env variable {name}") return self._variables[name].describe(*args, **kwargs) def __getattr__(self, name): # will be called if name is not accessible (has not been added directly yet) - if not name in self._variables: + if name not in self._variables: raise AttributeError(f"Unregistered env variable {name}") var_obj = self._variables[name] try: @@ -120,39 +123,45 @@ def __getattr__(self, name): return var_obj.parser(raw_value) def is_set(self, name): - if not name in self._variables: + if name not in self._variables: raise ValueError(f"Unregistered env variable {name}") return name in environ def raw_value(self, name): - if not name in self._variables: + if name not in self._variables: raise ValueError(f"Unregistered env variable {name}") return environ[name] + config = OcrdEnvConfig() config.add('OCRD_METS_CACHING', - description='If set to `true`, access to the METS file is cached, speeding in-memory search and modification.', - validator=_validator_boolean, - parser=_parser_boolean) + description='If set to `true`, access to the METS file is cached, speeding in-memory search and modification.', + validator=_validator_boolean, + parser=_parser_boolean) config.add('OCRD_MAX_PROCESSOR_CACHE', - description="Maximum number of processor instances (for each set of parameters) to be kept in memory (including loaded models) for processing workers or processor servers.", - parser=int, - default=(True, 128)) + description="Maximum number of processor instances (for each set of parameters) to be kept in memory " + "(including loaded models) 
for processing workers or processor servers.",
+           parser=int,
+           default=(True, 128))

config.add('OCRD_MAX_PARALLEL_PAGES',
-    description="Maximum number of processor workers for page-parallel processing (within each Processor's selected page range, independent of the number of Processing Workers or Processor Servers). If set >1, then a METS Server must be used for METS synchronisation.",
-    parser=int,
-    default=(True, 1))
+           description="Maximum number of processor workers for page-parallel processing "
+                       "(within each Processor's selected page range, independent of the number "
+                       "of Processing Workers or Processor Servers). If set >1, then a METS Server "
+                       "must be used for METS synchronisation.",
+           parser=int,
+           default=(True, 1))

config.add('OCRD_PROCESSING_PAGE_TIMEOUT',
-    description="Timeout in seconds for processing a single page. If set >0, when exceeded, the same as OCRD_MISSING_OUTPUT applies.",
-    parser=int,
-    default=(True, 0))
+           description="Timeout in seconds for processing a single page. If set >0, when exceeded, "
+                       "the same as OCRD_MISSING_OUTPUT applies.",
+           parser=int,
+           default=(True, 0))

config.add("OCRD_PROFILE",
-    description="""\
+           description="""\
Whether to enable gathering runtime statistics
on the `ocrd.profile` logger (comma-separated):
\b
@@ -161,16 +170,18 @@ def raw_value(self, name):
- `PSS`: also yields peak memory (proportional set size)
\b
""",
-    validator=lambda val : all(t in ('', 'CPU', 'RSS', 'PSS') for t in val.split(',')),
-    default=(True, ''))
+           validator=lambda val: all(t in ('', 'CPU', 'RSS', 'PSS') for t in val.split(',')),
+           default=(True, ''))

config.add("OCRD_PROFILE_FILE",
-    description="If set, then the CPU profile is written to this file for later peruse with a analysis tools like snakeviz")
+           description="If set, then the CPU profile is written to this file for later perusal "
+                       "with analysis tools like snakeviz")

config.add("OCRD_DOWNLOAD_RETRIES",
-    description="Number of times to retry failed attempts for downloads of resources or workspace files.",
-    validator=int,
-    parser=int)
+           description="Number of times to retry failed attempts for downloads of resources or workspace files.",
+           validator=int,
+           parser=int)
+

def _ocrd_download_timeout_parser(val):
    timeout = val.split(',')
@@ -180,18 +191,19 @@ def _ocrd_download_timeout_parser(val):
        timeout = float(timeout[0])
    return timeout

+
config.add("OCRD_DOWNLOAD_TIMEOUT",
-    description="Timeout in seconds for connecting or reading (comma-separated) when downloading.",
-    parser=_ocrd_download_timeout_parser)
+           description="Timeout in seconds for connecting or reading (comma-separated) when downloading.",
+           parser=_ocrd_download_timeout_parser)

config.add("OCRD_DOWNLOAD_INPUT",
-    description="Whether to download files not present locally during processing",
-    default=(True, True),
-    validator=_validator_boolean,
-    parser=_parser_boolean)
+           description="Whether to download files not present locally during processing",
+           default=(True, True),
+           validator=_validator_boolean,
+           parser=_parser_boolean)

config.add("OCRD_MISSING_INPUT",
-    description="""\
+           description="""\
How to deal with missing input files
(for some fileGrp/pageId) during processing:
\b
@@ -199,12 +211,12 @@ def _ocrd_download_timeout_parser(val):
- `ABORT`: throw :py:class:`.MissingInputFile`
\b
""",
-    default=(True, 'SKIP'),
-    validator=lambda val: val in ['SKIP', 'ABORT'],
-    parser=str)
+           default=(True, 'SKIP'),
+           validator=lambda val: val in ['SKIP', 'ABORT'],
+           parser=str)

config.add("OCRD_MISSING_OUTPUT",
-
description="""\ + description="""\ How to deal with missing output files (for some fileGrp/pageId) during processing: \b @@ -213,17 +225,18 @@ def _ocrd_download_timeout_parser(val): - `ABORT`: re-throw whatever caused processing to fail \b """, - default=(True, 'SKIP'), - validator=lambda val: val in ['SKIP', 'COPY', 'ABORT'], - parser=str) + default=(True, 'SKIP'), + validator=lambda val: val in ['SKIP', 'COPY', 'ABORT'], + parser=str) config.add("OCRD_MAX_MISSING_OUTPUTS", - description="Maximal rate of skipped/fallback pages among all processed pages before aborting (decimal fraction, ignored if negative).", - default=(True, 0.1), - parser=float) + description="Maximal rate of skipped/fallback pages among all processed pages before aborting " + "(decimal fraction, ignored if negative).", + default=(True, 0.1), + parser=float) config.add("OCRD_EXISTING_OUTPUT", - description="""\ + description="""\ How to deal with already existing output files (for some fileGrp/pageId) during processing: \b @@ -232,13 +245,13 @@ def _ocrd_download_timeout_parser(val): - `ABORT`: re-throw :py:class:`FileExistsError` \b """, - default=(True, 'SKIP'), - validator=lambda val: val in ['SKIP', 'OVERWRITE', 'ABORT'], - parser=str) + default=(True, 'SKIP'), + validator=lambda val: val in ['SKIP', 'OVERWRITE', 'ABORT'], + parser=str) config.add("OCRD_NETWORK_SERVER_ADDR_PROCESSING", - description="Default address of Processing Server to connect to (for `ocrd network client processing`).", - default=(True, '')) + description="Default address of Processing Server to connect to (for `ocrd network client processing`).", + default=(True, '')) config.add("OCRD_NETWORK_CLIENT_POLLING_SLEEP", description="How many seconds to sleep before trying again.", @@ -251,27 +264,25 @@ def _ocrd_download_timeout_parser(val): default=(True, 3600)) config.add("OCRD_NETWORK_SERVER_ADDR_WORKFLOW", - description="Default address of Workflow Server to connect to (for `ocrd network client workflow`).", - default=(True, '')) + description="Default address of Workflow Server to connect to (for `ocrd network client workflow`).", + default=(True, '')) config.add("OCRD_NETWORK_SERVER_ADDR_WORKSPACE", - description="Default address of Workspace Server to connect to (for `ocrd network client workspace`).", - default=(True, '')) + description="Default address of Workspace Server to connect to (for `ocrd network client workspace`).", + default=(True, '')) config.add("OCRD_NETWORK_RABBITMQ_CLIENT_CONNECT_ATTEMPTS", description="Number of attempts for a RabbitMQ client to connect before failing.", parser=int, default=(True, 3)) -config.add( - name="OCRD_NETWORK_RABBITMQ_HEARTBEAT", - description=""" - Controls AMQP heartbeat timeout (in seconds) negotiation during connection tuning. An integer value always overrides the value - proposed by broker. Use 0 to deactivate heartbeat. - """, - parser=int, - default=(True, 0) -) +config.add(name="OCRD_NETWORK_RABBITMQ_HEARTBEAT", + description=""" +Controls AMQP heartbeat timeout (in seconds) negotiation during connection tuning. An integer value always overrides the value +proposed by broker. Use 0 to deactivate heartbeat. 
+ """, + parser=int, + default=(True, 0)) config.add(name="OCRD_NETWORK_SOCKETS_ROOT_DIR", description="The root directory where all mets server related socket files are created", @@ -296,24 +307,24 @@ def _ocrd_download_timeout_parser(val): pass config.add("HOME", - description="Directory to look for `ocrd_logging.conf`, fallback for unset XDG variables.", - # description="HOME directory, cf. https://specifications.freedesktop.org/basedir-spec/basedir-spec-latest.html", - validator=lambda val: Path(val).is_dir(), - parser=lambda val: Path(val), - default=(True, lambda: Path.home())) + description="Directory to look for `ocrd_logging.conf`, fallback for unset XDG variables.", + # description="HOME directory, cf. https://specifications.freedesktop.org/basedir-spec/basedir-spec-latest.html", + validator=lambda val: Path(val).is_dir(), + parser=lambda val: Path(val), + default=(True, lambda: Path.home())) config.add("XDG_DATA_HOME", - description="Directory to look for `./ocrd-resources/*` (i.e. `ocrd resmgr` data location)", - parser=lambda val: Path(val), - default=(True, lambda: Path(config.HOME, '.local/share'))) + description="Directory to look for `./ocrd-resources/*` (i.e. `ocrd resmgr` data location)", + parser=lambda val: Path(val), + default=(True, lambda: Path(config.HOME, '.local/share'))) config.add("XDG_CONFIG_HOME", - description="Directory to look for `./ocrd/resources.yml` (i.e. `ocrd resmgr` user database)", - parser=lambda val: Path(val), - default=(True, lambda: Path(config.HOME, '.config'))) + description="Directory to look for `./ocrd/resources.yml` (i.e. `ocrd resmgr` user database)", + parser=lambda val: Path(val), + default=(True, lambda: Path(config.HOME, '.config'))) config.add("OCRD_LOGGING_DEBUG", - description="Print information about the logging setup to STDERR", - default=(True, False), - validator=_validator_boolean, - parser=_parser_boolean) + description="Print information about the logging setup to STDERR", + default=(True, False), + validator=_validator_boolean, + parser=_parser_boolean) diff --git a/src/ocrd_utils/deprecate.py b/src/ocrd_utils/deprecate.py index d17efeb58..bcb09d637 100644 --- a/src/ocrd_utils/deprecate.py +++ b/src/ocrd_utils/deprecate.py @@ -1,9 +1,11 @@ import functools import warnings + def deprecation_warning(msg, stacklevel=2): warnings.warn(msg, DeprecationWarning, stacklevel) + def deprecated_alias(**aliases): """ Deprecate a kwarg in favor of another kwarg @@ -16,6 +18,7 @@ def wrapper(*args, **kwargs): return wrapper return deco + def rename_kwargs(func_name, kwargs, aliases): """ https://stackoverflow.com/questions/49802412/how-to-implement-deprecation-in-python-with-argument-alias diff --git a/src/ocrd_utils/image.py b/src/ocrd_utils/image.py index 6f2524608..56da7d9a4 100644 --- a/src/ocrd_utils/image.py +++ b/src/ocrd_utils/image.py @@ -39,14 +39,15 @@ 'xywh_from_polygon', ] + def adjust_canvas_to_rotation(size, angle): """Calculate the enlarged image size after rotation. - + Given a numpy array ``size`` of an original canvas (width and height), and a rotation angle in degrees counter-clockwise ``angle``, calculate the new size which is necessary to encompass the full image after rotation. - + Return a numpy array of the enlarged width and height. """ angle = np.deg2rad(angle) @@ -56,13 +57,14 @@ def adjust_canvas_to_rotation(size, angle): [sin, cos]]), np.array(size)) + def adjust_canvas_to_transposition(size, method): """Calculate the flipped image size after transposition. 
- + Given a numpy array ``size`` of an original canvas (width and height), and a transposition mode ``method`` (see ``transpose_image``), calculate the new size after transposition. - + Return a numpy array of the enlarged width and height. """ if method in [Image.Transpose.ROTATE_90, @@ -72,11 +74,13 @@ def adjust_canvas_to_transposition(size, method): size = size[::-1] return size + def bbox_from_points(points): """Construct a numeric list representing a bounding box from polygon coordinates in page representation.""" xys = [[int(p) for p in pair.split(',')] for pair in points.split(' ')] return bbox_from_polygon(xys) + def bbox_from_polygon(polygon): """Construct a numeric list representing a bounding box from polygon coordinates in numeric list representation.""" minx = sys.maxsize @@ -94,6 +98,7 @@ def bbox_from_polygon(polygon): maxy = xy[1] return minx, miny, maxx, maxy + def bbox_from_xywh(xywh): """Convert a bounding box from a numeric dict to a numeric list representation.""" return ( @@ -103,23 +108,24 @@ def bbox_from_xywh(xywh): xywh['y'] + xywh['h'] ) + def coordinates_of_segment(segment, parent_image, parent_coords): """Extract the coordinates of a PAGE segment element relative to its parent. + \b Given... - - ``segment``, a PAGE segment object in absolute coordinates (i.e. RegionType / TextLineType / WordType / GlyphType), and - ``parent_image``, the PIL.Image of its corresponding parent object (i.e. PageType / RegionType / TextLineType / WordType), (not used), along with - ``parent_coords``, its corresponding affine transformation, - ...calculate the relative coordinates of the segment within the image. That is, apply the given transform to the points annotated in ``segment``. The transform encodes (recursively): + \b 1. Whenever ``parent_image`` or any of its parents was cropped, all points must be shifted by the offset (i.e. coordinate system gets translated by the upper left). @@ -138,6 +144,7 @@ def coordinates_of_segment(segment, parent_image, parent_coords): polygon = transform_coordinates(polygon, parent_coords['transform']) return np.round(polygon).astype(np.int32) + def polygon_from_points(points): """ Convert polygon coordinates in page representation to polygon coordinates in numeric list representation. @@ -152,17 +159,17 @@ def polygon_from_points(points): def coordinates_for_segment(polygon, parent_image, parent_coords): """Convert relative coordinates to absolute. + \b Given... - - ``polygon``, a numpy array of points relative to - ``parent_image``, a PIL.Image (not used), along with - ``parent_coords``, its corresponding affine transformation, - ...calculate the absolute coordinates within the page. - + That is, apply the given transform inversely to ``polygon`` The transform encodes (recursively): + \b 1. Whenever ``parent_image`` or any of its parents was cropped, all points must be shifted by the offset in opposite direction (i.e. coordinate system gets translated by the upper left). @@ -176,12 +183,13 @@ def coordinates_for_segment(polygon, parent_image, parent_coords): Return the rounded numpy array of the resulting polygon. 
""" - polygon = np.array(polygon, dtype=np.float32) # avoid implicit type cast problems + polygon = np.array(polygon, dtype=np.float32) # avoid implicit type cast problems # apply inverse of affine transform: inv_transform = np.linalg.inv(parent_coords['transform']) polygon = transform_coordinates(polygon, inv_transform) return np.round(polygon).astype(np.int32) + def polygon_mask(image, coordinates): """"Create a mask image of a polygon. @@ -197,6 +205,7 @@ def polygon_mask(image, coordinates): ImageDraw.Draw(mask).polygon(coordinates, outline=0, fill=255) return mask + def rotate_coordinates(transform, angle, orig=np.array([0, 0])): """Compose an affine coordinate transformation with a passive rotation. @@ -209,7 +218,7 @@ def rotate_coordinates(transform, angle, orig=np.array([0, 0])): by pure rotation, and subsequent translation back. However, since rotation necessarily increases the bounding box, and thus image size, do not translate back the same amount, but to the enlarged offset.) - + Return a numpy array of the resulting affine transformation matrix. """ LOG = getLogger('ocrd.utils.coords.rotate_coordinates') @@ -234,6 +243,7 @@ def rotate_coordinates(transform, angle, orig=np.array([0, 0])): adjust_canvas_to_rotation(orig, angle)) return transform + def rotate_image(image, angle, fill='background', transparency=False): """"Rotate an image, enlarging and filling with background. @@ -242,6 +252,7 @@ def rotate_image(image, angle, fill='background', transparency=False): size at the margins accordingly, and filling everything outside the original image according to ``fill``: + \b - if ``background`` (the default), then use the median color of the image; - otherwise use the given color, e.g. ``'white'`` or (255,255,255). @@ -267,7 +278,7 @@ def rotate_image(image, angle, fill='background', transparency=False): if len(background.bands) > 1: background = background.median if image.mode in ['RGBA', 'LA']: - background[-1] = 0 # fully transparent + background[-1] = 0 # fully transparent background = tuple(background) else: background = background.median[0] @@ -295,7 +306,7 @@ def shift_coordinates(transform, offset): ``offset`` of the translation vector, calculate the affine coordinate transform corresponding to the composition of both transformations. - + Return a numpy array of the resulting affine transformation matrix. """ LOG = getLogger('ocrd.utils.coords.shift_coordinates') @@ -305,6 +316,7 @@ def shift_coordinates(transform, offset): shift[1, 2] = offset[1] return np.dot(shift, transform) + def scale_coordinates(transform, factors): """Compose an affine coordinate transformation with a proportional scaling. Given a numpy array ``transform`` of an existing transformation @@ -312,7 +324,7 @@ def scale_coordinates(transform, factors): ``factors`` of the scaling factors, calculate the affine coordinate transform corresponding to the composition of both transformations. - + Return a numpy array of the resulting affine transformation matrix. """ LOG = getLogger('ocrd.utils.coords.scale_coordinates') @@ -322,6 +334,7 @@ def scale_coordinates(transform, factors): scale[1, 1] = factors[1] return np.dot(scale, transform) + def transform_coordinates(polygon, transform=None): """Apply an affine transformation to a set of points. 
Augment the 2d numpy array of points ``polygon`` with a an extra @@ -331,23 +344,24 @@ def transform_coordinates(polygon, transform=None): """ if transform is None: transform = np.eye(3) - polygon = np.insert(polygon, 2, 1, axis=1) # make 3d homogeneous coordinates + polygon = np.insert(polygon, 2, 1, axis=1) # make 3d homogeneous coordinates polygon = np.dot(transform, polygon.T).T # ones = polygon[:,2] # assert np.all(np.array_equal(ones, np.clip(ones, 1 - 1e-2, 1 + 1e-2))), \ # 'affine transform failed' # should never happen - polygon = np.delete(polygon, 2, axis=1) # remove z coordinate again + polygon = np.delete(polygon, 2, axis=1) # remove z coordinate again return polygon + def transpose_coordinates(transform, method, orig=np.array([0, 0])): """"Compose an affine coordinate transformation with a transposition (i.e. flip or rotate in 90° multiples). + \b Given a numpy array ``transform`` of an existing transformation matrix in homogeneous (3d) coordinates, a transposition mode ``method``, as well as a numpy array ``orig`` of the center of the image, calculate the affine coordinate transform corresponding to the composition of both transformations, which is respectively: - - ``PIL.Image.Transpose.FLIP_LEFT_RIGHT``: entails translation to the center, followed by pure reflection about the y-axis, and subsequent translation back @@ -395,7 +409,7 @@ def transpose_coordinates(transform, method, orig=np.array([0, 0])): Image.Transpose.ROTATE_270: [rot90, reflx, refly], Image.Transpose.TRANSPOSE: [rot90, reflx], Image.Transpose.TRANSVERSE: [rot90, refly] - }.get(method) # no default + }.get(method) # no default for operation in operations: transform = np.dot(operation, transform) transform = shift_coordinates( @@ -405,12 +419,13 @@ def transpose_coordinates(transform, method, orig=np.array([0, 0])): adjust_canvas_to_transposition(orig, method)) return transform + def transpose_image(image, method): """"Transpose (i.e. flip or rotate in 90° multiples) an image. + \b Given a PIL.Image ``image`` and a transposition mode ``method``, apply the respective operation: - - ``PIL.Image.Transpose.FLIP_LEFT_RIGHT``: all pixels get mirrored at half the width of the image - ``PIL.Image.Transpose.FLIP_TOP_BOTTOM``: @@ -438,13 +453,14 @@ def transpose_image(image, method): columns become rows (but counted from the bottom), i.e. all pixels get mirrored at the opposite diagonal; width becomes height and vice versa - + Return a new PIL.Image. """ LOG = getLogger('ocrd.utils.transpose_image') LOG.debug('transposing image with %s', membername(Image, method)) return image.transpose(method) + def crop_image(image, box=None): """"Crop an image to a rectangle, filling with background. @@ -475,10 +491,11 @@ def crop_image(image, box=None): else: background = background.median[0] new_image = Image.new(image.mode, (xywh['w'], xywh['h']), - background) # or 'white' + background) # or 'white' new_image.paste(image, (-xywh['x'], -xywh['y'])) return new_image + def image_from_polygon(image, polygon, fill='background', transparency=False): """"Mask an image with a polygon. @@ -497,7 +514,7 @@ def image_from_polygon(image, polygon, fill='background', transparency=False): Images which already have an alpha channel will have it shrunk from the polygon mask (i.e. everything outside the polygon will be transparent, in addition to existing transparent pixels). - + Return a new PIL.Image. 
""" if fill == 'none' or fill is None: @@ -521,22 +538,25 @@ def image_from_polygon(image, polygon, fill='background', transparency=False): # which can be inconsistent on unbinarized images): if image.mode in ['RGBA', 'LA']: # ensure transparency maximizes (i.e. parent mask AND mask): - mask = ImageChops.darker(mask, image.getchannel('A')) # min opaque + mask = ImageChops.darker(mask, image.getchannel('A')) # min opaque new_image.putalpha(mask) elif transparency and image.mode in ['RGB', 'L']: # introduce transparency: new_image.putalpha(mask) return new_image + def points_from_bbox(minx, miny, maxx, maxy): """Construct polygon coordinates in page representation from a numeric list representing a bounding box.""" return "%i,%i %i,%i %i,%i %i,%i" % ( minx, miny, maxx, miny, maxx, maxy, minx, maxy) + def points_from_polygon(polygon): """Convert polygon coordinates from a numeric list representation to a page representation.""" return " ".join("%i,%i" % (x, y) for x, y in polygon) + def points_from_xywh(box): """ Construct polygon coordinates in page representation from numeric dict representing a bounding box. @@ -549,6 +569,8 @@ def points_from_xywh(box): x + w, y + h, x, y + h ) + + def points_from_y0x0y1x1(yxyx): """ Construct a polygon representation from a rectangle described as a list [y0, x0, y1, x1] @@ -564,6 +586,7 @@ def points_from_y0x0y1x1(yxyx): x0, y1 ) + def points_from_x0y0x1y1(xyxy): """ Construct a polygon representation from a rectangle described as a list [x0, y0, x1, y1] @@ -579,10 +602,12 @@ def points_from_x0y0x1y1(xyxy): x0, y1 ) + def polygon_from_bbox(minx, miny, maxx, maxy): """Construct polygon coordinates in numeric list representation from a numeric list representing a bounding box.""" return [[minx, miny], [maxx, miny], [maxx, maxy], [minx, maxy]] + def polygon_from_x0y0x1y1(x0y0x1y1): """Construct polygon coordinates in numeric list representation from a string list representing a bounding box.""" minx = int(x0y0x1y1[0]) @@ -591,10 +616,12 @@ def polygon_from_x0y0x1y1(x0y0x1y1): maxy = int(x0y0x1y1[3]) return [[minx, miny], [maxx, miny], [maxx, maxy], [minx, maxy]] + def polygon_from_xywh(xywh): """Construct polygon coordinates in numeric list representation from numeric dict representing a bounding box.""" return polygon_from_bbox(*bbox_from_xywh(xywh)) + def xywh_from_bbox(minx, miny, maxx, maxy): """Convert a bounding box from a numeric list to a numeric dict representation.""" return { @@ -604,6 +631,7 @@ def xywh_from_bbox(minx, miny, maxx, maxy): 'h': maxy - miny, } + def xywh_from_points(points): """ Construct a numeric dict representing a bounding box from polygon coordinates in page representation. 
diff --git a/src/ocrd_utils/introspect.py b/src/ocrd_utils/introspect.py index 6734d316d..11cdf546a 100644 --- a/src/ocrd_utils/introspect.py +++ b/src/ocrd_utils/introspect.py @@ -23,6 +23,7 @@ file_manager = ExitStack() atexit.register(file_manager.close) + # Taken from https://github.com/OCR-D/core/pull/884 def freeze_args(func): """ @@ -41,6 +42,7 @@ def membername(class_, val): """Convert a member variable/constant into a member name string.""" return next((k for k, v in class_.__dict__.items() if v == val), str(val)) + def set_json_key_value_overrides(obj, *kvpairs): for kv in kvpairs: k, v = kv @@ -50,13 +52,16 @@ def set_json_key_value_overrides(obj, *kvpairs): obj[k] = v return obj -def resource_filename(pkg : str, fname : str) -> Path: + +def resource_filename(pkg: str, fname: str) -> Path: ref = importlib_resources.files(pkg) / fname return file_manager.enter_context(importlib_resources.as_file(ref)) -def resource_string(pkg : str, fname : str) -> str: + +def resource_string(pkg: str, fname: str) -> str: with open(resource_filename(pkg, fname), 'r', encoding='utf-8') as f: return f.read() -def dist_version(module : str) -> str: + +def dist_version(module: str) -> str: return importlib_metadata.version(module) diff --git a/src/ocrd_utils/logging.py b/src/ocrd_utils/logging.py index cd3d9e955..f11022f5a 100644 --- a/src/ocrd_utils/logging.py +++ b/src/ocrd_utils/logging.py @@ -73,12 +73,13 @@ 'FATAL': 'ERROR', } + def tf_disable_interactive_logs(): try: - from os import environ # pylint: disable=import-outside-toplevel + from os import environ # pylint: disable=import-outside-toplevel # This env variable must be set before importing from Keras environ['TF_CPP_MIN_LOG_LEVEL'] = '3' - from tensorflow.keras.utils import disable_interactive_logging # pylint: disable=import-outside-toplevel + from tensorflow.keras.utils import disable_interactive_logging # pylint: disable=import-outside-toplevel # Enabled interactive logging throws an exception # due to a call of sys.stdout.flush() disable_interactive_logging() @@ -86,6 +87,7 @@ def tf_disable_interactive_logs(): # Nothing should be handled here if TF is not available pass + def getLevelName(lvl): """ Get (string) python logging level for (string) spec-defined log level name. @@ -93,6 +95,7 @@ def getLevelName(lvl): lvl = _ocrdLevel2pythonLevel.get(lvl, lvl) return logging.getLevelName(lvl) + def getLogger(*args, **kwargs): """ Wrapper around ``logging.getLogger`` that calls :py:func:`initLogging` if @@ -101,6 +104,7 @@ def getLogger(*args, **kwargs): logger = logging.getLogger(*args, **kwargs) return logger + def setOverrideLogLevel(lvl, silent=not config.OCRD_LOGGING_DEBUG): """ Override the output log level of the handlers attached to the ``ocrd`` logger. @@ -119,6 +123,7 @@ def setOverrideLogLevel(lvl, silent=not config.OCRD_LOGGING_DEBUG): print(f'[LOGGING] Overriding {logger_name} log level to {lvl}', file=sys.stderr) logging.getLogger(logger_name).setLevel(lvl) + def get_logging_config_files(): """ Return a list of all ``ocrd_logging.conf`` files found in CWD, HOME or /etc. 
@@ -128,9 +133,9 @@ def get_logging_config_files(): Path.home(), Path('/etc'), ] - return [f for f \ - in [p / 'ocrd_logging.conf' for p in CONFIG_PATHS] \ - if f.exists()] + return [file for file in [path / 'ocrd_logging.conf' for path in CONFIG_PATHS] + if file.exists()] + def initLogging(builtin_only=False, force_reinit=False, silent=not config.OCRD_LOGGING_DEBUG): """ @@ -189,6 +194,7 @@ def initLogging(builtin_only=False, force_reinit=False, silent=not config.OCRD_L logging.getLogger(logger_name).setLevel(logger_level) _initialized_flag = True + def disableLogging(silent=not config.OCRD_LOGGING_DEBUG): """ Disables all logging of the ``ocrd`` logger and descendants @@ -196,7 +202,7 @@ def disableLogging(silent=not config.OCRD_LOGGING_DEBUG): Keyword Args: - silent (bool, True): Whether to log logging behavior by printing to stderr """ - global _initialized_flag # pylint: disable=global-statement + global _initialized_flag # pylint: disable=global-statement if _initialized_flag and not silent: print("[LOGGING] Disabling logging", file=sys.stderr) _initialized_flag = False @@ -212,4 +218,3 @@ def disableLogging(silent=not config.OCRD_LOGGING_DEBUG): logging.root.removeHandler(handler) # Python default log level is WARNING logging.root.setLevel(logging.WARNING) - diff --git a/src/ocrd_utils/os.py b/src/ocrd_utils/os.py index 024fe0af9..65b6a03aa 100644 --- a/src/ocrd_utils/os.py +++ b/src/ocrd_utils/os.py @@ -36,6 +36,7 @@ from .logging import getLogger from .introspect import resource_string + def abspath(url): """ Get a full path to a file or file URL @@ -46,6 +47,7 @@ def abspath(url): url = url[len('file://'):] return abspath_(url) + @contextmanager def pushd_popd(newcwd=None, tempdir=False): if newcwd and tempdir: @@ -67,6 +69,7 @@ def pushd_popd(newcwd=None, tempdir=False): finally: chdir(oldcwd) + def unzip_file_to_dir(path_to_zip, output_directory): """ Extract a ZIP archive to a directory @@ -74,13 +77,13 @@ def unzip_file_to_dir(path_to_zip, output_directory): with ZipFile(path_to_zip, 'r') as z: z.extractall(output_directory) + @lru_cache() def get_ocrd_tool_json(executable): """ Get the ``ocrd-tool`` description of ``executable``. """ ocrd_tool = {} - executable_name = Path(executable).name try: ocrd_all_tool = loads(resource_string('ocrd', 'ocrd-all-tool.json')) ocrd_tool = ocrd_all_tool[executable] @@ -93,6 +96,7 @@ def get_ocrd_tool_json(executable): ocrd_tool['resource_locations'] = ['data', 'cwd', 'system', 'module'] return ocrd_tool + @lru_cache() def get_moduledir(executable): moduledir = None @@ -106,6 +110,7 @@ def get_moduledir(executable): getLogger('ocrd.utils.get_moduledir').error(f'{executable} --dump-module-dir failed: {e}') return moduledir + def list_resource_candidates(executable, fname, cwd=getcwd(), moduled=None, xdg_data_home=None): """ Generate candidates for processor resources according to @@ -123,6 +128,7 @@ def list_resource_candidates(executable, fname, cwd=getcwd(), moduled=None, xdg_ candidates.append(join(moduled, fname)) return candidates + def list_all_resources(executable, moduled=None, xdg_data_home=None): """ List all processor resources in the filesystem according to @@ -164,7 +170,7 @@ def list_all_resources(executable, moduled=None, xdg_data_home=None): # code and data; `is_resource()` only singles out # files over directories; but we want data files only # todo: more code and cache exclusion patterns! 
- ['*.py', '*.py[cod]', '*~', 'ocrd-tool.json', + ['*.py', '*.py[cod]', '*~', 'ocrd-tool.json', 'environment.pickle', 'resource_list.yml', 'lib.bash']): continue candidates.append(resource) @@ -174,6 +180,7 @@ def list_all_resources(executable, moduled=None, xdg_data_home=None): candidates += parent.iterdir() return sorted([str(x) for x in candidates]) + def get_processor_resource_types(executable, ocrd_tool=None): """ Determine what type of resource parameters a processor needs. @@ -194,6 +201,7 @@ def get_processor_resource_types(executable, ocrd_tool=None): return [p['content-type'] for p in ocrd_tool['parameters'].values() if 'content-type' in p] + # ht @pabs3 # https://github.com/untitaker/python-atomicwrites/issues/42 class AtomicWriterPerms(AtomicWriter): @@ -210,6 +218,7 @@ def get_fileobject(self, **kwargs): chmod(fd, mode) return f + @contextmanager def atomic_write(fpath): with atomic_write_(fpath, writer_cls=AtomicWriterPerms, overwrite=True) as f: @@ -224,6 +233,7 @@ def is_file_in_directory(directory, file): file = Path(file) return list(file.parts)[:len(directory.parts)] == list(directory.parts) + def itertree(path): """ Generate a list of paths by recursively enumerating ``path`` @@ -235,6 +245,7 @@ def itertree(path): yield from itertree(subpath) yield path + def directory_size(path): """ Calculates size of all files in directory ``path`` @@ -242,7 +253,8 @@ def directory_size(path): path = Path(path) return sum(f.stat().st_size for f in path.glob('**/*') if f.is_file()) -def guess_media_type(input_file : str, fallback : str = None, application_xml : str = 'application/xml'): + +def guess_media_type(input_file: str, fallback: str = None, application_xml: str = 'application/xml'): """ Guess the media type of a file path """ @@ -259,6 +271,7 @@ def guess_media_type(input_file : str, fallback : str = None, application_xml : mimetype = application_xml return mimetype + @contextmanager def redirect_stderr_and_stdout_to_file(filename): with open(filename, 'at', encoding='utf-8') as f: diff --git a/src/ocrd_utils/str.py b/src/ocrd_utils/str.py index 13d03cc5b..c9d175621 100644 --- a/src/ocrd_utils/str.py +++ b/src/ocrd_utils/str.py @@ -37,13 +37,14 @@ def assert_file_grp_cardinality(grps, n, msg=None): if isinstance(grps, str): grps = grps.split(',') assert len(grps) == n, \ - "Expected exactly %d output file group%s%s, but '%s' has %d" % ( - n, - '' if n == 1 else 's', - ' (%s)' % msg if msg else '', - grps, - len(grps) - ) + "Expected exactly %d output file group%s%s, but '%s' has %d" % ( + n, + '' if n == 1 else 's', + ' (%s)' % msg if msg else '', + grps, + len(grps) + ) + def concat_padded(base, *args): """ @@ -54,18 +55,20 @@ def concat_padded(base, *args): if is_string(n): ret = "%s_%s" % (ret, n) else: - ret = "%s_%04i" % (ret, n) + ret = "%s_%04i" % (ret, n) return ret + def remove_non_path_from_url(url): """ Remove everything from URL after path. 
""" - url = url.split('?', 1)[0] # query - url = url.split('#', 1)[0] # fragment identifier - url = re.sub(r"/+$", "", url) # trailing slashes + url = url.split('?', 1)[0] # query + url = url.split('#', 1)[0] # fragment identifier + url = re.sub(r"/+$", "", url) # trailing slashes return url + def make_file_id(ocrd_file, output_file_grp): """ Derive a new file ID for an output file from an existing input file ``ocrd_file`` @@ -101,9 +104,12 @@ def make_file_id(ocrd_file, output_file_grp): ret = output_file_grp + '_' + ocrd_file.ID return make_xml_id(ret) + def make_xml_id(idstr: str) -> str: """ - Turn ``idstr`` into a valid ``xml:id`` literal by replacing ``:`` with ``_``, removing everything non-alphanumeric, ``.`` and ``-`` and prepending `id_` if ``idstr`` starts with a number. + Turn ``idstr`` into a valid ``xml:id`` literal by replacing ``:`` with ``_``, + removing everything non-alphanumeric, ``.`` and ``-`` and prepending `id_` + if ``idstr`` starts with a number. """ ret = idstr if not REGEX_FILE_ID.fullmatch(ret): @@ -113,6 +119,7 @@ def make_xml_id(idstr: str) -> str: ret = re.sub(r'[^\w.-]', r'', ret) return ret + def nth_url_segment(url, n=-1): """ Return the last /-delimited segment of a URL-like string @@ -127,6 +134,7 @@ def nth_url_segment(url, n=-1): except IndexError: return '' + def get_local_filename(url, start=None): """ Return local filename, optionally relative to ``start`` @@ -150,12 +158,14 @@ def get_local_filename(url, start=None): url = url[len(start):] return url + def is_local_filename(url): """ Whether a url is a local filename. """ # deprecation_warning("Deprecated so we spot inconsistent URL/file handling") - return url.startswith('file://') or not('://' in url) + return url.startswith('file://') or '://' not in url + def is_string(val): """ @@ -171,6 +181,7 @@ def parse_json_file_with_comments(val): with open(val, 'r', encoding='utf-8') as inputf: return parse_json_string_with_comments(inputf.read()) + def parse_json_string_with_comments(val): """ Parse a string of JSON interspersed with #-prefixed full-line comments @@ -178,6 +189,7 @@ def parse_json_string_with_comments(val): jsonstr = re.sub(r'^\s*#.*$', '', val, flags=re.MULTILINE) return json.loads(jsonstr) + def parse_json_string_or_file(*values, resolve_preset_file=None): # pylint: disable=unused-argument """ Parse a string as either the path to a JSON object or a literal JSON object. @@ -208,6 +220,7 @@ def parse_json_string_or_file(*values, resolve_preset_file=None): # pylint: d ret = {**ret, **value_parsed} return ret + def safe_filename(url): """ Sanitize input to be safely used as the basename of a local file. @@ -218,7 +231,8 @@ def safe_filename(url): # print('safe filename: %s -> %s' % (url, ret)) return ret -def generate_range(start : str, end : str) -> List[str]: + +def generate_range(start: str, end: str) -> List[str]: """ Generate a list of strings by incrementing the number part of ``start`` until including ``end``. 
""" @@ -228,7 +242,8 @@ def generate_range(start : str, end : str) -> List[str]: except IndexError: raise ValueError("Range '%s..%s': could not find numeric part" % (start, end)) if start[:-len(start_num)] != end[:-len(end_num)]: - raise ValueError(f"Range '{start}..{end}' differ in their non-numeric part: '{start[:-len(start_num)]}' != '{end[:-len(end_num)]}'") + raise ValueError(f"Range '{start}..{end}' differ in their non-numeric part: " + f"'{start[:-len(start_num)]}' != '{end[:-len(end_num)]}'") if start_num == end_num: warn("Range '%s..%s': evaluates to the same number") for i in range(int(start_num), int(end_num) + 1): @@ -261,7 +276,8 @@ def partition_list(lst, chunks, chunk_index=None): return [ret[chunk_index]] return ret -def sparkline(values : List[int]) -> str: + +def sparkline(values: List[int]) -> str: """ Render a list of points with block characters """ diff --git a/src/ocrd_validators/json_validator.py b/src/ocrd_validators/json_validator.py index f21a23afe..26eae45dc 100644 --- a/src/ocrd_validators/json_validator.py +++ b/src/ocrd_validators/json_validator.py @@ -7,9 +7,11 @@ from ocrd_models import ValidationReport + class JsonSchemaDeprecationWarning(ValidationError): pass + # http://python-jsonschema.readthedocs.io/en/latest/faq/ def extend_with_default(validator_class): """ @@ -34,6 +36,7 @@ def set_defaults_and_handle_deprecate(validator, properties, instance, schema): DefaultValidatingDraft20199Validator = extend_with_default(Draft201909Validator) + # # ------------------------------------------------- # @@ -54,7 +57,7 @@ def validate(obj, schema): """ if isinstance(obj, str): obj = json.loads(obj) - return JsonValidator(schema)._validate(obj) # pylint: disable=protected-access + return JsonValidator(schema)._validate(obj) # pylint: disable=protected-access def __init__(self, schema, validator_class=Draft201909Validator): """ diff --git a/src/ocrd_validators/ocrd_tool_validator.py b/src/ocrd_validators/ocrd_tool_validator.py index 00a402c12..9b5888e33 100644 --- a/src/ocrd_validators/ocrd_tool_validator.py +++ b/src/ocrd_validators/ocrd_tool_validator.py @@ -6,6 +6,7 @@ from .constants import OCRD_TOOL_SCHEMA from .json_validator import DefaultValidatingDraft20199Validator, JsonValidator + # # ------------------------------------------------- # @@ -20,7 +21,7 @@ def validate(obj, schema=OCRD_TOOL_SCHEMA): """ Validate against ``ocrd-tool.json`` schema. 
""" - return OcrdToolValidator(schema)._validate(obj) # pylint: disable=protected-access + return OcrdToolValidator(schema)._validate(obj) # pylint: disable=protected-access def __init__(self, schema): super().__init__(schema, validator_class=DefaultValidatingDraft20199Validator) diff --git a/src/ocrd_validators/ocrd_zip_validator.py b/src/ocrd_validators/ocrd_zip_validator.py index 5f9d21f8c..be347a230 100644 --- a/src/ocrd_validators/ocrd_zip_validator.py +++ b/src/ocrd_validators/ocrd_zip_validator.py @@ -8,12 +8,13 @@ from ocrd_utils import getLogger, unzip_file_to_dir -from bagit import Bag, BagValidationError # pylint: disable=no-name-in-module -from bagit_profile import Profile, ProfileValidationError # pylint: disable=no-name-in-module +from bagit import Bag, BagValidationError # pylint: disable=no-name-in-module +from bagit_profile import Profile, ProfileValidationError # pylint: disable=no-name-in-module from .constants import OCRD_BAGIT_PROFILE, OCRD_BAGIT_PROFILE_URL, TMP_BAGIT_PREFIX from ocrd_models import ValidationReport + # # ------------------------------------------------- # @@ -58,7 +59,8 @@ def _validate_bag(self, bag, **kwargs): # for d in e.details: # log = getLogger('ocrd.ocrd_zip_validator') # if isinstance(d, ChecksumMismatch): - # log.error("Validation Error: expected %s to have %s checksum of %s but found %s", d.path, d.algorithm, d.expected, d.found) + # log.error("Validation Error: expected %s to have %s checksum of %s but found %s", + # d.path, d.algorithm, d.expected, d.found) # else: # log.error("Validation Error: %s", d) if failed: @@ -89,7 +91,6 @@ def validate(self, skip_checksums=False, skip_bag=False, skip_unzip=False, skip_ bagdir = mkdtemp(prefix=TMP_BAGIT_PREFIX) unzip_file_to_dir(self.path_to_zip, bagdir) - try: bag = Bag(bagdir) self._validate_profile(bag) diff --git a/src/ocrd_validators/page_validator.py b/src/ocrd_validators/page_validator.py index 0459f1781..0d4e6666c 100644 --- a/src/ocrd_validators/page_validator.py +++ b/src/ocrd_validators/page_validator.py @@ -119,6 +119,7 @@ def __init__(self, tag, ID, file_id, actual, expected): f"INCONSISTENCY in {tag} ID '{ID}' of file '{file_id}': " f"text results '{actual}' != concatenated '{expected}'") + class CoordinateConsistencyError(Exception): """ Exception representing a consistency error in coordinate confinement across levels of a PAGE-XML. @@ -145,6 +146,7 @@ def __init__(self, tag, ID, file_id, outer, inner): f"INCONSISTENCY in {tag} ID '{ID}' of '{file_id}': " f"coords '{inner}' not within parent coords '{outer}'") + class CoordinateValidityError(Exception): """ Exception representing a validity error of an element's coordinates in PAGE-XML. @@ -169,12 +171,14 @@ def __init__(self, tag, ID, file_id, points, reason='unknown'): super().__init__( f"INVALIDITY in {tag} ID '{ID}' of '{file_id}': coords '{points}' - {reason}") + def compare_without_whitespace(a, b): """ Compare two strings, ignoring all whitespace. """ return re.sub('\\s+', '', a) == re.sub('\\s+', '', b) + def page_get_reading_order(ro, rogroup): """ Add all elements from the given reading order group to the given dictionary. 
@@ -197,6 +201,7 @@ def page_get_reading_order(ro, rogroup): if not isinstance(elem, (RegionRefType, RegionRefIndexedType)): page_get_reading_order(ro, elem) + def make_poly(polygon_points): """Instantiate a Polygon from a list of point pairs, or return an error string""" if len(polygon_points) < 4: @@ -212,6 +217,7 @@ def make_poly(polygon_points): return 'is negative' return poly + def make_line(line_points): """Instantiate a LineString from a list of point pairs, or return an error string""" if len(line_points) < 2: @@ -225,6 +231,7 @@ def make_line(line_points): return 'is negative' return line + @deprecated_alias(strictness='page_textequiv_consistency') @deprecated_alias(strategy='page_textequiv_strategy') def validate_consistency(node, page_textequiv_consistency, page_textequiv_strategy, @@ -239,7 +246,7 @@ def validate_consistency(node, page_textequiv_consistency, page_textequiv_strate if isinstance(node, (PcGtsType, OcrdPage)): # top-level (start recursion) node_id = node.get_pcGtsId() - node = node.get_Page() # has no .id + node = node.get_Page() # has no .id if not readingOrder: readingOrder = {} ro = node.get_ReadingOrder() @@ -247,13 +254,13 @@ def validate_consistency(node, page_textequiv_consistency, page_textequiv_strate page_get_reading_order(readingOrder, ro.get_OrderedGroup() or ro.get_UnorderedGroup()) if not joinRelations: joinRelations = [] - relations = node.get_Relations() # get RelationsType + relations = node.get_Relations() # get RelationsType if relations: - relations = relations.get_Relation() # get list of RelationType + relations = relations.get_Relation() # get list of RelationType else: relations = [] for relation in relations: - if relation.get_type() == 'join': # ignore 'link' type here + if relation.get_type() == 'join': # ignore 'link' type here joinRelations.append((relation.get_SourceRegionRef().get_regionRef(), relation.get_TargetRegionRef().get_regionRef())) elif isinstance(node, GlyphType): @@ -277,7 +284,7 @@ def validate_consistency(node, page_textequiv_consistency, page_textequiv_strate parent_points, node_poly)) log.debug("Invalid coords of %s %s", tag, node_id) consistent = False - node_poly = None # don't use in further comparisons + node_poly = None # don't use in further comparisons else: node_poly = None for class_, getterLO, getterRD in _ORDER[1:]: @@ -314,7 +321,7 @@ def validate_consistency(node, page_textequiv_consistency, page_textequiv_strate # report.add_error(CoordinateValidityError(child_tag, child.id, file_id, child_points)) # log.debug("Invalid coords of %s %s", child_tag, child.id) # consistent = False - pass # already reported in recursive call above + pass # already reported in recursive call above elif not child_poly.within(node_poly.buffer(PARENT_SLACK)): # TODO: automatic repair? report.add_error(CoordinateConsistencyError(child_tag, child.id, file_id, @@ -344,13 +351,14 @@ def validate_consistency(node, page_textequiv_consistency, page_textequiv_strate if page_textequiv_consistency == 'fix': log.debug("Repaired text of %s %s", tag, node_id) set_text(node, concatenated, page_textequiv_strategy) - elif (page_textequiv_consistency == 'strict' # or 'lax' but... + elif (page_textequiv_consistency == 'strict' # or 'lax' but... 
or not compare_without_whitespace(concatenated, text_results)): log.debug("Inconsistent text of %s %s", tag, node_id) report.add_error(ConsistencyError(tag, node_id, file_id, text_results, concatenated)) return consistent + def concatenate(nodes, concatenate_with, page_textequiv_strategy, joins=None): """ Concatenate nodes textually according to https://ocr-d.github.io/page#consistency-of-text-results-on-different-levels @@ -367,6 +375,7 @@ def concatenate(nodes, concatenate_with, page_textequiv_strategy, joins=None): result += get_text(next_node, page_textequiv_strategy) return result.strip() + def get_text(node, page_textequiv_strategy='first'): """ Get the first or most confident among text results (depending on ``page_textequiv_strategy``). @@ -399,6 +408,7 @@ def get_text(node, page_textequiv_strategy='first'): # fall back to first element return textEquivs[0].get_Unicode().strip() + def set_text(node, text, page_textequiv_strategy): """ Set the first or most confident among text results (depending on ``page_textequiv_strategy``). @@ -410,7 +420,7 @@ def set_text(node, text, page_textequiv_strategy): text = text.strip() textEquivs = node.get_TextEquiv() if not textEquivs: - node.add_TextEquiv(TextEquivType(Unicode=text)) # or index=0 ? + node.add_TextEquiv(TextEquivType(Unicode=text)) # or index=0 ? elif page_textequiv_strategy == 'best': if len(textEquivs) > 1: textEquivsSorted = sorted([x for x in textEquivs if x.conf], @@ -432,6 +442,7 @@ def set_text(node, text, page_textequiv_strategy): # fall back to first element textEquivs[0].set_Unicode(text) + class PageValidator(): """ Validator for `OcrdPage <../ocrd_models/ocrd_models.ocrd_page.html>`. @@ -477,5 +488,6 @@ def validate(filename=None, ocrd_page=None, ocrd_file=None, raise ValueError("page_textequiv_consistency level %s not implemented" % page_textequiv_consistency) report = ValidationReport() log.info("Validating input file '%s'", file_id) - validate_consistency(page, page_textequiv_consistency, page_textequiv_strategy, check_baseline, check_coords, report, file_id) + validate_consistency(page, page_textequiv_consistency, page_textequiv_strategy, check_baseline, check_coords, + report, file_id) return report diff --git a/src/ocrd_validators/parameter_validator.py b/src/ocrd_validators/parameter_validator.py index ca2a7ed8e..f537f82f4 100644 --- a/src/ocrd_validators/parameter_validator.py +++ b/src/ocrd_validators/parameter_validator.py @@ -3,6 +3,7 @@ """ from .json_validator import DefaultValidatingDraft20199Validator, JsonValidator + # # ------------------------------------------------- # @@ -12,7 +13,7 @@ class ParameterValidator(JsonValidator): JsonValidator validating parametersagains ocrd-tool.json. 
""" - def validate(self, *args, **kwargs): # pylint: disable=arguments-differ + def validate(self, *args, **kwargs): # pylint: disable=arguments-differ """ Validate a parameter dict against a parameter schema from an ocrd-tool.json @@ -39,7 +40,7 @@ def __init__(self, ocrd_tool): if 'required' in p[n]: if p[n]['required']: required.append(n) - del(p[n]['required']) + del p[n]['required'] super().__init__({ "type": "object", "required": required, diff --git a/src/ocrd_validators/resource_list_validator.py b/src/ocrd_validators/resource_list_validator.py index 47f3c81a9..fcf165407 100644 --- a/src/ocrd_validators/resource_list_validator.py +++ b/src/ocrd_validators/resource_list_validator.py @@ -6,6 +6,7 @@ from .constants import RESOURCE_LIST_SCHEMA from .json_validator import DefaultValidatingDraft20199Validator, JsonValidator + # # ------------------------------------------------- # @@ -22,4 +23,5 @@ def validate(obj, schema=None): """ if schema is None: schema = RESOURCE_LIST_SCHEMA - return JsonValidator(schema, validator_class=DefaultValidatingDraft20199Validator)._validate(obj) # pylint: disable=protected-access + validator = JsonValidator(schema, validator_class=DefaultValidatingDraft20199Validator) + return validator._validate(obj) # pylint: disable=protected-access diff --git a/src/ocrd_validators/workspace_validator.py b/src/ocrd_validators/workspace_validator.py index cd647a9af..f60ce5a48 100644 --- a/src/ocrd_validators/workspace_validator.py +++ b/src/ocrd_validators/workspace_validator.py @@ -15,6 +15,7 @@ from .xsd_page_validator import XsdPageValidator from .xsd_mets_validator import XsdMetsValidator + # # ------------------------------------------------- # @@ -57,7 +58,8 @@ def check_file_grp(workspace, input_file_grp=None, output_file_grp=None, page_id if page_id: for one_page_id in page_id: if next(workspace.mets.find_files(fileGrp=grp, pageId=one_page_id), None): - report.add_error("Output fileGrp[@USE='%s'] already contains output for page %s" % (grp, one_page_id)) + report.add_error("Output fileGrp[@USE='%s'] already contains output for page %s" % ( + grp, one_page_id)) else: report.add_error("Output fileGrp[@USE='%s'] already in METS!" % grp) return report @@ -121,10 +123,10 @@ def validate(*args, **kwargs): resolver (:class:`ocrd.Resolver`): Resolver mets_url (string): URL of the METS file src_dir (string, None): Directory containing mets file - skip (list): Validation checks to omit. One or more of + skip (list): Validation checks to omit. 
One or more of 'mets_unique_identifier', 'mets_files', 'pixel_density', 'dimension', 'url', - 'multipage', 'page', 'page_xsd', 'mets_xsd', + 'multipage', 'page', 'page_xsd', 'mets_xsd', 'mets_fileid_page_pcgtsid' download (boolean): Whether to download remote file references temporarily during validation (like a processor would) @@ -133,7 +135,7 @@ def validate(*args, **kwargs): report (:class:`ValidationReport`) Report on the validity """ validator = WorkspaceValidator(*args, **kwargs) - return validator._validate() # pylint: disable=protected-access + return validator._validate() # pylint: disable=protected-access def _validate(self): """ @@ -141,7 +143,7 @@ def _validate(self): """ try: self._resolve_workspace() - except Exception as e: # pylint: disable=broad-except + except Exception as e: # pylint: disable=broad-except self.log.warning("Failed to instantiate workspace: %s", e) self.report.add_error(f"Failed to instantiate workspace: {e}") return self.report @@ -159,7 +161,7 @@ def _validate(self): self._validate_mets_xsd() if self.page_checks: self._validate_page() - except Exception: # pylint: disable=broad-except + except Exception: # pylint: disable=broad-except self.report.add_error(f"Validation aborted with exception: {format_exc()}") return self.report @@ -216,9 +218,11 @@ def _validate_dimension(self): page = page_from_file(f).get_Page() _, _, exif = self.workspace.image_from_page(page, f.pageId) if page.imageHeight != exif.height: - self.report.add_error(f"PAGE '{f.ID}': @imageHeight != image's actual height ({page.imageHeight} != {exif.height})") + self.report.add_error(f"PAGE '{f.ID}': @imageHeight != image's actual height " + f"({page.imageHeight} != {exif.height})") if page.imageWidth != exif.width: - self.report.add_error(f"PAGE '{f.ID}': @imageWidth != image's actual width ({page.imageWidth} != {exif.width})") + self.report.add_error(f"PAGE '{f.ID}': @imageWidth != image's actual width " + f"({page.imageWidth} != {exif.width})") def _validate_multipage(self): """ @@ -237,7 +241,8 @@ def _validate_multipage(self): if exif.n_frames > 1: self.report.add_error(f"Image '{f.ID}': More than 1 frame: {exif.n_frames}") except FileNotFoundError: - self.report.add_error(f"Image '{f.ID}': Could not retrieve (local_filename='{f.local_filename}', url='{f.url}')") + self.report.add_error(f"Image '{f.ID}': Could not retrieve " + f"(local_filename='{f.local_filename}', url='{f.url}')") return def _validate_pixel_density(self): @@ -293,10 +298,11 @@ def _validate_mets_files(self): except StopIteration: self.report.add_error("No files") for f in self.mets.find_files(**self.find_kwargs): - if f._el.get('GROUPID'): # pylint: disable=protected-access + if f._el.get('GROUPID'): # pylint: disable=protected-access self.report.add_notice(f"File '{f.ID}' has GROUPID attribute - document might need an update") if not (f.url or f.local_filename): - self.report.add_error(f"File '{f.ID}' has neither mets:Flocat[@LOCTYPE='URL']/@xlink:href nor mets:FLocat[@LOCTYPE='OTHER'][@OTHERLOCTYPE='FILE']/xlink:href") + self.report.add_error(f"File '{f.ID}' has neither mets:Flocat[@LOCTYPE='URL']/@xlink:href " + "nor mets:FLocat[@LOCTYPE='OTHER'][@OTHERLOCTYPE='FILE']/xlink:href") continue if f.url and 'url' not in self.skip: if re.match(r'^file:/[^/]', f.url): @@ -322,19 +328,22 @@ def _validate_page(self): for err in XsdPageValidator.validate(Path(f.local_filename)).errors: self.report.add_error("%s: %s" % (f.ID, err)) if 'page' in self.page_checks: - page_report = PageValidator.validate(ocrd_file=f, - 
page_textequiv_consistency=self.page_strictness, - check_coords=self.page_coordinate_consistency in ['poly', 'both'], - check_baseline=self.page_coordinate_consistency in ['baseline', 'both']) + page_report = PageValidator.validate( + ocrd_file=f, + page_textequiv_consistency=self.page_strictness, + check_coords=self.page_coordinate_consistency in ['poly', 'both'], + check_baseline=self.page_coordinate_consistency in ['baseline', 'both']) self.report.merge_report(page_report) pcgts = page_from_file(f) page = pcgts.get_Page() if 'dimension' in self.page_checks: img = self.workspace._resolve_image_as_pil(page.imageFilename) if page.imageHeight != img.height: - self.report.add_error(f"PAGE '{f.ID}': @imageHeight != image's actual height ({page.imageHeight} != {img.height})") + self.report.add_error(f"PAGE '{f.ID}': @imageHeight != image's actual height " + f"({page.imageHeight} != {img.height})") if page.imageWidth != img.width: - self.report.add_error(f"PAGE '{f.ID}': @imageWidth != image's actual width ({page.imageWidth} != {img.width})") + self.report.add_error(f"PAGE '{f.ID}': @imageWidth != image's actual width " + f"({page.imageWidth} != {img.width})") if 'imagefilename' in self.page_checks: imageFilename = page.imageFilename if is_local_filename(imageFilename): @@ -344,7 +353,8 @@ def _validate_page(self): if not self.mets.find_files(**kwargs): self.report.add_error(f"PAGE '{f.ID}': imageFilename '{imageFilename}' not found in METS") if is_local_filename(imageFilename) and not Path(imageFilename).exists(): - self.report.add_warning(f"PAGE '{f.ID}': imageFilename '{imageFilename}' points to non-existent local file") + self.report.add_warning(f"PAGE '{f.ID}': imageFilename '{imageFilename}' " + "points to non-existent local file") if 'alternativeimage_filename' in self.page_checks: for altimg in page.get_AllAlternativeImages(): if is_local_filename(altimg.filename): @@ -368,8 +378,8 @@ def _validate_page(self): self.report.add_error(f"PAGE '{f.ID}': {altimg.parent_object_.id} AlternativeImage " f"'{altimg.filename}' feature '{feature}' not standardized for PAGE") if 'mets_fileid_page_pcgtsid' in self.page_checks and pcgts.pcGtsId != f.ID: - self.report.add_warning('pc:PcGts/@pcGtsId differs from mets:file/@ID: "%s" !== "%s"' % (pcgts.pcGtsId or '', f.ID or '')) - + self.report.add_warning('pc:PcGts/@pcGtsId differs from mets:file/@ID: "%s" !== "%s"' % ( + pcgts.pcGtsId or '', f.ID or '')) def _validate_page_xsd(self): """ diff --git a/src/ocrd_validators/xsd_mets_validator.py b/src/ocrd_validators/xsd_mets_validator.py index 87e524e17..6f37b0921 100644 --- a/src/ocrd_validators/xsd_mets_validator.py +++ b/src/ocrd_validators/xsd_mets_validator.py @@ -1,6 +1,7 @@ from .xsd_validator import XsdValidator from .constants import XSD_METS_URL + class XsdMetsValidator(XsdValidator): """ XML Schema validator. @@ -14,4 +15,4 @@ def validate(cls, doc): # pylint: disable=arguments-differ Args: doc (etree.ElementTree|str|bytes): """ - return cls.instance(XSD_METS_URL)._validate(doc) # pylint: disable=protected-access + return cls.instance(XSD_METS_URL)._validate(doc) # pylint: disable=protected-access diff --git a/src/ocrd_validators/xsd_page_validator.py b/src/ocrd_validators/xsd_page_validator.py index eec6105f6..3bfad6930 100644 --- a/src/ocrd_validators/xsd_page_validator.py +++ b/src/ocrd_validators/xsd_page_validator.py @@ -1,6 +1,7 @@ from .xsd_validator import XsdValidator from .constants import XSD_PAGE_URL + class XsdPageValidator(XsdValidator): """ XML Schema validator. 
@@ -14,4 +15,4 @@ def validate(cls, doc): # pylint: disable=arguments-differ
        Args:
            doc (etree.ElementTree|str|bytes):
        """
-        return cls.instance(XSD_PAGE_URL)._validate(doc) # pylint: disable=protected-access
+        return cls.instance(XSD_PAGE_URL)._validate(doc)  # pylint: disable=protected-access
diff --git a/src/ocrd_validators/xsd_validator.py b/src/ocrd_validators/xsd_validator.py
index 92e450212..247d3d84a 100644
--- a/src/ocrd_validators/xsd_validator.py
+++ b/src/ocrd_validators/xsd_validator.py
@@ -9,6 +9,7 @@

from .constants import XSD_PATHS

+
#
# -------------------------------------------------
#
@@ -35,7 +36,7 @@ def validate(cls, schema_url, doc):
            doc (etree.ElementTree|str|bytes):
            schema_url (str): URI of XML schema to validate against.
        """
-        return cls.instance(schema_url)._validate(doc) # pylint: disable=protected-access
+        return cls.instance(schema_url)._validate(doc)  # pylint: disable=protected-access

    def __init__(self, schema_url):
        """
@@ -55,7 +56,8 @@ def _validate(self, doc):
        Do the actual validation.

        Arguments:
-            doc (etree.ElementTree|str|bytes|pathlib.Path): the document. if etree: us as-is. if str/bytes: parse as XML string. If Path: read_text on it
+            doc (etree.ElementTree|str|bytes|pathlib.Path): the document.
+                (If etree: use as-is. If str/bytes: parse as XML string. If Path: read_text on it.)

        Returns:
            ValidationReport
        """
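Aside: the validator entry points cleaned up above are all thin classmethod wrappers around the protected `_validate`. A minimal usage sketch, assuming `ocrd_validators` exports `ParameterValidator` as used earlier in this patch (the tool fragment below is made up for illustration):

from ocrd_validators import ParameterValidator

# hypothetical ocrd-tool.json fragment, for illustration only
ocrd_tool = {
    'executable': 'ocrd-example',
    'parameters': {
        'dpi': {'type': 'number', 'default': 300},
        'level-of-operation': {'type': 'string', 'enum': ['page', 'region'], 'default': 'page'},
    },
}
parameters = {'dpi': 600}
report = ParameterValidator(ocrd_tool).validate(parameters)
print(report.is_valid, report.errors)
# the DefaultValidatingDraft20199Validator shown above also fills in schema
# defaults, so `parameters` should now contain 'level-of-operation': 'page' too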