diff --git a/src/bigocrpdf/cli.py b/src/bigocrpdf/cli.py index 8a14adb..b2ba07a 100644 --- a/src/bigocrpdf/cli.py +++ b/src/bigocrpdf/cli.py @@ -414,9 +414,12 @@ def build_parser() -> argparse.ArgumentParser: info_p = sub.add_parser("info", help=_("Show PDF metadata and page count")) info_p.add_argument("input", type=Path, help=_("Input PDF file")) + # --- export subcommands share the same input-PDF help text --- + input_pdf_with_text_help = _("Input PDF file (must have text layer)") + # --- export-odf --- odf_p = sub.add_parser("export-odf", help=_("Export OCR'd PDF as ODF document")) - odf_p.add_argument("input", type=Path, help=_("Input PDF file (must have text layer)")) + odf_p.add_argument("input", type=Path, help=input_pdf_with_text_help) odf_p.add_argument( "-o", "--output", @@ -427,7 +430,7 @@ def build_parser() -> argparse.ArgumentParser: # --- export-txt --- txt_p = sub.add_parser("export-txt", help=_("Export OCR'd PDF as formatted text")) - txt_p.add_argument("input", type=Path, help=_("Input PDF file (must have text layer)")) + txt_p.add_argument("input", type=Path, help=input_pdf_with_text_help) txt_p.add_argument( "-o", "--output", @@ -436,6 +439,22 @@ def build_parser() -> argparse.ArgumentParser: help=_("Output text file (default: same name as input with .txt)"), ) + # --- export-md --- + md_p = sub.add_parser("export-md", help=_("Export OCR'd PDF as Markdown")) + md_p.add_argument("input", type=Path, help=input_pdf_with_text_help) + md_p.add_argument( + "-o", + "--output", + type=Path, + default=None, + help=_("Output Markdown file (default: same name as input with .md)"), + ) + md_p.add_argument( + "--front-matter", + action="store_true", + help=_("Prepend YAML front-matter (title, source, page count, date)."), + ) + # --- edit --- edit_p = sub.add_parser("edit", help=_("Open interactive GUI editor")) edit_p.add_argument("input", type=Path, help=_("PDF file to edit")) @@ -460,11 +479,13 @@ def _cmd_ocr(args: argparse.Namespace, logger: 
logging.Logger) -> int: scanner_enabled = True # Enable preprocessing master switch if any enhancement is requested - enable_preprocessing = any([ - args.auto_contrast, - args.auto_brightness, - args.denoise, - ]) + enable_preprocessing = any( + [ + args.auto_contrast, + args.auto_brightness, + args.denoise, + ] + ) config = OCRConfig( language=args.language, @@ -718,6 +739,23 @@ def _cmd_export_txt(args: argparse.Namespace, logger: logging.Logger) -> int: return 0 +def _cmd_export_md(args: argparse.Namespace, logger: logging.Logger) -> int: + """Handle the 'export-md' command.""" + from bigocrpdf.utils.tsv_odf_converter import convert_pdf_to_markdown + + if args.output: + md_path = str(args.output) + else: + md_path = str(args.input.with_suffix(".md")) + + logger.info(f"Converting {args.input} → {md_path}") + text = convert_pdf_to_markdown(str(args.input), include_front_matter=args.front_matter) + with open(md_path, "w", encoding="utf-8") as f: + f.write(text) + print(f"Saved: {md_path}") + return 0 + + def _cmd_edit(args: argparse.Namespace, logger: logging.Logger) -> int: """Handle the 'edit' command — launch GUI editor directly.""" try: @@ -978,6 +1016,7 @@ def main(argv: list[str] | None = None) -> int: "info": _cmd_info, "export-odf": _cmd_export_odf, "export-txt": _cmd_export_txt, + "export-md": _cmd_export_md, "edit": _cmd_edit, } diff --git a/src/bigocrpdf/ui/conclusion_export_mixin.py b/src/bigocrpdf/ui/conclusion_export_mixin.py index 5989d29..c6b7757 100644 --- a/src/bigocrpdf/ui/conclusion_export_mixin.py +++ b/src/bigocrpdf/ui/conclusion_export_mixin.py @@ -12,6 +12,9 @@ from bigocrpdf.utils.i18n import _ from bigocrpdf.utils.logger import logger +_EXPORT_FAILED_MSG = _("Export failed") +_NOTIFY_ACTIVE = "notify::active" + class ConclusionExportMixin: """Mixin providing ODF export functionality for the conclusion page.""" @@ -54,7 +57,7 @@ def _show_odf_export_options_dialog(self, file_path: str) -> None: images_row.set_subtitle(_("Embed page images 
alongside text")) images_row.set_active(init_images) images_row.connect( - "notify::active", + _NOTIFY_ACTIVE, lambda row, _p: self._update_odf_setting( "odf_include_images", row.get_active(), switch_state, "images" ), @@ -66,7 +69,7 @@ def _show_odf_export_options_dialog(self, file_path: str) -> None: open_row.set_subtitle(_("Open file in the default application")) open_row.set_active(init_open) open_row.connect( - "notify::active", + _NOTIFY_ACTIVE, lambda row, _p: self._update_odf_setting( "odf_open_after_export", row.get_active(), switch_state, "open_after" ), @@ -184,7 +187,7 @@ def _on_odf_save_response(self, dialog: Gtk.FileDialog, result, file_path: str) except Exception as e: if "Dismissed" not in str(e): logger.error(f"Error exporting to ODF: {e}") - self.window.show_toast(_("Export failed")) + self.window.show_toast(_EXPORT_FAILED_MSG) def _export_odf_file(self, output_path: str, file_path: str) -> None: """Export content to ODF using the TSV-based formatted converter. @@ -303,7 +306,520 @@ def _report_export_result(self, success: bool, output_path: str) -> None: open_file_with_default_app(output_path) else: - self.window.show_toast(_("Export failed")) + self.window.show_toast(_EXPORT_FAILED_MSG) + + # ── Shared export helpers ───────────────────────────────────────── + + @staticmethod + def _is_user_dismissed(exc: Exception) -> bool: + """Tell whether a Gtk.FileDialog error came from the user closing it.""" + # FileDialog raises a GError whose message starts with "Dismissed by user". + # There is no public symbolic constant in the introspected bindings. + return "Dismissed" in str(exc) + + @staticmethod + def _unique_path(path: str) -> str: + """Return *path*, or ``path (1)``, ``path (2)``, … until it doesn't exist. + + Used by bulk export to avoid silently overwriting a file at the + destination — single-file flows already get FileDialog's native + overwrite confirmation. 
+ """ + if not os.path.exists(path): + return path + stem, ext = os.path.splitext(path) + for n in range(1, 1000): + candidate = f"{stem} ({n}){ext}" + if not os.path.exists(candidate): + return candidate + # Extremely unlikely; fall back to overwrite rather than loop forever. + return path + + def _build_progress_dialog( + self, + title_text: str, + subtitle_text: str, + total: int | None = None, + ): + """Build a standard cancellable progress dialog. + + Returns ``(dialog, update_progress, cancel_event)`` where + ``update_progress(done, name)`` is safe to invoke via ``GLib.idle_add`` + and ``cancel_event`` is a :class:`threading.Event` set when the user + clicks Cancel. + """ + import threading + + cancel_event = threading.Event() + + dialog = Adw.Dialog() + dialog.set_title(title_text) + dialog.set_content_width(360) + dialog.set_can_close(False) + + toolbar_view = Adw.ToolbarView() + header = Adw.HeaderBar() + header.set_show_start_title_buttons(False) + header.set_show_end_title_buttons(False) + toolbar_view.add_top_bar(header) + + box = Gtk.Box(orientation=Gtk.Orientation.VERTICAL, spacing=16) + box.set_margin_start(32) + box.set_margin_end(32) + box.set_margin_top(24) + box.set_margin_bottom(32) + box.set_halign(Gtk.Align.CENTER) + box.set_valign(Gtk.Align.CENTER) + + spinner = Gtk.Spinner() + spinner.set_size_request(40, 40) + spinner.start() + spinner.set_halign(Gtk.Align.CENTER) + box.append(spinner) + + title_label = Gtk.Label(label=title_text) + title_label.add_css_class("title-4") + title_label.set_halign(Gtk.Align.CENTER) + box.append(title_label) + + subtitle_label = Gtk.Label(label=subtitle_text) + subtitle_label.add_css_class("dim-label") + subtitle_label.set_halign(Gtk.Align.CENTER) + box.append(subtitle_label) + + progress_bar: Gtk.ProgressBar | None = None + if total is not None and total > 0: + progress_bar = Gtk.ProgressBar() + progress_bar.set_fraction(0.0) + box.append(progress_bar) + + cancel_btn = Gtk.Button(label=_("Cancel")) + 
cancel_btn.add_css_class("destructive-action") + cancel_btn.add_css_class("pill") + cancel_btn.set_halign(Gtk.Align.CENTER) + cancel_btn.set_margin_top(8) + set_a11y_label(cancel_btn, _("Cancel")) + cancel_btn.connect("clicked", lambda _b: cancel_event.set()) + box.append(cancel_btn) + + toolbar_view.set_content(box) + dialog.set_child(toolbar_view) + dialog.present(self.window) + + def update_progress(done: int, name: str) -> bool: + if total: + subtitle_label.set_text(f"{done}/{total} — {name}") + if progress_bar is not None: + progress_bar.set_fraction(done / total) + else: + subtitle_label.set_text(name) + return False + + return dialog, update_progress, cancel_event + + # ── Markdown export ──────────────────────────────────────────────── + + def _show_markdown_export_options_dialog(self, file_path: str) -> None: + """Show export options dialog for Markdown export.""" + dialog = Adw.Dialog() + dialog.set_title(_("Export to Markdown")) + dialog.set_content_width(380) + + toolbar_view = Adw.ToolbarView() + toolbar_view.add_top_bar(Adw.HeaderBar()) + + content_box = Gtk.Box(orientation=Gtk.Orientation.VERTICAL, spacing=24) + content_box.set_margin_start(24) + content_box.set_margin_end(24) + content_box.set_margin_top(12) + content_box.set_margin_bottom(24) + + options_group = Adw.PreferencesGroup() + + settings = self.window.settings + init_fm = getattr(settings, "md_include_front_matter", False) + init_open = getattr(settings, "md_open_after_export", False) + state = {"front_matter": init_fm, "open_after": init_open} + + fm_row = Adw.SwitchRow() + fm_row.set_title(_("Include YAML front-matter")) + fm_row.set_subtitle(_("Adds title, source path, page count and date")) + fm_row.set_active(init_fm) + fm_row.connect( + _NOTIFY_ACTIVE, + lambda row, _p: self._update_md_setting( + "md_include_front_matter", row.get_active(), state, "front_matter" + ), + ) + options_group.add(fm_row) + + open_row = Adw.SwitchRow() + open_row.set_title(_("Open after export")) + 
open_row.set_subtitle(_("Open file in the default application")) + open_row.set_active(init_open) + open_row.connect( + _NOTIFY_ACTIVE, + lambda row, _p: self._update_md_setting( + "md_open_after_export", row.get_active(), state, "open_after" + ), + ) + options_group.add(open_row) + + content_box.append(options_group) + + btn_content = Adw.ButtonContent() + btn_content.set_icon_name("document-save-symbolic") + btn_content.set_label(_("Export")) + + export_btn = Gtk.Button() + export_btn.set_child(btn_content) + export_btn.add_css_class("suggested-action") + export_btn.add_css_class("pill") + export_btn.set_halign(Gtk.Align.CENTER) + set_a11y_label(export_btn, _("Export")) + export_btn.connect( + "clicked", + lambda _b: self._on_md_export_clicked( + state["front_matter"], state["open_after"], file_path, dialog + ), + ) + content_box.append(export_btn) + + toolbar_view.set_content(content_box) + dialog.set_child(toolbar_view) + dialog.present(self.window) + + def _update_md_setting(self, attr: str, value: bool, state: dict, key: str) -> None: + """Persist a Markdown export setting and update local state.""" + state[key] = value + settings = self.window.settings + setattr(settings, attr, value) + # Persist if the settings object supports it (graceful no-op otherwise). 
+ config = getattr(settings, "_config", None) + if config is not None and hasattr(config, "save"): + config.save() + + def _on_md_export_clicked( + self, + include_front_matter: bool, + open_after: bool, + file_path: str, + options_dialog: Adw.Dialog, + ) -> None: + """Handle the Export button click for Markdown.""" + self._md_include_front_matter = include_front_matter + self._md_open_after = open_after + options_dialog.force_close() + self._show_markdown_file_dialog(file_path) + + def _show_markdown_file_dialog(self, file_path: str) -> None: + """Show file save dialog for Markdown export.""" + from gi.repository import Gio + + save_dialog = Gtk.FileDialog.new() + save_dialog.set_title(_("Export to Markdown")) + save_dialog.set_modal(True) + + base_name = os.path.splitext(os.path.basename(file_path))[0] + save_dialog.set_initial_name(f"{base_name}.md") + + filters = Gio.ListStore.new(Gtk.FileFilter) + md_filter = Gtk.FileFilter() + md_filter.set_name(_("Markdown (*.md)")) + md_filter.add_pattern("*.md") + md_filter.add_pattern("*.markdown") + md_filter.add_mime_type("text/markdown") + filters.append(md_filter) + save_dialog.set_filters(filters) + save_dialog.set_default_filter(md_filter) + + save_dialog.save( + parent=self.window, + cancellable=None, + callback=lambda d, r: self._on_md_save_response(d, r, file_path), + ) + + def _on_md_save_response(self, dialog: Gtk.FileDialog, result, file_path: str) -> None: + """Handle the Markdown save dialog response.""" + try: + file = dialog.save_finish(result) + output_path = file.get_path() + if not output_path.lower().endswith((".md", ".markdown")): + output_path += ".md" + self._export_markdown_file(output_path, file_path) + except Exception as e: + if not self._is_user_dismissed(e): + logger.error(f"Error exporting to Markdown: {e}") + self.window.show_toast(_EXPORT_FAILED_MSG) + + def _export_markdown_file(self, output_path: str, file_path: str) -> None: + """Convert PDF to Markdown in a background thread. 
+ + Mirrors the ODF flow: a cancellable progress dialog stays on screen + until the conversion finishes (or the user cancels) so large PDFs + don't appear to freeze the app. + """ + import threading + + from gi.repository import GLib + + from bigocrpdf.utils.odf_builder import ExportCancelled + + include_fm = getattr(self, "_md_include_front_matter", False) + loading_dialog, _update, cancel_event = self._build_progress_dialog( + _("Exporting to Markdown…"), + os.path.basename(file_path), + ) + + def _do_export() -> None: + from bigocrpdf.utils.tsv_odf_converter import convert_pdf_to_markdown + + success = False + cancelled = False + try: + text = convert_pdf_to_markdown( + file_path, + include_front_matter=include_fm, + cancel_event=cancel_event, + ) + os.makedirs(os.path.dirname(os.path.abspath(output_path)), exist_ok=True) + with open(output_path, "w", encoding="utf-8") as fh: + fh.write(text) + success = True + except ExportCancelled: + cancelled = True + logger.info("Markdown export cancelled by user") + except Exception as e: + logger.error(f"Markdown conversion failed: {e}") + + if not success and os.path.exists(output_path): + try: + os.remove(output_path) + except OSError: + pass + + GLib.idle_add( + self._on_md_export_finished, loading_dialog, success, cancelled, output_path + ) + + threading.Thread(target=_do_export, daemon=True).start() + + def _on_md_export_finished( + self, + dialog: Adw.Dialog, + success: bool, + cancelled: bool, + output_path: str, + ) -> bool: + """Report Markdown export result on the main thread.""" + dialog.force_close() + if cancelled: + self.window.show_toast(_("Export cancelled")) + elif success: + self.window.show_toast(_("Exported to {}").format(os.path.basename(output_path))) + if getattr(self, "_md_open_after", False): + from bigocrpdf.utils.pdf_utils import open_file_with_default_app + + open_file_with_default_app(output_path) + else: + self.window.show_toast(_EXPORT_FAILED_MSG) + return False + + # ── Bulk export 
──────────────────────────────────────────────────── + + def _create_bulk_export_menu_button(self) -> Gtk.MenuButton: + """Build the export menu shown inside the selection action bar. + + Uses ``Gio.Menu`` + ``Gtk.PopoverMenu`` for native keyboard nav + and accessibility — matches the per-row export button's pattern. + """ + from gi.repository import Gio + + menu_model = Gio.Menu() + menu_model.append(_("OpenDocument (.odt)"), "bulk.odt") + menu_model.append(_("Markdown (.md)"), "bulk.md") + + button = Gtk.MenuButton() + button.set_icon_name("document-save-as-symbolic") + button.set_tooltip_text(_("Export selected files")) + button.add_css_class("suggested-action") + button.set_sensitive(False) + button.set_menu_model(menu_model) + + group = Gio.SimpleActionGroup() + odt_action = Gio.SimpleAction.new("odt", None) + odt_action.connect("activate", lambda *_a: self._bulk_export_selected("odf")) + group.add_action(odt_action) + md_action = Gio.SimpleAction.new("md", None) + md_action.connect("activate", lambda *_a: self._bulk_export_selected("md")) + group.add_action(md_action) + button.insert_action_group("bulk", group) + return button + + def _bulk_export_selected(self, fmt: str) -> None: + """Capture the current selection and pick a destination folder.""" + files = sorted(self._selected_files) + if not files: + return + + from gi.repository import Gio + + dialog = Gtk.FileDialog.new() + dialog.set_title(_("Choose destination folder")) + dialog.set_modal(True) + + def _on_folder_chosen(d: Gtk.FileDialog, result: Gio.AsyncResult) -> None: + try: + folder = d.select_folder_finish(result) + except Exception as e: + if not self._is_user_dismissed(e): + logger.error(f"Folder picker failed: {e}") + return + folder_path = folder.get_path() + if folder_path: + self._run_bulk_export(files, folder_path, fmt) + + dialog.select_folder(parent=self.window, cancellable=None, callback=_on_folder_chosen) + + def _run_bulk_export(self, files: list[str], dest_folder: str, fmt: str) -> 
None: + """Bulk export entry point — validates the destination and spawns the worker.""" + import threading + + # Cheap early checks so the user gets a clear error instead of + # discovering after every file fails individually. + if not os.path.isdir(dest_folder): + self.window.show_toast(_("Destination folder not found")) + return + if not os.access(dest_folder, os.W_OK): + self.window.show_toast(_("Destination folder is not writable")) + return + + total = len(files) + loading_dialog, update_progress, cancel_event = self._build_progress_dialog( + _("Exporting selected files…"), + f"0/{total}", + total=total, + ) + + threading.Thread( + target=self._bulk_export_worker, + args=(files, dest_folder, fmt, cancel_event, update_progress, loading_dialog), + daemon=True, + ).start() + + _BULK_EXTENSIONS = {"md": ".md", "odf": ".odt"} + + @staticmethod + def _safe_remove(path: str) -> None: + """Best-effort removal of a partial output file.""" + if os.path.exists(path): + try: + os.remove(path) + except OSError: + pass + + def _bulk_convert_one(self, pdf_path: str, out_path: str, fmt: str, cancel_event) -> None: + """Convert *pdf_path* into *out_path* using the requested *fmt*. + + Raises ``ExportCancelled`` if the user cancels mid-file, or any other + converter exception on failure — the caller is responsible for + recording the outcome and cleaning up the partial file. 
+ """ + if fmt == "md": + from bigocrpdf.utils.tsv_odf_converter import convert_pdf_to_markdown + + include_fm = getattr(self.window.settings, "md_include_front_matter", False) + text = convert_pdf_to_markdown( + pdf_path, + include_front_matter=include_fm, + cancel_event=cancel_event, + ) + with open(out_path, "w", encoding="utf-8") as fh: + fh.write(text) + return + + from bigocrpdf.utils.tsv_odf_converter import convert_pdf_to_odf + + include_images = getattr(self.window.settings, "odf_include_images", True) + convert_pdf_to_odf( + pdf_path, + out_path, + include_images=include_images, + cancel_event=cancel_event, + ) + + def _bulk_export_worker( + self, + files: list[str], + dest_folder: str, + fmt: str, + cancel_event, + update_progress, + loading_dialog: Adw.Dialog, + ) -> None: + """Thread body: convert each file and report aggregate results.""" + from gi.repository import GLib + + from bigocrpdf.utils.odf_builder import ExportCancelled + + ext = self._BULK_EXTENSIONS.get(fmt, ".md") + results: dict = {"ok": 0, "failed": [], "saved_paths": []} + + for idx, pdf_path in enumerate(files, start=1): + if cancel_event.is_set(): + break + + basename = os.path.splitext(os.path.basename(pdf_path))[0] + ext + out_path = self._unique_path(os.path.join(dest_folder, basename)) + GLib.idle_add(update_progress, idx, os.path.basename(out_path)) + + try: + self._bulk_convert_one(pdf_path, out_path, fmt, cancel_event) + except ExportCancelled: + # User pressed Cancel mid-file — bail out without recording + # this file as a failure, and clean up the partial output. 
+ self._safe_remove(out_path) + break + except Exception: + logger.exception("Bulk export failed for %s", pdf_path) + results["failed"].append(os.path.basename(pdf_path)) + self._safe_remove(out_path) + else: + results["ok"] += 1 + results["saved_paths"].append(out_path) + + GLib.idle_add( + self._on_bulk_export_finished, + loading_dialog, + results, + cancel_event.is_set(), + dest_folder, + ) + + def _on_bulk_export_finished( + self, + dialog: Adw.Dialog, + results: dict, + cancelled: bool, + dest_folder: str, + ) -> bool: + """Close the progress dialog and report the outcome to the user.""" + dialog.force_close() + ok = results["ok"] + failed_count = len(results["failed"]) + folder_name = os.path.basename(dest_folder) or dest_folder + + if cancelled: + self.window.show_toast( + _("Cancelled — saved {ok} of {total}").format(ok=ok, total=ok + failed_count) + ) + elif failed_count: + self.window.show_toast(_("Saved {ok}; {n} failed").format(ok=ok, n=failed_count)) + else: + self.window.show_toast( + _("Saved {ok} files to {folder}").format(ok=ok, folder=folder_name) + ) + return False def _open_file(self, file_path: str) -> None: """Open a file using the default application. diff --git a/src/bigocrpdf/ui/conclusion_page_builder.py b/src/bigocrpdf/ui/conclusion_page_builder.py index b600441..8277020 100644 --- a/src/bigocrpdf/ui/conclusion_page_builder.py +++ b/src/bigocrpdf/ui/conclusion_page_builder.py @@ -248,20 +248,67 @@ def _create_files_card(self) -> Gtk.Box: files_card.set_margin_top(16) files_card.set_margin_bottom(16) - # Card header + # Header row: title on the left, selection toggle on the right. 
+ header_row = Gtk.Box(orientation=Gtk.Orientation.HORIZONTAL, spacing=8) + header_row.set_margin_top(16) + header_row.set_margin_start(16) + header_row.set_margin_end(16) + files_header = Gtk.Label(label=_("Generated Files")) files_header.add_css_class("heading") files_header.set_halign(Gtk.Align.START) - files_header.set_margin_top(16) - files_header.set_margin_start(16) - files_card.append(files_header) + files_header.set_hexpand(True) + header_row.append(files_header) + + self._selection_toggle_btn = Gtk.ToggleButton() + self._selection_toggle_btn.set_icon_name("object-select-symbolic") + self._selection_toggle_btn.set_tooltip_text(_("Select files for bulk actions")) + self._selection_toggle_btn.add_css_class("flat") + self._selection_toggle_btn.connect("toggled", self._on_selection_toggle_clicked) + header_row.append(self._selection_toggle_btn) + + files_card.append(header_row) # Create scrollable file list scrolled_list = self._create_scrollable_file_list() files_card.append(scrolled_list) + # Bulk-action bar (hidden until selection mode is on). 
+ self._selection_action_bar = self._create_selection_action_bar() + self._selection_action_bar.set_visible(False) + files_card.append(self._selection_action_bar) + return files_card + def _create_selection_action_bar(self) -> Gtk.Box: + """Build the bulk-action bar shown at the bottom of the files card.""" + bar = Gtk.Box(orientation=Gtk.Orientation.HORIZONTAL, spacing=8) + bar.set_margin_start(16) + bar.set_margin_end(16) + bar.set_margin_bottom(16) + + self._selection_count_label = Gtk.Label(label=_("Selected: 0")) + self._selection_count_label.set_halign(Gtk.Align.START) + self._selection_count_label.set_hexpand(True) + self._selection_count_label.add_css_class("dim-label") + bar.append(self._selection_count_label) + + select_all_btn = Gtk.Button(label=_("Select all")) + select_all_btn.add_css_class("flat") + select_all_btn.connect("clicked", lambda _b: self._on_select_all_clicked()) + bar.append(select_all_btn) + + clear_btn = Gtk.Button(label=_("Clear")) + clear_btn.add_css_class("flat") + clear_btn.connect("clicked", lambda _b: self._on_clear_selection_clicked()) + bar.append(clear_btn) + + # The export menu builds itself when there's a selection to act on. + self._bulk_export_button = self._create_bulk_export_menu_button() + bar.append(self._bulk_export_button) + + return bar + def _create_scrollable_file_list(self) -> Gtk.ScrolledWindow: """Create the scrollable file list diff --git a/src/bigocrpdf/ui/conclusion_ui_mixin.py b/src/bigocrpdf/ui/conclusion_ui_mixin.py index 504d263..6d29f34 100644 --- a/src/bigocrpdf/ui/conclusion_ui_mixin.py +++ b/src/bigocrpdf/ui/conclusion_ui_mixin.py @@ -41,6 +41,14 @@ def __init__(self, window: "BigOcrPdfWindow"): self.result_size_change = None self.output_list_box = None + # Bulk-selection state (set by the page builder). 
+ self._selection_toggle_btn: Gtk.ToggleButton | None = None + self._selection_action_bar: Gtk.Box | None = None + self._selection_count_label: Gtk.Label | None = None + self._bulk_export_button: Gtk.MenuButton | None = None + self._selection_mode: bool = False + self._selected_files: set[str] = set() + def update_conclusion_page(self) -> None: """Update the conclusion page with results from OCR processing""" if not self._validate_components(): @@ -277,16 +285,63 @@ def _create_file_row( # Add file statistics self._add_file_statistics_to_row(row, pages, file_size, comparison) - # Add file icon - file_icon = Gtk.Image.new_from_icon_name("x-office-document-symbolic") - row.add_prefix(file_icon) - - # Add action buttons - button_container = self._create_file_action_buttons(output_file) - row.add_suffix(button_container) + if self._selection_mode: + # In selection mode the row is dedicated to picking files; per-row + # action buttons would only get in the way. + check = Gtk.CheckButton() + check.set_active(output_file in self._selected_files) + check.connect("toggled", self._on_row_check_toggled, output_file) + row.add_prefix(check) + else: + file_icon = Gtk.Image.new_from_icon_name("x-office-document-symbolic") + row.add_prefix(file_icon) + button_container = self._create_file_action_buttons(output_file) + row.add_suffix(button_container) return row + # ── Selection mode ──────────────────────────────────────────────── + + def _on_selection_toggle_clicked(self, button: Gtk.ToggleButton) -> None: + """Toggle selection mode on/off and rebuild the file list accordingly.""" + self._selection_mode = button.get_active() + if not self._selection_mode: + self._selected_files.clear() + if self._selection_action_bar is not None: + self._selection_action_bar.set_visible(self._selection_mode) + self._refresh_selection_ui() + self._update_file_list() + + def _on_row_check_toggled(self, check: Gtk.CheckButton, file_path: str) -> None: + """Track selection set as individual rows are 
toggled.""" + if check.get_active(): + self._selected_files.add(file_path) + else: + self._selected_files.discard(file_path) + self._refresh_selection_ui() + + def _on_select_all_clicked(self) -> None: + """Mark every visible file as selected.""" + for output_file in self.window.settings.processed_files: + if os.path.exists(output_file) and self._is_recent_file(output_file): + self._selected_files.add(output_file) + self._update_file_list() + self._refresh_selection_ui() + + def _on_clear_selection_clicked(self) -> None: + """Drop all selections without leaving selection mode.""" + self._selected_files.clear() + self._update_file_list() + self._refresh_selection_ui() + + def _refresh_selection_ui(self) -> None: + """Sync the action bar label and bulk-export button sensitivity.""" + if self._selection_count_label is not None: + count = len(self._selected_files) + self._selection_count_label.set_text(_("Selected: {count}").format(count=count)) + if self._bulk_export_button is not None: + self._bulk_export_button.set_sensitive(bool(self._selected_files)) + def _add_file_statistics_to_row( self, row: Adw.ActionRow, @@ -354,9 +409,9 @@ def _create_file_action_buttons(self, output_file: str) -> Gtk.Box: text_button = self._create_text_button(output_file) button_container.append(text_button) - # Add export to ODF button - odf_button = self._create_odf_button(output_file) - button_container.append(odf_button) + # Add unified export menu (ODF + Markdown + future formats) + export_button = self._create_export_menu_button(output_file) + button_container.append(export_button) return button_container @@ -393,26 +448,38 @@ def _create_text_button(self, output_file: str) -> Gtk.Button: on_click=lambda: self._show_extracted_text(output_file), ) - def _create_odf_button(self, output_file: str) -> Gtk.Button: - """Create an export to ODF button - - Args: - output_file: Path to the file + def _export_to_odf(self, file_path: str) -> None: + """Export extracted text to ODF file.""" + 
self._show_odf_export_options_dialog(file_path) - Returns: - A Gtk.Button for exporting to ODF - """ - return create_icon_button( - icon_name="x-office-document-symbolic", - tooltip=_("Save as a document for LibreOffice"), - on_click=lambda: self._export_to_odf(output_file), - ) + def _export_to_markdown(self, file_path: str) -> None: + """Show export options dialog for Markdown export.""" + self._show_markdown_export_options_dialog(file_path) - def _export_to_odf(self, file_path: str) -> None: - """Export extracted text to ODF file + def _create_export_menu_button(self, output_file: str) -> Gtk.MenuButton: + """Unified export menu for a single OCR'd file. - Args: - file_path: Path to the PDF file + Backed by ``Gio.Menu`` + a popover-menu so keyboard navigation + (Up/Down/Enter) and screen-reader semantics come for free. """ - # Show export options dialog first - self._show_odf_export_options_dialog(file_path) + from gi.repository import Gio + + menu_model = Gio.Menu() + menu_model.append(_("OpenDocument (.odt)"), "row.odt") + menu_model.append(_("Markdown (.md)"), "row.md") + + button = Gtk.MenuButton() + button.set_icon_name("document-save-as-symbolic") + button.set_tooltip_text(_("Export to other formats")) + button.add_css_class("flat") + button.set_menu_model(menu_model) + + group = Gio.SimpleActionGroup() + odt_action = Gio.SimpleAction.new("odt", None) + odt_action.connect("activate", lambda *_a: self._export_to_odf(output_file)) + group.add_action(odt_action) + md_action = Gio.SimpleAction.new("md", None) + md_action.connect("activate", lambda *_a: self._export_to_markdown(output_file)) + group.add_action(md_action) + button.insert_action_group("row", group) + return button diff --git a/src/bigocrpdf/utils/tsv_odf_converter.py b/src/bigocrpdf/utils/tsv_odf_converter.py index 376a426..ec7742e 100644 --- a/src/bigocrpdf/utils/tsv_odf_converter.py +++ b/src/bigocrpdf/utils/tsv_odf_converter.py @@ -389,3 +389,194 @@ def convert_pdf_to_text(pdf_path: str) -> 
str: all_elements = fix_cross_page_breaks(all_elements) return create_text(all_elements) + + +# ── Markdown Generation ── + +# Inline characters that always need escaping. Line-start punctuation like +# '#', '-', '+', '>' and 'N.' lists is handled separately so we don't uglify +# mid-paragraph text (e.g. CPF/phone numbers full of hyphens). +_MD_INLINE_ESCAPE_RE = re.compile(r"([\\`*_\[\]<>|])") +_MD_LINE_START_RE = re.compile(r"^([#\-+>]|\d+\.)") + + +def _escape_md(text: str) -> str: + """Escape Markdown control characters in inline text. + + Escapes characters that have meaning anywhere in a line (``*``, ``_``, + ``[``, ``]``, ``<``, ``>``, ``|``, backticks, backslashes) and, only at + the start of the string, the block-level markers (``#``, ``-``, ``+``, + ``>`` and ordered-list ``N.``). + """ + escaped = _MD_INLINE_ESCAPE_RE.sub(r"\\\1", text) + return _MD_LINE_START_RE.sub(r"\\\1", escaped) + + +def _escape_md_cell(text: str) -> str: + """Escape inline Markdown specials inside a table cell. + + Same rules as :func:`_escape_md` minus the line-start markers (cells are + rendered inline, not at the start of a block) plus an explicit ``|`` + escape so the cell does not break the table. 
+ """ + return _MD_INLINE_ESCAPE_RE.sub(r"\\\1", text) + + +def _format_table_markdown(rows: list[list[str]]) -> list[str]: + """Format table rows as a GitHub-flavored Markdown pipe table.""" + if not rows: + return [] + n_cols = max(len(r) for r in rows) + + def _cell(value: str) -> str: + return _escape_md_cell(value).strip() or " " + + out: list[str] = [] + header = rows[0] + out.append( + "| " + " | ".join(_cell(header[j] if j < len(header) else "") for j in range(n_cols)) + " |" + ) + out.append("|" + "|".join(["---"] * n_cols) + "|") + for row in rows[1:]: + out.append( + "| " + " | ".join(_cell(row[j] if j < len(row) else "") for j in range(n_cols)) + " |" + ) + return out + + +def _yaml_escape(value: str) -> str: + """Escape a value for safe inclusion in a single-line YAML scalar.""" + # Strip control characters (including NUL, newlines, tabs) — they can't + # appear in a single-line YAML scalar regardless of quoting. + sanitized = "".join(c for c in value if ord(c) >= 0x20 or c == " ") + return sanitized.replace("\\", "\\\\").replace('"', '\\"') + + +def _build_front_matter(pdf_path: str, page_count: int) -> list[str]: + """Build YAML front-matter lines for a Markdown export.""" + import datetime + import os + + title = os.path.splitext(os.path.basename(pdf_path))[0] + # UTC so the date doesn't drift across timezones (and tests stay stable). 
_MD_HEADING_PREFIX = {"heading1": "# ", "heading2": "## ", "heading3": "### "}


def _ensure_blank_line(lines: list[str]) -> None:
    """Append a blank line unless the previous one already is blank.

    Does nothing when ``lines`` is empty, so output never starts with a blank.
    """
    if lines and lines[-1] != "":
        lines.append("")


def _emit_heading(lines: list[str], elem: DocElement) -> None:
    """Append an ATX heading for ``elem``, surrounded by blank lines.

    The text follows the ``#`` prefix, so it is never at the start of a line;
    only inline escaping is applied. Running the line-start rules here would
    inject a stray backslash into numbered headings such as "1. Introduction".
    """
    _ensure_blank_line(lines)
    lines.append(_MD_HEADING_PREFIX[elem.kind] + _escape_md_cell(elem.text.strip()))
    lines.append("")


def _emit_table(lines: list[str], elem: DocElement) -> None:
    """Append ``elem.rows`` as a pipe table, surrounded by blank lines."""
    _ensure_blank_line(lines)
    lines.extend(_format_table_markdown(elem.rows))
    lines.append("")


def _emit_kv(lines: list[str], elem: DocElement) -> None:
    """Append a key/value line with the key (text before the first colon) in bold.

    Text without a colon is emitted as a plain escaped line. Key and value sit
    after the opening ``**``, i.e. never at the start of the line, so they get
    inline escaping only.
    """
    # NOTE(review): no blank line or hard break is added between consecutive
    # kv lines, so a strict CommonMark renderer flows them into one paragraph.
    # Confirm whether that is the intended rendering.
    text = elem.text.strip()
    key, sep, value = text.partition(":")
    if not sep:
        lines.append(_escape_md(text))
        return
    lines.append(f"**{_escape_md_cell(key.strip())}:** {_escape_md_cell(value.strip())}")


def _emit_paragraph(lines: list[str], elem: DocElement) -> None:
    """Paragraph variants (paragraph, paragraph_indent, paragraph_right, …)."""
    _ensure_blank_line(lines)
    lines.append(_escape_md(elem.text.strip()))
    lines.append("")


def _emit_element(lines: list[str], elem: DocElement) -> None:
    """Dispatch a single DocElement to the appropriate Markdown emitter.

    Any kind that is not a heading, ``table`` or ``kv`` is emitted as a
    paragraph.
    """
    if elem.kind in _MD_HEADING_PREFIX:
        _emit_heading(lines, elem)
    elif elem.kind == "table":
        _emit_table(lines, elem)
    elif elem.kind == "kv":
        _emit_kv(lines, elem)
    else:
        _emit_paragraph(lines, elem)


def create_markdown(pages_elements: list[list[DocElement]]) -> str:
    """Generate Markdown preserving document structure (headings, tables, paragraphs).

    Args:
        pages_elements: One list of elements per page, in page order.

    Returns:
        The Markdown text, always terminated by exactly one newline. Pages
        after the first are introduced by a ``---`` thematic break; empty
        input yields a single newline.
    """
    lines: list[str] = []

    for page_idx, elements in enumerate(pages_elements):
        if page_idx > 0:
            # The blank line matters: "---" directly under a text line would
            # be read as a setext heading underline instead of a rule.
            _ensure_blank_line(lines)
            lines.append("---")
            lines.append("")
        for elem in elements:
            _emit_element(lines, elem)

    # Drop trailing blank lines so the result ends with a single newline.
    while lines and lines[-1] == "":
        lines.pop()

    return "\n".join(lines) + "\n"


def convert_pdf_to_markdown(
    pdf_path: str,
    include_front_matter: bool = False,
    cancel_event: "threading.Event | None" = None,
) -> str:
    """Convert an OCR'd PDF to a structured Markdown document.

    Args:
        pdf_path: Path to a PDF containing a text layer (post-OCR).
        include_front_matter: If True, prepend YAML front-matter with title,
            source path, page count and date — handy for ingesting into
            Obsidian/Hugo or as LLM context.
        cancel_event: Optional event polled before parsing, after parsing and
            between pages so long batches stay responsive to a cancel button.

    Returns:
        The Markdown text. When the PDF yields no words this is an empty
        string, or just the front-matter (page count 0) if it was requested.

    Raises:
        ExportCancelled: If ``cancel_event`` is set at any of the poll points.
    """
    from bigocrpdf.utils.odf_builder import ExportCancelled

    def _check_cancelled() -> None:
        if cancel_event is not None and cancel_event.is_set():
            raise ExportCancelled

    # Poll before the parse too: no point extracting text for an export the
    # user has already cancelled.
    _check_cancelled()
    pages_words = parse_tsv_pages(pdf_path)
    _check_cancelled()
    if not pages_words:
        if include_front_matter:
            return "\n".join(_build_front_matter(pdf_path, 0)) + "\n"
        return ""

    all_elements: list[list[DocElement]] = []
    for page_num in sorted(pages_words.keys()):
        _check_cancelled()
        all_elements.append(process_page(pages_words[page_num], page_num))

    all_elements = fix_cross_page_breaks(all_elements)

    body = create_markdown(all_elements)
    if include_front_matter:
        return "\n".join(_build_front_matter(pdf_path, len(all_elements))) + body
    return body
import tempfile
import threading
from unittest.mock import patch

import pytest

from bigocrpdf.utils.odf_builder import ExportCancelled
from bigocrpdf.utils.tsv_odf_converter import (
    _escape_md,
    _escape_md_cell,
    _format_table_markdown,
    _yaml_escape,
    convert_pdf_to_markdown,
    create_markdown,
)
from bigocrpdf.utils.tsv_parser import DocElement


def _mock_pdf_path(name: str) -> str:
    """Build a temp-dir path used as an opaque PDF identifier in mocked tests.

    The file is never actually read or written — ``parse_tsv_pages`` is
    mocked — but using the system temp dir instead of a hardcoded ``/tmp``
    keeps static analyzers (and Windows CI) happy.
    """
    return os.path.join(tempfile.gettempdir(), name)


class TestEscapeMd:
    """Inline and line-start escaping performed by ``_escape_md``."""

    def test_escapes_inline_special_chars(self):
        assert _escape_md("a*b_c") == r"a\*b\_c"
        assert _escape_md("[link]") == r"\[link\]"

    def test_passthrough_plain_text(self):
        assert _escape_md("simple words") == "simple words"

    def test_escapes_pipes_and_backticks(self):
        assert _escape_md("`code`") == r"\`code\`"
        assert _escape_md("a|b") == r"a\|b"

    def test_does_not_escape_inline_hyphens(self):
        # CPF/phone numbers should stay readable
        assert _escape_md("CPF 047.986.575-22") == "CPF 047.986.575-22"
        assert _escape_md("(71) 99251-8877") == "(71) 99251-8877"

    def test_escapes_block_markers_only_at_line_start(self):
        assert _escape_md("# heading") == r"\# heading"
        assert _escape_md("- bullet") == r"\- bullet"
        assert _escape_md("> quote") == r"\> quote"
        # NOTE(review): CommonMark only treats a backslash as an escape when
        # it precedes ASCII punctuation, so "\1." would render a literal
        # backslash; "1\." looks like the renderer-safe form. Confirm and
        # keep this expectation in step with _escape_md.
        assert _escape_md("1. item") == r"\1. item"
        # Mid-line '#' stays untouched
        assert _escape_md("see #issue") == "see #issue"

    def test_asterisk_at_line_start_inline_escaped(self):
        # '*' is inline-escaped first; it isn't a block-level marker in the
        # line-start regex, so the inline rule is what kicks in.
        assert _escape_md("*emphasized* mid") == r"\*emphasized\* mid"


class TestEscapeMdCell:
    """Table cells need inline escapes but no line-start rules."""

    def test_escapes_pipe(self):
        assert _escape_md_cell("a|b") == r"a\|b"

    def test_escapes_emphasis_markers(self):
        assert _escape_md_cell("**bold**") == r"\*\*bold\*\*"
        assert _escape_md_cell("[link](x)") == r"\[link\](x)"

    def test_does_not_apply_line_start_rules(self):
        # A cell starting with '-' would be wrongly turned into a list item
        # outside a table, but inside the pipe-table cell it's fine.
        assert _escape_md_cell("- safe in cell") == "- safe in cell"


class TestFormatTableMarkdown:
    """Pipe-table rendering: header, delimiter row, padding and escaping."""

    def test_basic_table(self):
        rows = [["Name", "Age"], ["Alice", "30"], ["Bob", "25"]]
        out = _format_table_markdown(rows)
        assert out[0] == "| Name | Age |"
        assert out[1] == "|---|---|"
        assert out[2] == "| Alice | 30 |"
        assert out[3] == "| Bob | 25 |"

    def test_empty_rows_returns_empty(self):
        assert _format_table_markdown([]) == []

    def test_escapes_pipe_inside_cell(self):
        rows = [["h"], ["a|b"]]
        out = _format_table_markdown(rows)
        assert "a\\|b" in out[2]

    def test_uneven_rows_padded(self):
        rows = [["a", "b", "c"], ["x"]]
        out = _format_table_markdown(rows)
        # 3 columns expected in every row
        assert out[0].count("|") == 4
        assert out[2].count("|") == 4

    def test_escapes_markdown_specials_inside_cells(self):
        rows = [["h1"], ["**bold** and *italic*"]]
        out = _format_table_markdown(rows)
        assert r"\*\*bold\*\* and \*italic\*" in out[2]


class TestYamlEscape:
    """Sanitising of values placed in double-quoted YAML scalars."""

    def test_quotes_and_backslashes(self):
        assert _yaml_escape('say "hi"') == r"say \"hi\""
        assert _yaml_escape(r"a\b") == r"a\\b"

    def test_strips_control_chars_including_newlines(self):
        # Newlines and other control chars would break a single-line scalar.
        assert _yaml_escape("a\nb\tc\x00d") == "abcd"
        # Regular ASCII space stays.
        assert _yaml_escape("a b") == "a b"


class TestCreateMarkdown:
    """Element-to-Markdown rendering through ``create_markdown``."""

    def test_headings_emit_hash_prefixes(self):
        pages = [
            [
                DocElement("heading1", "Title"),
                DocElement("heading2", "Section"),
                DocElement("heading3", "Sub"),
            ]
        ]
        md = create_markdown(pages)
        assert "# Title" in md
        assert "## Section" in md
        assert "### Sub" in md

    def test_paragraph_text_emitted(self):
        pages = [[DocElement("paragraph", "Hello world.")]]
        md = create_markdown(pages)
        assert "Hello world." in md

    def test_kv_bolds_key(self):
        pages = [[DocElement("kv", "Author: Jane Doe")]]
        md = create_markdown(pages)
        assert "**Author:**" in md
        assert "Jane Doe" in md

    def test_table_renders_as_pipe_table(self):
        pages = [[DocElement("table", rows=[["A", "B"], ["1", "2"]])]]
        md = create_markdown(pages)
        assert "| A | B |" in md
        assert "|---|---|" in md
        assert "| 1 | 2 |" in md

    def test_pages_separated_by_thematic_break(self):
        pages = [
            [DocElement("paragraph", "page one")],
            [DocElement("paragraph", "page two")],
        ]
        md = create_markdown(pages)
        assert "page one" in md
        assert "page two" in md
        assert "\n---\n" in md

    def test_empty_pages_safe(self):
        assert create_markdown([]) == "\n"

    def test_paragraph_special_chars_escaped(self):
        pages = [[DocElement("paragraph", "use _underscores_ and *stars*")]]
        md = create_markdown(pages)
        assert r"\_underscores\_" in md
        assert r"\*stars\*" in md


class TestConvertPdfToMarkdown:
    """``convert_pdf_to_markdown`` with the TSV parser (and page processor) mocked."""

    def test_returns_empty_when_no_text(self):
        with patch(
            "bigocrpdf.utils.tsv_odf_converter.parse_tsv_pages",
            return_value={},
        ):
            assert convert_pdf_to_markdown("/nonexistent.pdf") == ""

    def test_front_matter_when_no_text(self):
        with patch(
            "bigocrpdf.utils.tsv_odf_converter.parse_tsv_pages",
            return_value={},
        ):
            out = convert_pdf_to_markdown(_mock_pdf_path("my_doc.pdf"), include_front_matter=True)
        assert out.startswith("---\n")
        assert 'title: "my_doc"' in out
        assert "pages: 0" in out

    def test_front_matter_with_content(self):
        # Mock the parser + processor so we exercise the front-matter wrapping.
        fake_words = {1: ["w1"]}
        with (
            patch(
                "bigocrpdf.utils.tsv_odf_converter.parse_tsv_pages",
                return_value=fake_words,
            ),
            patch(
                "bigocrpdf.utils.tsv_odf_converter.process_page",
                return_value=[DocElement("paragraph", "body text")],
            ),
        ):
            out = convert_pdf_to_markdown(_mock_pdf_path("sample.pdf"), include_front_matter=True)
        assert out.startswith("---\n")
        assert 'title: "sample"' in out
        assert "pages: 1" in out
        assert "body text" in out

    def test_cli_export_md_writes_file(self):
        # End-to-end: write Markdown to disk via the public function.
        # NOTE(review): despite the name, this never calls the CLI handler;
        # it only round-trips the converter output through a temp file.
        fake_words = {1: ["w1"]}
        with (
            patch(
                "bigocrpdf.utils.tsv_odf_converter.parse_tsv_pages",
                return_value=fake_words,
            ),
            patch(
                "bigocrpdf.utils.tsv_odf_converter.process_page",
                return_value=[DocElement("heading1", "Hi")],
            ),
        ):
            text = convert_pdf_to_markdown(_mock_pdf_path("x.pdf"))

        with tempfile.TemporaryDirectory() as tmp:
            path = os.path.join(tmp, "out.md")
            with open(path, "w", encoding="utf-8") as f:
                f.write(text)
            with open(path, encoding="utf-8") as f:
                assert "# Hi" in f.read()

    def test_cancel_event_set_before_call_raises(self):
        event = threading.Event()
        event.set()
        fake_words = {1: ["w1"], 2: ["w2"]}
        with patch(
            "bigocrpdf.utils.tsv_odf_converter.parse_tsv_pages",
            return_value=fake_words,
        ):
            with pytest.raises(ExportCancelled):
                convert_pdf_to_markdown(_mock_pdf_path("x.pdf"), cancel_event=event)

    def test_cancel_event_unset_runs_normally(self):
        event = threading.Event()  # not set
        fake_words = {1: ["w1"]}
        with (
            patch(
                "bigocrpdf.utils.tsv_odf_converter.parse_tsv_pages",
                return_value=fake_words,
            ),
            patch(
                "bigocrpdf.utils.tsv_odf_converter.process_page",
                return_value=[DocElement("paragraph", "ok")],
            ),
        ):
            text = convert_pdf_to_markdown(_mock_pdf_path("x.pdf"), cancel_event=event)
        assert "ok" in text

    def test_front_matter_date_is_utc_iso(self):
        # Date should always be ISO yyyy-mm-dd, never None/empty, regardless of TZ.
        # NOTE(review): only the yyyy-mm-dd shape is asserted here; nothing
        # in this test checks that the date was taken in UTC.
        import re

        with patch(
            "bigocrpdf.utils.tsv_odf_converter.parse_tsv_pages",
            return_value={},
        ):
            out = convert_pdf_to_markdown(_mock_pdf_path("x.pdf"), include_front_matter=True)
        m = re.search(r"^date: (\d{4}-\d{2}-\d{2})$", out, re.MULTILINE)
        assert m is not None, out


class TestUniquePath:
    """``ConclusionExportMixin._unique_path``: pass-through and " (N)" suffixing."""

    def test_no_conflict_returns_input(self):
        from bigocrpdf.ui.conclusion_export_mixin import ConclusionExportMixin

        with tempfile.TemporaryDirectory() as tmp:
            target = os.path.join(tmp, "doc.md")
            assert ConclusionExportMixin._unique_path(target) == target

    def test_auto_suffix_on_conflict(self):
        from bigocrpdf.ui.conclusion_export_mixin import ConclusionExportMixin

        with tempfile.TemporaryDirectory() as tmp:
            target = os.path.join(tmp, "doc.md")
            # Create the file so the requested name is taken.
            open(target, "w").close()
            p1 = ConclusionExportMixin._unique_path(target)
            assert p1.endswith("doc (1).md")
            # Occupy the first suffix as well to force the next one.
            open(p1, "w").close()
            p2 = ConclusionExportMixin._unique_path(target)
            assert p2.endswith("doc (2).md")


class TestIsUserDismissed:
    """``ConclusionExportMixin._is_user_dismissed`` on dismissed vs. other errors."""

    def test_dismissed_message_detected(self):
        from bigocrpdf.ui.conclusion_export_mixin import ConclusionExportMixin

        assert ConclusionExportMixin._is_user_dismissed(RuntimeError("Dismissed by user"))

    def test_other_errors_pass_through(self):
        from bigocrpdf.ui.conclusion_export_mixin import ConclusionExportMixin

        assert not ConclusionExportMixin._is_user_dismissed(RuntimeError("Disk full"))
        assert not ConclusionExportMixin._is_user_dismissed(FileNotFoundError("nope"))