Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make ghostscript default backend and add support for string keyword arguments #253

Merged
merged 4 commits into from
Jul 11, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion HISTORY.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ master

**Improvements**

- Add pdftopng for image conversion and use ghostscript as fallback. [#198](https://github.com/camelot-dev/camelot/pull/198) by Vinayak Mehta.
- Add support for multiple image conversion backends. [#198](https://github.com/camelot-dev/camelot/pull/198) and [#253](https://github.com/camelot-dev/camelot/pull/253) by Vinayak Mehta.
- Add markdown export format. [#222](https://github.com/camelot-dev/camelot/pull/222/) by [Lucas Cimon](https://github.com/Lucas-C).

**Documentation**
Expand Down
4 changes: 2 additions & 2 deletions camelot/backends/ghostscript_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,8 @@ def installed(self):
def convert(self, pdf_path, png_path, resolution=300):
if not self.installed():
raise OSError(
"Ghostscript is not installed. Please install it using the instructions"
"here: https://camelot-py.readthedocs.io/en/master/user/install-deps.html"
"Ghostscript is not installed. You can install it using the instructions"
" here: https://camelot-py.readthedocs.io/en/master/user/install-deps.html"
)

import ghostscript
Expand Down
10 changes: 5 additions & 5 deletions camelot/backends/image_conversion.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,29 +3,29 @@
from .poppler_backend import PopplerBackend
from .ghostscript_backend import GhostscriptBackend

backends = {"poppler": PopplerBackend, "ghostscript": GhostscriptBackend}
BACKENDS = {"poppler": PopplerBackend, "ghostscript": GhostscriptBackend}


class ImageConversionBackend(object):
def __init__(self, backend="poppler", use_fallback=True):
if backend not in backends.keys():
if backend not in BACKENDS.keys():
raise ValueError(f"Image conversion backend '{backend}' not supported")

self.backend = backend
self.use_fallback = use_fallback
self.fallbacks = list(filter(lambda x: x != backend, backends.keys()))
self.fallbacks = list(filter(lambda x: x != backend, BACKENDS.keys()))

def convert(self, pdf_path, png_path):
try:
converter = backends[self.backend]()
converter = BACKENDS[self.backend]()
converter.convert(pdf_path, png_path)
except Exception as e:
import sys

if self.use_fallback:
for fallback in self.fallbacks:
try:
converter = backends[fallback]()
converter = BACKENDS[fallback]()
converter.convert(pdf_path, png_path)
except Exception as e:
raise type(e)(
Expand Down
2 changes: 1 addition & 1 deletion camelot/backends/poppler_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ def convert(self, pdf_path, png_path):
pdftopng_executable = shutil.which("pdftopng")
if pdftopng_executable is None:
raise OSError(
"pdftopng is not installed. Please install it using the `pip install pdftopng` command."
"pdftopng is not installed. You can install it using the 'pip install pdftopng' command."
)

pdftopng_command = [pdftopng_executable, pdf_path, png_path]
Expand Down
36 changes: 33 additions & 3 deletions camelot/parsers/lattice.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
find_contours,
find_joints,
)
from ..backends import ImageConversionBackend
from ..backends.image_conversion import BACKENDS


logger = logging.getLogger("camelot")
Expand Down Expand Up @@ -111,7 +111,7 @@ def __init__(
threshold_constant=-2,
iterations=0,
resolution=300,
backend=ImageConversionBackend(),
backend="ghostscript",
**kwargs,
):
self.table_regions = table_regions
Expand All @@ -129,7 +129,37 @@ def __init__(
self.threshold_constant = threshold_constant
self.iterations = iterations
self.resolution = resolution
self.backend = backend
self.backend = Lattice._get_backend(backend)

@staticmethod
def _get_backend(backend):
    """Resolve *backend* into an image conversion backend object.

    Parameters
    ----------
    backend : str or object
        Either the name of a built-in backend (a key of ``BACKENDS``) or
        a custom object that exposes a callable ``convert`` attribute.

    Returns
    -------
    object
        A new instance of the named built-in backend, or *backend*
        itself when a custom object was supplied.

    Raises
    ------
    NotImplementedError
        If *backend* is a string that is not a key of ``BACKENDS``, or
        an object without a callable ``convert`` attribute.

    """
    if isinstance(backend, str):
        if backend not in BACKENDS:
            raise NotImplementedError(
                f"Unknown backend '{backend}' specified. Please use either 'poppler' or 'ghostscript'."
            )

        # The warning is emitted whenever 'ghostscript' is selected by name,
        # which includes the case where the caller relied on the default.
        if backend == "ghostscript":
            warnings.warn(
                "'ghostscript' will be replaced by 'poppler' as the default image conversion"
                " backend in v0.12.0. You can try out 'poppler' with backend='poppler'.",
                DeprecationWarning,
            )

        return BACKENDS[backend]()

    # Custom backend: require an attribute that can actually be called.
    # Checking only for the name (e.g. via dir()) would accept a plain data
    # attribute called 'convert' and defer the failure to conversion time.
    if not callable(getattr(backend, "convert", None)):
        raise NotImplementedError(f"'{backend}' must implement a 'convert' method")

    return backend

@staticmethod
def _reduce_index(t, idx, shift_text):
Expand Down
19 changes: 8 additions & 11 deletions docs/user/advanced.rst
Original file line number Diff line number Diff line change
Expand Up @@ -629,7 +629,14 @@ To deal with such cases, you can tweak PDFMiner's `LAParams kwargs <https://gith
Use alternate image conversion backends
---------------------------------------

When using the :ref:`Lattice <lattice>` flavor, Camelot uses `pdftopng <https://github.com/vinayak-mehta/pdftopng>`_ to convert PDF pages to images for line recognition. This should work out of the box on most operating systems. However, if you get an error, you can supply your own image conversion backend to Camelot::
When using the :ref:`Lattice <lattice>` flavor, Camelot uses ``ghostscript`` to convert PDF pages to images for line recognition. If you face installation issues with ``ghostscript``, you can use an alternate image conversion backend called ``poppler``. You can specify which image conversion backend you want to use with::

>>> tables = camelot.read_pdf(filename, backend="ghostscript") # default
>>> tables = camelot.read_pdf(filename, backend="poppler")

.. note:: ``poppler`` will be made the default image conversion backend (replacing ``ghostscript``) in ``v0.12.0``.

If you face issues with both ``ghostscript`` and ``poppler``, you can supply your own image conversion backend::

>>> class ConversionBackend(object):
>>> def convert(self, pdf_path, png_path):
Expand All @@ -639,13 +646,3 @@ When using the :ref:`Lattice <lattice>` flavor, Camelot uses `pdftopng <https://
>>> pass
>>>
>>> tables = camelot.read_pdf(filename, backend=ConversionBackend())

.. note:: If image conversion using ``pdftopng`` fails, Camelot falls back to ``ghostscript`` to try image conversion again, and if that fails, it raises an error.

In case you want to be explicit about the image conversion backend that Camelot should use, you can supply them like this::

>>> from camelot.backends.poppler_backend import PopplerBackend
>>> from camelot.backends.ghostscript_backend import GhostscriptBackend
>>>
>>> tables = camelot.read_pdf(filename, backend=PopplerBackend())
>>> tables = camelot.read_pdf(filename, backend=GhostscriptBackend())
10 changes: 9 additions & 1 deletion tests/test_cli.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
# -*- coding: utf-8 -*-

import os
import sys

import pytest
from click.testing import CliRunner

from camelot.cli import cli
Expand All @@ -11,6 +13,11 @@
testdir = os.path.dirname(os.path.abspath(__file__))
testdir = os.path.join(testdir, "files")

# Marker for tests that need Ghostscript; they are skipped when running on
# Windows, where the test environment does not provide it.
_running_on_windows = sys.platform.startswith("win")
skip_on_windows = pytest.mark.skipif(
    _running_on_windows,
    reason="Ghostscript not installed in Windows test environment",
)


def test_help_output():
runner = CliRunner()
Expand All @@ -26,6 +33,7 @@ def test_help_output():
)


@skip_on_windows
def test_cli_lattice():
with TemporaryDirectory() as tempdir:
infile = os.path.join(testdir, "foo.pdf")
Expand All @@ -35,7 +43,7 @@ def test_cli_lattice():
cli, ["--format", "csv", "--output", outfile, "lattice", infile]
)
assert result.exit_code == 0
assert result.output == "Found 1 tables\n"
assert "Found 1 tables" in result.output

result = runner.invoke(cli, ["--format", "csv", "lattice", infile])
output_error = "Error: Please specify output file path using --output"
Expand Down
19 changes: 10 additions & 9 deletions tests/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import os
import sys

import pytest
import pandas as pd
from pandas.testing import assert_frame_equal

Expand All @@ -16,6 +17,11 @@
testdir = os.path.dirname(os.path.abspath(__file__))
testdir = os.path.join(testdir, "files")

# Marker for tests that need Ghostscript; they are skipped when running on
# Windows, where the test environment does not provide it.
_running_on_windows = sys.platform.startswith("win")
skip_on_windows = pytest.mark.skipif(
    _running_on_windows,
    reason="Ghostscript not installed in Windows test environment",
)


def test_version_generation():
version = (0, 7, 3)
Expand All @@ -32,6 +38,7 @@ def test_version_generation_with_prerelease_revision():
)


@skip_on_windows
def test_parsing_report():
parsing_report = {"accuracy": 99.02, "whitespace": 12.24, "order": 1, "page": 1}

Expand Down Expand Up @@ -61,10 +68,8 @@ def test_repr_poppler():
assert repr(tables[0].cells[0][0]) == "<Cell x1=120 y1=219 x2=165 y2=234>"


@skip_on_windows
def test_repr_ghostscript():
if sys.platform not in ["linux", "darwin"]:
return True

filename = os.path.join(testdir, "foo.pdf")
tables = camelot.read_pdf(
filename,
Expand All @@ -85,10 +90,8 @@ def test_url_poppler():
assert repr(tables[0].cells[0][0]) == "<Cell x1=120 y1=219 x2=165 y2=234>"


@skip_on_windows
def test_url_ghostscript():
if sys.platform not in ["linux", "darwin"]:
return True

url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf"
tables = camelot.read_pdf(
url, backend=ImageConversionBackend(backend="ghostscript", use_fallback=False)
Expand Down Expand Up @@ -126,10 +129,8 @@ def test_pages_poppler():
assert repr(tables[0].cells[0][0]) == "<Cell x1=120 y1=219 x2=165 y2=234>"


@skip_on_windows
def test_pages_ghostscript():
if sys.platform not in ["linux", "darwin"]:
return True

url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf"
tables = camelot.read_pdf(
url, backend=ImageConversionBackend(backend="ghostscript", use_fallback=False)
Expand Down
Loading