camelot-dev · vinayak-mehta · Jul 11, 2021 · Jul 11, 2021 · Jul 11, 2021 · Jul 11, 2021
diff --git a/HISTORY.md b/HISTORY.md
@@ -6,7 +6,7 @@ master
 
 **Improvements**
 
-- Add pdftopng for image conversion and use ghostscript as fallback. [#198](https://github.com/camelot-dev/camelot/pull/198) by Vinayak Mehta.
+- Add support for multiple image conversion backends. [#198](https://github.com/camelot-dev/camelot/pull/198) and [#253](https://github.com/camelot-dev/camelot/pull/253) by Vinayak Mehta.
 - Add markdown export format. [#222](https://github.com/camelot-dev/camelot/pull/222/) by [Lucas Cimon](https://github.com/Lucas-C).
 
 **Documentation**

diff --git a/camelot/backends/ghostscript_backend.py b/camelot/backends/ghostscript_backend.py
@@ -29,8 +29,8 @@ def installed(self):
     def convert(self, pdf_path, png_path, resolution=300):
         if not self.installed():
             raise OSError(
-                "Ghostscript is not installed. Please install it using the instructions"
-                "here: https://camelot-py.readthedocs.io/en/master/user/install-deps.html"
+                "Ghostscript is not installed. You can install it using the instructions"
+                " here: https://camelot-py.readthedocs.io/en/master/user/install-deps.html"
             )
 
         import ghostscript

diff --git a/camelot/backends/image_conversion.py b/camelot/backends/image_conversion.py
@@ -3,29 +3,29 @@
 from .poppler_backend import PopplerBackend
 from .ghostscript_backend import GhostscriptBackend
 
-backends = {"poppler": PopplerBackend, "ghostscript": GhostscriptBackend}
+# Maps each supported image conversion backend name to the class implementing it.
+BACKENDS = {"poppler": PopplerBackend, "ghostscript": GhostscriptBackend}
 
 
 class ImageConversionBackend(object):
     def __init__(self, backend="poppler", use_fallback=True):
-        if backend not in backends.keys():
+        if backend not in BACKENDS.keys():
             raise ValueError(f"Image conversion backend '{backend}' not supported")
 
         self.backend = backend
         self.use_fallback = use_fallback
-        self.fallbacks = list(filter(lambda x: x != backend, backends.keys()))
+        self.fallbacks = list(filter(lambda x: x != backend, BACKENDS.keys()))
 
     def convert(self, pdf_path, png_path):
         try:
-            converter = backends[self.backend]()
+            converter = BACKENDS[self.backend]()
             converter.convert(pdf_path, png_path)
         except Exception as e:
             import sys
 
             if self.use_fallback:
                 for fallback in self.fallbacks:
                     try:
-                        converter = backends[fallback]()
+                        converter = BACKENDS[fallback]()
                         converter.convert(pdf_path, png_path)
                     except Exception as e:
                         raise type(e)(

diff --git a/camelot/backends/poppler_backend.py b/camelot/backends/poppler_backend.py
@@ -9,7 +9,7 @@ def convert(self, pdf_path, png_path):
         pdftopng_executable = shutil.which("pdftopng")
         if pdftopng_executable is None:
             raise OSError(
-                "pdftopng is not installed. Please install it using the `pip install pdftopng` command."
+                "pdftopng is not installed. You can install it using the 'pip install pdftopng' command."
             )
 
         pdftopng_command = [pdftopng_executable, pdf_path, png_path]

diff --git a/camelot/parsers/lattice.py b/camelot/parsers/lattice.py
@@ -28,7 +28,7 @@
     find_contours,
     find_joints,
 )
-from ..backends import ImageConversionBackend
+from ..backends.image_conversion import BACKENDS
 
 
 logger = logging.getLogger("camelot")
@@ -111,7 +111,7 @@ def __init__(
         threshold_constant=-2,
         iterations=0,
         resolution=300,
-        backend=ImageConversionBackend(),
+        backend="ghostscript",
         **kwargs,
     ):
         self.table_regions = table_regions
@@ -129,7 +129,37 @@ def __init__(
         self.threshold_constant = threshold_constant
         self.iterations = iterations
         self.resolution = resolution
-        self.backend = backend
+        self.backend = Lattice._get_backend(backend)
+
+    @staticmethod
+    def _get_backend(backend):
+        """Resolve ``backend`` into an image conversion backend object.
+
+        Parameters
+        ----------
+        backend : str or object
+            Either a key of ``BACKENDS`` (``"poppler"`` or ``"ghostscript"``),
+            or an object that provides a callable ``convert`` attribute.
+
+        Returns
+        -------
+        object
+            A new instance of the registered backend class when a name is
+            given; otherwise ``backend`` itself, unchanged.
+
+        Raises
+        ------
+        NotImplementedError
+            If the name is not registered in ``BACKENDS``, or if the object
+            has no callable ``convert`` attribute.
+
+        """
+        if isinstance(backend, str):
+            if backend not in BACKENDS:
+                raise NotImplementedError(
+                    f"Unknown backend '{backend}' specified. Please use either 'poppler' or 'ghostscript'."
+                )
+
+            if backend == "ghostscript":
+                # This check cannot tell an explicit backend="ghostscript" from
+                # the default value, so the notice is emitted in both cases.
+                # stacklevel=3 skips this helper and __init__ so the warning is
+                # attributed to the code that constructed the parser.
+                warnings.warn(
+                    "'ghostscript' will be replaced by 'poppler' as the default image conversion"
+                    " backend in v0.12.0. You can try out 'poppler' with backend='poppler'.",
+                    DeprecationWarning,
+                    stacklevel=3,
+                )
+
+            return BACKENDS[backend]()
+
+        # Custom backend object: require a callable ``convert`` rather than
+        # just an attribute with that name, so a misconfigured backend is
+        # rejected here instead of failing later during image conversion.
+        if not callable(getattr(backend, "convert", None)):
+            raise NotImplementedError(
+                f"'{backend}' must implement a 'convert' method"
+            )
+
+        return backend
 
     @staticmethod
     def _reduce_index(t, idx, shift_text):

diff --git a/docs/user/advanced.rst b/docs/user/advanced.rst
@@ -629,7 +629,14 @@ To deal with such cases, you can tweak PDFMiner's `LAParams kwargs <https://gith
 Use alternate image conversion backends
 ---------------------------------------
 
-When using the :ref:`Lattice <lattice>` flavor, Camelot uses `pdftopng <https://github.com/vinayak-mehta/pdftopng>`_ to convert PDF pages to images for line recognition. This should work out of the box on most operating systems. However, if you get an error, you can supply your own image conversion backend to Camelot::
+When using the :ref:`Lattice <lattice>` flavor, Camelot uses ``ghostscript`` to convert PDF pages to images for line recognition. If you face installation issues with ``ghostscript``, you can use an alternate image conversion backend called ``poppler``. You can specify which image conversion backend you want to use with::
+
+    >>> tables = camelot.read_pdf(filename, backend="ghostscript")  # default
+    >>> tables = camelot.read_pdf(filename, backend="poppler")
+
+.. note:: ``poppler`` will be made the default image conversion backend (replacing ``ghostscript``) in ``v0.12.0``.
+
+If you face issues with both ``ghostscript`` and ``poppler``, you can supply your own image conversion backend::
 
     >>> class ConversionBackend(object):
     >>>     def convert(self, pdf_path, png_path):
@@ -639,13 +646,3 @@ When using the :ref:`Lattice <lattice>` flavor, Camelot uses `pdftopng <https://
     >>>         pass
     >>>
     >>> tables = camelot.read_pdf(filename, backend=ConversionBackend())
-
-.. note:: If image conversion using ``pdftopng`` fails, Camelot falls back to ``ghostscript`` to try image conversion again, and if that fails, it raises an error.
-
-In case you want to be explicit about the image conversion backend that Camelot should use, you can supply them like this::
-
-    >>> from camelot.backends.poppler_backend import PopplerBackend
-    >>> from camelot.backends.ghostscript_backend import GhostscriptBackend
-    >>>
-    >>> tables = camelot.read_pdf(filename, backend=PopplerBackend())
-    >>> tables = camelot.read_pdf(filename, backend=GhostscriptBackend())
diff --git a/tests/test_cli.py b/tests/test_cli.py
@@ -1,7 +1,9 @@
 # -*- coding: utf-8 -*-
 
 import os
+import sys
 
+import pytest
 from click.testing import CliRunner
 
 from camelot.cli import cli
@@ -11,6 +13,11 @@
 testdir = os.path.dirname(os.path.abspath(__file__))
 testdir = os.path.join(testdir, "files")
 
+# Marker that skips a test when sys.platform starts with "win"; applied to
+# tests that need Ghostscript (see the reason string below).
+skip_on_windows = pytest.mark.skipif(
+    sys.platform.startswith("win"),
+    reason="Ghostscript not installed in Windows test environment",
+)
+
 
 def test_help_output():
     runner = CliRunner()
@@ -26,6 +33,7 @@ def test_help_output():
     )
 
 
+@skip_on_windows
 def test_cli_lattice():
     with TemporaryDirectory() as tempdir:
         infile = os.path.join(testdir, "foo.pdf")
@@ -35,7 +43,7 @@ def test_cli_lattice():
             cli, ["--format", "csv", "--output", outfile, "lattice", infile]
         )
         assert result.exit_code == 0
-        assert result.output == "Found 1 tables\n"
+        assert "Found 1 tables" in result.output
 
         result = runner.invoke(cli, ["--format", "csv", "lattice", infile])
         output_error = "Error: Please specify output file path using --output"

diff --git a/tests/test_common.py b/tests/test_common.py
@@ -3,6 +3,7 @@
 import os
 import sys
 
+import pytest
 import pandas as pd
 from pandas.testing import assert_frame_equal
 
@@ -16,6 +17,11 @@
 testdir = os.path.dirname(os.path.abspath(__file__))
 testdir = os.path.join(testdir, "files")
 
+# Marker that skips a test when sys.platform starts with "win"; applied to
+# tests that need Ghostscript (see the reason string below).
+skip_on_windows = pytest.mark.skipif(
+    sys.platform.startswith("win"),
+    reason="Ghostscript not installed in Windows test environment",
+)
+
 
 def test_version_generation():
     version = (0, 7, 3)
@@ -32,6 +38,7 @@ def test_version_generation_with_prerelease_revision():
     )
 
 
+@skip_on_windows
 def test_parsing_report():
     parsing_report = {"accuracy": 99.02, "whitespace": 12.24, "order": 1, "page": 1}
 
@@ -61,10 +68,8 @@ def test_repr_poppler():
     assert repr(tables[0].cells[0][0]) == "<Cell x1=120 y1=219 x2=165 y2=234>"
 
 
+@skip_on_windows
 def test_repr_ghostscript():
-    if sys.platform not in ["linux", "darwin"]:
-        return True
-
     filename = os.path.join(testdir, "foo.pdf")
     tables = camelot.read_pdf(
         filename,
@@ -85,10 +90,8 @@ def test_url_poppler():
     assert repr(tables[0].cells[0][0]) == "<Cell x1=120 y1=219 x2=165 y2=234>"
 
 
+@skip_on_windows
 def test_url_ghostscript():
-    if sys.platform not in ["linux", "darwin"]:
-        return True
-
     url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf"
     tables = camelot.read_pdf(
         url, backend=ImageConversionBackend(backend="ghostscript", use_fallback=False)
@@ -126,10 +129,8 @@ def test_pages_poppler():
     assert repr(tables[0].cells[0][0]) == "<Cell x1=120 y1=219 x2=165 y2=234>"
 
 
+@skip_on_windows
 def test_pages_ghostscript():
-    if sys.platform not in ["linux", "darwin"]:
-        return True
-
     url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf"
     tables = camelot.read_pdf(
         url, backend=ImageConversionBackend(backend="ghostscript", use_fallback=False)