Merge pull request #243 from socialcopsdev/add-table-regions

[MRG] Add table regions support
atlanhq · Jan 4, 2019 · 7cf409a · 7cf409a
2 parents a5027e8 + 302a506
commit 7cf409a
Show file tree

Hide file tree

Showing 13 changed files with 162 additions and 60 deletions.
diff --git a/HISTORY.md b/HISTORY.md
@@ -4,6 +4,13 @@ Release History
 master
 ------
 
+**Improvements**
+
+* [#209](https://github.com/socialcopsdev/camelot/issues/209) Add support to analyze only certain page regions to look for tables. [#243](https://github.com/socialcopsdev/camelot/pull/243) by Vinayak Mehta.
+    * You can use `table_regions` in `read_pdf()` to specify approximate page regions which may contain tables.
+    * Kwarg `line_size_scaling` is now called `line_scale`.
+* [#239](https://github.com/socialcopsdev/camelot/issues/239) Raise warning if PDF is image-based. [#240](https://github.com/socialcopsdev/camelot/pull/240) by Vinayak Mehta.
+
 0.6.0 (2018-12-24)
 ------------------
 

diff --git a/camelot/cli.py b/camelot/cli.py
@@ -56,12 +56,15 @@ def cli(ctx, *args, **kwargs):
 
 
 @cli.command('lattice')
+@click.option('-R', '--table_regions', default=[], multiple=True,
+              help='Page regions to analyze. Example: x1,y1,x2,y2'
+              ' where x1, y1 -> left-top and x2, y2 -> right-bottom.')
 @click.option('-T', '--table_areas', default=[], multiple=True,
               help='Table areas to process. Example: x1,y1,x2,y2'
               ' where x1, y1 -> left-top and x2, y2 -> right-bottom.')
 @click.option('-back', '--process_background', is_flag=True,
               help='Process background lines.')
-@click.option('-scale', '--line_size_scaling', default=15,
+@click.option('-scale', '--line_scale', default=15,
               help='Line size scaling factor. The larger the value,'
               ' the smaller the detected lines.')
 @click.option('-copy', '--copy_text', default=[], type=click.Choice(['h', 'v']),
@@ -105,6 +108,8 @@ def lattice(c, *args, **kwargs):
     filepath = kwargs.pop('filepath')
     kwargs.update(conf)
 
+    table_regions = list(kwargs['table_regions'])
+    kwargs['table_regions'] = None if not table_regions else table_regions
     table_areas = list(kwargs['table_areas'])
     kwargs['table_areas'] = None if not table_areas else table_areas
     copy_text = list(kwargs['copy_text'])
@@ -132,6 +137,9 @@ def lattice(c, *args, **kwargs):
 
 
 @cli.command('stream')
+@click.option('-R', '--table_regions', default=[], multiple=True,
+              help='Page regions to analyze. Example: x1,y1,x2,y2'
+              ' where x1, y1 -> left-top and x2, y2 -> right-bottom.')
 @click.option('-T', '--table_areas', default=[], multiple=True,
               help='Table areas to process. Example: x1,y1,x2,y2'
               ' where x1, y1 -> left-top and x2, y2 -> right-bottom.')
@@ -160,6 +168,8 @@ def stream(c, *args, **kwargs):
     filepath = kwargs.pop('filepath')
     kwargs.update(conf)
 
+    table_regions = list(kwargs['table_regions'])
+    kwargs['table_regions'] = None if not table_regions else table_regions
     table_areas = list(kwargs['table_areas'])
     kwargs['table_areas'] = None if not table_areas else table_areas
     columns = list(kwargs['columns'])

diff --git a/camelot/image_processing.py b/camelot/image_processing.py
@@ -48,17 +48,22 @@ def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2):
     return img, threshold
 
 
-def find_lines(threshold, direction='horizontal', line_size_scaling=15, iterations=0):
+def find_lines(threshold, regions=None, direction='horizontal',
+               line_scale=15, iterations=0):
     """Finds horizontal and vertical lines by applying morphological
     transformations on an image.
 
     Parameters
     ----------
     threshold : object
         numpy.ndarray representing the thresholded image.
+    regions : list, optional (default: None)
+        List of page regions that may contain tables, each given as
+        a tuple (x, y, w, h) where (x, y) -> left-top corner and
+        (w, h) -> width and height in image coordinate space.
     direction : string, optional (default: 'horizontal')
         Specifies whether to find vertical or horizontal lines.
-    line_size_scaling : int, optional (default: 15)
+    line_scale : int, optional (default: 15)
         Factor by which the page dimensions will be divided to get
         smallest length of lines that should be detected.
 
@@ -83,26 +88,33 @@ def find_lines(threshold, direction='horizontal', line_size_scaling=15, iteratio
     lines = []
 
     if direction == 'vertical':
-        size = threshold.shape[0] // line_size_scaling
+        size = threshold.shape[0] // line_scale
         el = cv2.getStructuringElement(cv2.MORPH_RECT, (1, size))
     elif direction == 'horizontal':
-        size = threshold.shape[1] // line_size_scaling
+        size = threshold.shape[1] // line_scale
         el = cv2.getStructuringElement(cv2.MORPH_RECT, (size, 1))
     elif direction is None:
         raise ValueError("Specify direction as either 'vertical' or"
                          " 'horizontal'")
 
+    if regions is not None:
+        region_mask = np.zeros(threshold.shape)
+        for region in regions:
+            x, y, w, h = region
+            region_mask[y : y + h, x : x + w] = 1
+        threshold = np.multiply(threshold, region_mask)
+
     threshold = cv2.erode(threshold, el)
     threshold = cv2.dilate(threshold, el)
     dmask = cv2.dilate(threshold, el, iterations=iterations)
 
     try:
         _, contours, _ = cv2.findContours(
-            threshold, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+            threshold.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
     except ValueError:
         # for opencv backward compatibility
         contours, _ = cv2.findContours(
-            threshold, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+            threshold.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
 
     for c in contours:
         x, y, w, h = cv2.boundingRect(c)
@@ -116,7 +128,7 @@ def find_lines(threshold, direction='horizontal', line_size_scaling=15, iteratio
     return dmask, lines
 
 
-def find_table_contours(vertical, horizontal):
+def find_contours(vertical, horizontal):
     """Finds table boundaries using OpenCV's findContours.
 
     Parameters
@@ -138,11 +150,12 @@ def find_table_contours(vertical, horizontal):
 
     try:
         __, contours, __ = cv2.findContours(
-            mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+            mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
     except ValueError:
         # for opencv backward compatibility
         contours, __ = cv2.findContours(
-            mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+            mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+    # sort in reverse based on contour area and use first 10 contours
     contours = sorted(contours, key=cv2.contourArea, reverse=True)[:10]
 
     cont = []
@@ -153,7 +166,7 @@ def find_table_contours(vertical, horizontal):
     return cont
 
 
-def find_table_joints(contours, vertical, horizontal):
+def find_joints(contours, vertical, horizontal):
     """Finds joints/intersections present inside each table boundary.
 
     Parameters
@@ -176,18 +189,18 @@ def find_table_joints(contours, vertical, horizontal):
         and (x2, y2) -> rt in image coordinate space.
 
     """
-    joints = np.bitwise_and(vertical, horizontal)
+    joints = np.multiply(vertical, horizontal)
     tables = {}
     for c in contours:
         x, y, w, h = c
         roi = joints[y : y + h, x : x + w]
         try:
             __, jc, __ = cv2.findContours(
-                roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
+                roi.astype(np.uint8), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
         except ValueError:
             # for opencv backward compatibility
             jc, __ = cv2.findContours(
-                roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
+                roi.astype(np.uint8), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
         if len(jc) <= 4:  # remove contours with less than 4 joints
             continue
         joint_coords = []

diff --git a/camelot/io.py b/camelot/io.py
@@ -52,7 +52,7 @@ def read_pdf(filepath, pages='1', password=None, flavor='lattice',
         to generate columns.
     process_background* : bool, optional (default: False)
         Process background lines.
-    line_size_scaling* : int, optional (default: 15)
+    line_scale* : int, optional (default: 15)
         Line size scaling factor. The larger the value the smaller
         the detected lines. Making it very large will lead to text
         being detected as lines.

diff --git a/camelot/parsers/lattice.py b/camelot/parsers/lattice.py
@@ -16,7 +16,7 @@
                      merge_close_lines, get_table_index, compute_accuracy,
                      compute_whitespace)
 from ..image_processing import (adaptive_threshold, find_lines,
-                                find_table_contours, find_table_joints)
+                                find_contours, find_joints)
 
 
 logger = logging.getLogger('camelot')
@@ -28,13 +28,17 @@ class Lattice(BaseParser):
 
     Parameters
     ----------
+    table_regions : list, optional (default: None)
+        List of page region strings, each of the form x1,y1,x2,y2
+        where (x1, y1) -> left-top and (x2, y2) -> right-bottom
+        in PDF coordinate space, that may contain tables.
     table_areas : list, optional (default: None)
         List of table area strings of the form x1,y1,x2,y2
         where (x1, y1) -> left-top and (x2, y2) -> right-bottom
         in PDF coordinate space.
     process_background : bool, optional (default: False)
         Process background lines.
-    line_size_scaling : int, optional (default: 15)
+    line_scale : int, optional (default: 15)
         Line size scaling factor. The larger the value the smaller
         the detected lines. Making it very large will lead to text
         being detected as lines.
@@ -77,14 +81,15 @@ class Lattice(BaseParser):
         Resolution used for PDF to PNG conversion.
 
     """
-    def __init__(self, table_areas=None, process_background=False,
-                 line_size_scaling=15, copy_text=None, shift_text=['l', 't'],
+    def __init__(self, table_regions=None, table_areas=None, process_background=False,
+                 line_scale=15, copy_text=None, shift_text=['l', 't'],
                  split_text=False, flag_size=False, strip_text='', line_tol=2,
                  joint_tol=2, threshold_blocksize=15, threshold_constant=-2,
                  iterations=0, resolution=300, **kwargs):
+        self.table_regions = table_regions
         self.table_areas = table_areas
         self.process_background = process_background
-        self.line_size_scaling = line_size_scaling
+        self.line_scale = line_scale
         self.copy_text = copy_text
         self.shift_text = shift_text
         self.split_text = split_text
@@ -227,9 +232,22 @@ class GhostscriptNotFound(Exception): pass
             stderr=subprocess.STDOUT)
 
     def _generate_table_bbox(self):
+        def scale_areas(areas):
+            scaled_areas = []
+            for area in areas:
+                x1, y1, x2, y2 = area.split(",")
+                x1 = float(x1)
+                y1 = float(y1)
+                x2 = float(x2)
+                y2 = float(y2)
+                x1, y1, x2, y2 = scale_pdf((x1, y1, x2, y2), image_scalers)
+                scaled_areas.append((x1, y1, abs(x2 - x1), abs(y2 - y1)))
+            return scaled_areas
+
         self.image, self.threshold = adaptive_threshold(
             self.imagename, process_background=self.process_background,
             blocksize=self.threshold_blocksize, c=self.threshold_constant)
+
         image_width = self.image.shape[1]
         image_height = self.image.shape[0]
         image_width_scaler = image_width / float(self.pdf_width)
@@ -239,27 +257,30 @@ def _generate_table_bbox(self):
         image_scalers = (image_width_scaler, image_height_scaler, self.pdf_height)
         pdf_scalers = (pdf_width_scaler, pdf_height_scaler, image_height)
 
-        vertical_mask, vertical_segments = find_lines(
-            self.threshold, direction='vertical',
-            line_size_scaling=self.line_size_scaling, iterations=self.iterations)
-        horizontal_mask, horizontal_segments = find_lines(
-            self.threshold, direction='horizontal',
-            line_size_scaling=self.line_size_scaling, iterations=self.iterations)
+        if self.table_areas is None:
+            regions = None
+            if self.table_regions is not None:
+                regions = scale_areas(self.table_regions)
 
-        if self.table_areas is not None:
-            areas = []
-            for area in self.table_areas:
-                x1, y1, x2, y2 = area.split(",")
-                x1 = float(x1)
-                y1 = float(y1)
-                x2 = float(x2)
-                y2 = float(y2)
-                x1, y1, x2, y2 = scale_pdf((x1, y1, x2, y2), image_scalers)
-                areas.append((x1, y1, abs(x2 - x1), abs(y2 - y1)))
-            table_bbox = find_table_joints(areas, vertical_mask, horizontal_mask)
+            vertical_mask, vertical_segments = find_lines(
+                self.threshold, regions=regions, direction='vertical',
+                line_scale=self.line_scale, iterations=self.iterations)
+            horizontal_mask, horizontal_segments = find_lines(
+                self.threshold, regions=regions, direction='horizontal',
+                line_scale=self.line_scale, iterations=self.iterations)
+
+            contours = find_contours(vertical_mask, horizontal_mask)
+            table_bbox = find_joints(contours, vertical_mask, horizontal_mask)
         else:
-            contours = find_table_contours(vertical_mask, horizontal_mask)
-            table_bbox = find_table_joints(contours, vertical_mask, horizontal_mask)
+            vertical_mask, vertical_segments = find_lines(
+                self.threshold, direction='vertical', line_scale=self.line_scale,
+                iterations=self.iterations)
+            horizontal_mask, horizontal_segments = find_lines(
+                self.threshold, direction='horizontal', line_scale=self.line_scale,
+                iterations=self.iterations)
+
+            areas = scale_areas(self.table_areas)
+            table_bbox = find_joints(areas, vertical_mask, horizontal_mask)
 
         self.table_bbox_unscaled = copy.deepcopy(table_bbox)
 

diff --git a/camelot/parsers/stream.py b/camelot/parsers/stream.py
@@ -26,6 +26,10 @@ class Stream(BaseParser):
 
     Parameters
     ----------
+    table_regions : list, optional (default: None)
+        List of page regions that may contain tables of the form x1,y1,x2,y2
+        where (x1, y1) -> left-top and (x2, y2) -> right-bottom
+        in PDF coordinate space.
     table_areas : list, optional (default: None)
         List of table area strings of the form x1,y1,x2,y2
         where (x1, y1) -> left-top and (x2, y2) -> right-bottom
@@ -51,9 +55,10 @@ class Stream(BaseParser):
         to generate columns.
 
     """
-    def __init__(self, table_areas=None, columns=None, split_text=False,
+    def __init__(self, table_regions=None, table_areas=None, columns=None, split_text=False,
                  flag_size=False, strip_text='', edge_tol=50, row_tol=2,
                  column_tol=0, **kwargs):
+        self.table_regions = table_regions
         self.table_areas = table_areas
         self.columns = columns
         self._validate_columns()
@@ -275,7 +280,18 @@ def _nurminen_table_detection(self, textlines):
 
     def _generate_table_bbox(self):
         self.textedges = []
-        if self.table_areas is not None:
+        if self.table_areas is None:
+            hor_text = self.horizontal_text
+            if self.table_regions is not None:
+                # filter horizontal text
+                hor_text = []
+                for region in self.table_regions:
+                    x1, y1, x2, y2 = region
+                    region_text = text_in_bbox((x1, y2, x2, y1), self.horizontal_text)
+                    hor_text.extend(region_text)
+            # find tables based on nurminen's detection algorithm
+            table_bbox = self._nurminen_table_detection(hor_text)
+        else:
             table_bbox = {}
             for area in self.table_areas:
                 x1, y1, x2, y2 = area.split(",")
@@ -284,9 +300,6 @@ def _generate_table_bbox(self):
                 x2 = float(x2)
                 y2 = float(y2)
                 table_bbox[(x1, y2, x2, y1)] = None
-        else:
-            # find tables based on nurminen's detection algorithm
-            table_bbox = self._nurminen_table_detection(self.horizontal_text)
         self.table_bbox = table_bbox
 
     def _generate_columns_and_rows(self, table_idx, tk):

diff --git a/camelot/utils.py b/camelot/utils.py
@@ -101,7 +101,7 @@ def download_url(url):
 ]
 lattice_kwargs = [
     'process_background',
-    'line_size_scaling',
+    'line_scale',
     'copy_text',
     'shift_text',
     'line_tol',
@@ -339,7 +339,7 @@ def text_in_bbox(bbox, text):
     ----------
     bbox : tuple
         Tuple (x1, y1, x2, y2) representing a bounding box where
-        (x1, y1) -> lb and (x2, y2) -> rt in PDFMiner coordinate
+        (x1, y1) -> lb and (x2, y2) -> rt in the PDF coordinate
         space.
     text : List of PDFMiner text objects.
 

diff --git a/docs/_static/csv/table_regions.csv b/docs/_static/csv/table_regions.csv
@@ -0,0 +1,4 @@
+"Età dell’Assicuratoall’epoca del decesso","Misura % dimaggiorazione"
+"18-75","1,00%"
+"76-80","0,50%"
+"81 in poi","0,10%"
diff --git a/docs/_static/pdf/table_regions.pdf b/docs/_static/pdf/table_regions.pdf