Skip to content

Commit

Permalink
Merge pull request #243 from socialcopsdev/add-table-regions
Browse files Browse the repository at this point in the history
[MRG] Add table regions support
  • Loading branch information
vinayak-mehta committed Jan 4, 2019
2 parents a5027e8 + 302a506 commit 7cf409a
Show file tree
Hide file tree
Showing 13 changed files with 162 additions and 60 deletions.
7 changes: 7 additions & 0 deletions HISTORY.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,13 @@ Release History
master
------

**Improvements**

* [#209](https://github.com/socialcopsdev/camelot/issues/209) Add support to analyze only certain page regions to look for tables. [#243](https://github.com/socialcopsdev/camelot/pull/243) by Vinayak Mehta.
* You can use `table_regions` in `read_pdf()` to specify approximate page regions which may contain tables.
* Kwarg `line_size_scaling` is now called `line_scale`.
* [#239](https://github.com/socialcopsdev/camelot/issues/239) Raise warning if PDF is image-based. [#240](https://github.com/socialcopsdev/camelot/pull/240) by Vinayak Mehta.

0.6.0 (2018-12-24)
------------------

Expand Down
12 changes: 11 additions & 1 deletion camelot/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,12 +56,15 @@ def cli(ctx, *args, **kwargs):


@cli.command('lattice')
@click.option('-R', '--table_regions', default=[], multiple=True,
help='Page regions to analyze. Example: x1,y1,x2,y2'
' where x1, y1 -> left-top and x2, y2 -> right-bottom.')
@click.option('-T', '--table_areas', default=[], multiple=True,
help='Table areas to process. Example: x1,y1,x2,y2'
' where x1, y1 -> left-top and x2, y2 -> right-bottom.')
@click.option('-back', '--process_background', is_flag=True,
help='Process background lines.')
@click.option('-scale', '--line_size_scaling', default=15,
@click.option('-scale', '--line_scale', default=15,
help='Line size scaling factor. The larger the value,'
' the smaller the detected lines.')
@click.option('-copy', '--copy_text', default=[], type=click.Choice(['h', 'v']),
Expand Down Expand Up @@ -105,6 +108,8 @@ def lattice(c, *args, **kwargs):
filepath = kwargs.pop('filepath')
kwargs.update(conf)

table_regions = list(kwargs['table_regions'])
kwargs['table_regions'] = None if not table_regions else table_regions
table_areas = list(kwargs['table_areas'])
kwargs['table_areas'] = None if not table_areas else table_areas
copy_text = list(kwargs['copy_text'])
Expand Down Expand Up @@ -132,6 +137,9 @@ def lattice(c, *args, **kwargs):


@cli.command('stream')
@click.option('-R', '--table_regions', default=[], multiple=True,
help='Page regions to analyze. Example: x1,y1,x2,y2'
' where x1, y1 -> left-top and x2, y2 -> right-bottom.')
@click.option('-T', '--table_areas', default=[], multiple=True,
help='Table areas to process. Example: x1,y1,x2,y2'
' where x1, y1 -> left-top and x2, y2 -> right-bottom.')
Expand Down Expand Up @@ -160,6 +168,8 @@ def stream(c, *args, **kwargs):
filepath = kwargs.pop('filepath')
kwargs.update(conf)

table_regions = list(kwargs['table_regions'])
kwargs['table_regions'] = None if not table_regions else table_regions
table_areas = list(kwargs['table_areas'])
kwargs['table_areas'] = None if not table_areas else table_areas
columns = list(kwargs['columns'])
Expand Down
39 changes: 26 additions & 13 deletions camelot/image_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,17 +48,22 @@ def adaptive_threshold(imagename, process_background=False, blocksize=15, c=-2):
return img, threshold


def find_lines(threshold, direction='horizontal', line_size_scaling=15, iterations=0):
def find_lines(threshold, regions=None, direction='horizontal',
line_scale=15, iterations=0):
"""Finds horizontal and vertical lines by applying morphological
transformations on an image.
Parameters
----------
threshold : object
numpy.ndarray representing the thresholded image.
regions : list, optional (default: None)
List of page regions that may contain tables of the form x1,y1,x2,y2
where (x1, y1) -> left-top and (x2, y2) -> right-bottom
in image coordinate space.
direction : string, optional (default: 'horizontal')
Specifies whether to find vertical or horizontal lines.
line_size_scaling : int, optional (default: 15)
line_scale : int, optional (default: 15)
Factor by which the page dimensions will be divided to get
smallest length of lines that should be detected.
Expand All @@ -83,26 +88,33 @@ def find_lines(threshold, direction='horizontal', line_size_scaling=15, iteratio
lines = []

if direction == 'vertical':
size = threshold.shape[0] // line_size_scaling
size = threshold.shape[0] // line_scale
el = cv2.getStructuringElement(cv2.MORPH_RECT, (1, size))
elif direction == 'horizontal':
size = threshold.shape[1] // line_size_scaling
size = threshold.shape[1] // line_scale
el = cv2.getStructuringElement(cv2.MORPH_RECT, (size, 1))
elif direction is None:
raise ValueError("Specify direction as either 'vertical' or"
" 'horizontal'")

if regions is not None:
region_mask = np.zeros(threshold.shape)
for region in regions:
x, y, w, h = region
region_mask[y : y + h, x : x + w] = 1
threshold = np.multiply(threshold, region_mask)

threshold = cv2.erode(threshold, el)
threshold = cv2.dilate(threshold, el)
dmask = cv2.dilate(threshold, el, iterations=iterations)

try:
_, contours, _ = cv2.findContours(
threshold, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
threshold.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
except ValueError:
# for opencv backward compatibility
contours, _ = cv2.findContours(
threshold, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
threshold.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

for c in contours:
x, y, w, h = cv2.boundingRect(c)
Expand All @@ -116,7 +128,7 @@ def find_lines(threshold, direction='horizontal', line_size_scaling=15, iteratio
return dmask, lines


def find_table_contours(vertical, horizontal):
def find_contours(vertical, horizontal):
"""Finds table boundaries using OpenCV's findContours.
Parameters
Expand All @@ -138,11 +150,12 @@ def find_table_contours(vertical, horizontal):

try:
__, contours, __ = cv2.findContours(
mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
except ValueError:
# for opencv backward compatibility
contours, __ = cv2.findContours(
mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
# sort in reverse based on contour area and use first 10 contours
contours = sorted(contours, key=cv2.contourArea, reverse=True)[:10]

cont = []
Expand All @@ -153,7 +166,7 @@ def find_table_contours(vertical, horizontal):
return cont


def find_table_joints(contours, vertical, horizontal):
def find_joints(contours, vertical, horizontal):
"""Finds joints/intersections present inside each table boundary.
Parameters
Expand All @@ -176,18 +189,18 @@ def find_table_joints(contours, vertical, horizontal):
and (x2, y2) -> rt in image coordinate space.
"""
joints = np.bitwise_and(vertical, horizontal)
joints = np.multiply(vertical, horizontal)
tables = {}
for c in contours:
x, y, w, h = c
roi = joints[y : y + h, x : x + w]
try:
__, jc, __ = cv2.findContours(
roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
roi.astype(np.uint8), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
except ValueError:
# for opencv backward compatibility
jc, __ = cv2.findContours(
roi, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
roi.astype(np.uint8), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
if len(jc) <= 4: # remove contours with less than 4 joints
continue
joint_coords = []
Expand Down
2 changes: 1 addition & 1 deletion camelot/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def read_pdf(filepath, pages='1', password=None, flavor='lattice',
to generate columns.
process_background* : bool, optional (default: False)
Process background lines.
line_size_scaling* : int, optional (default: 15)
line_scale* : int, optional (default: 15)
Line size scaling factor. The larger the value the smaller
the detected lines. Making it very large will lead to text
being detected as lines.
Expand Down
69 changes: 45 additions & 24 deletions camelot/parsers/lattice.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
merge_close_lines, get_table_index, compute_accuracy,
compute_whitespace)
from ..image_processing import (adaptive_threshold, find_lines,
find_table_contours, find_table_joints)
find_contours, find_joints)


logger = logging.getLogger('camelot')
Expand All @@ -28,13 +28,17 @@ class Lattice(BaseParser):
Parameters
----------
table_regions : list, optional (default: None)
List of page regions that may contain tables of the form x1,y1,x2,y2
where (x1, y1) -> left-top and (x2, y2) -> right-bottom
in PDF coordinate space.
table_areas : list, optional (default: None)
List of table area strings of the form x1,y1,x2,y2
where (x1, y1) -> left-top and (x2, y2) -> right-bottom
in PDF coordinate space.
process_background : bool, optional (default: False)
Process background lines.
line_size_scaling : int, optional (default: 15)
line_scale : int, optional (default: 15)
Line size scaling factor. The larger the value the smaller
the detected lines. Making it very large will lead to text
being detected as lines.
Expand Down Expand Up @@ -77,14 +81,15 @@ class Lattice(BaseParser):
Resolution used for PDF to PNG conversion.
"""
def __init__(self, table_areas=None, process_background=False,
line_size_scaling=15, copy_text=None, shift_text=['l', 't'],
def __init__(self, table_regions=None, table_areas=None, process_background=False,
line_scale=15, copy_text=None, shift_text=['l', 't'],
split_text=False, flag_size=False, strip_text='', line_tol=2,
joint_tol=2, threshold_blocksize=15, threshold_constant=-2,
iterations=0, resolution=300, **kwargs):
self.table_regions = table_regions
self.table_areas = table_areas
self.process_background = process_background
self.line_size_scaling = line_size_scaling
self.line_scale = line_scale
self.copy_text = copy_text
self.shift_text = shift_text
self.split_text = split_text
Expand Down Expand Up @@ -227,9 +232,22 @@ class GhostscriptNotFound(Exception): pass
stderr=subprocess.STDOUT)

def _generate_table_bbox(self):
def scale_areas(areas):
scaled_areas = []
for area in areas:
x1, y1, x2, y2 = area.split(",")
x1 = float(x1)
y1 = float(y1)
x2 = float(x2)
y2 = float(y2)
x1, y1, x2, y2 = scale_pdf((x1, y1, x2, y2), image_scalers)
scaled_areas.append((x1, y1, abs(x2 - x1), abs(y2 - y1)))
return scaled_areas

self.image, self.threshold = adaptive_threshold(
self.imagename, process_background=self.process_background,
blocksize=self.threshold_blocksize, c=self.threshold_constant)

image_width = self.image.shape[1]
image_height = self.image.shape[0]
image_width_scaler = image_width / float(self.pdf_width)
Expand All @@ -239,27 +257,30 @@ def _generate_table_bbox(self):
image_scalers = (image_width_scaler, image_height_scaler, self.pdf_height)
pdf_scalers = (pdf_width_scaler, pdf_height_scaler, image_height)

vertical_mask, vertical_segments = find_lines(
self.threshold, direction='vertical',
line_size_scaling=self.line_size_scaling, iterations=self.iterations)
horizontal_mask, horizontal_segments = find_lines(
self.threshold, direction='horizontal',
line_size_scaling=self.line_size_scaling, iterations=self.iterations)
if self.table_areas is None:
regions = None
if self.table_regions is not None:
regions = scale_areas(self.table_regions)

if self.table_areas is not None:
areas = []
for area in self.table_areas:
x1, y1, x2, y2 = area.split(",")
x1 = float(x1)
y1 = float(y1)
x2 = float(x2)
y2 = float(y2)
x1, y1, x2, y2 = scale_pdf((x1, y1, x2, y2), image_scalers)
areas.append((x1, y1, abs(x2 - x1), abs(y2 - y1)))
table_bbox = find_table_joints(areas, vertical_mask, horizontal_mask)
vertical_mask, vertical_segments = find_lines(
self.threshold, regions=regions, direction='vertical',
line_scale=self.line_scale, iterations=self.iterations)
horizontal_mask, horizontal_segments = find_lines(
self.threshold, regions=regions, direction='horizontal',
line_scale=self.line_scale, iterations=self.iterations)

contours = find_contours(vertical_mask, horizontal_mask)
table_bbox = find_joints(contours, vertical_mask, horizontal_mask)
else:
contours = find_table_contours(vertical_mask, horizontal_mask)
table_bbox = find_table_joints(contours, vertical_mask, horizontal_mask)
vertical_mask, vertical_segments = find_lines(
self.threshold, direction='vertical', line_scale=self.line_scale,
iterations=self.iterations)
horizontal_mask, horizontal_segments = find_lines(
self.threshold, direction='horizontal', line_scale=self.line_scale,
iterations=self.iterations)

areas = scale_areas(self.table_areas)
table_bbox = find_joints(areas, vertical_mask, horizontal_mask)

self.table_bbox_unscaled = copy.deepcopy(table_bbox)

Expand Down
23 changes: 18 additions & 5 deletions camelot/parsers/stream.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,10 @@ class Stream(BaseParser):
Parameters
----------
table_regions : list, optional (default: None)
List of page regions that may contain tables of the form x1,y1,x2,y2
where (x1, y1) -> left-top and (x2, y2) -> right-bottom
in PDF coordinate space.
table_areas : list, optional (default: None)
List of table area strings of the form x1,y1,x2,y2
where (x1, y1) -> left-top and (x2, y2) -> right-bottom
Expand All @@ -51,9 +55,10 @@ class Stream(BaseParser):
to generate columns.
"""
def __init__(self, table_areas=None, columns=None, split_text=False,
def __init__(self, table_regions=None, table_areas=None, columns=None, split_text=False,
flag_size=False, strip_text='', edge_tol=50, row_tol=2,
column_tol=0, **kwargs):
self.table_regions = table_regions
self.table_areas = table_areas
self.columns = columns
self._validate_columns()
Expand Down Expand Up @@ -275,7 +280,18 @@ def _nurminen_table_detection(self, textlines):

def _generate_table_bbox(self):
self.textedges = []
if self.table_areas is not None:
if self.table_areas is None:
hor_text = self.horizontal_text
if self.table_regions is not None:
# filter horizontal text
hor_text = []
for region in self.table_regions:
x1, y1, x2, y2 = region
region_text = text_in_bbox((x1, y2, x2, y1), self.horizontal_text)
hor_text.extend(region_text)
# find tables based on nurminen's detection algorithm
table_bbox = self._nurminen_table_detection(hor_text)
else:
table_bbox = {}
for area in self.table_areas:
x1, y1, x2, y2 = area.split(",")
Expand All @@ -284,9 +300,6 @@ def _generate_table_bbox(self):
x2 = float(x2)
y2 = float(y2)
table_bbox[(x1, y2, x2, y1)] = None
else:
# find tables based on nurminen's detection algorithm
table_bbox = self._nurminen_table_detection(self.horizontal_text)
self.table_bbox = table_bbox

def _generate_columns_and_rows(self, table_idx, tk):
Expand Down
4 changes: 2 additions & 2 deletions camelot/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ def download_url(url):
]
lattice_kwargs = [
'process_background',
'line_size_scaling',
'line_scale',
'copy_text',
'shift_text',
'line_tol',
Expand Down Expand Up @@ -339,7 +339,7 @@ def text_in_bbox(bbox, text):
----------
bbox : tuple
Tuple (x1, y1, x2, y2) representing a bounding box where
(x1, y1) -> lb and (x2, y2) -> rt in PDFMiner coordinate
(x1, y1) -> lb and (x2, y2) -> rt in the PDF coordinate
space.
text : List of PDFMiner text objects.
Expand Down
4 changes: 4 additions & 0 deletions docs/_static/csv/table_regions.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
"Età dell’Assicuratoall’epoca del decesso","Misura % dimaggiorazione"
"18-75","1,00%"
"76-80","0,50%"
"81 in poi","0,10%"
Binary file added docs/_static/pdf/table_regions.pdf
Binary file not shown.
Loading

0 comments on commit 7cf409a

Please sign in to comment.