For the *table_ocr* package, the *extract_tables*, *extract_cells*, *ocr_image* and *ocr_to_csv* files were customized so that they work on the input document. The updated files are available in this GitHub repository. The *table_ocr.traineddata* file in the *tessdata* folder of the package also needs to be moved to the *tessdata* folder of the location where Tesseract-OCR is installed.<br/><br/>
The extraction is done in 2 parts - 
1. First, all the pages in the PDF file are converted to images.
2. Pytesseract and OpenCV packages are then used to extract the tables and the texts from the images.
The reason for dividing the process into 2 parts is that it allows the extraction of the text (for the Condition Text column) and of the tables, as well as the processing of the different image files, to be done in parallel, which would reduce the latency of the model (although that is not done in this POC).

In [7]:
from PIL import Image
import pytesseract
from pytesseract import Output
from pdf2image import convert_from_path
from pathlib import Path
import os
import cv2
import table_ocr.util
import table_ocr.extract_tables
import table_ocr.extract_cells
import table_ocr.ocr_image
import table_ocr.ocr_to_csv
import pandas as pd
import numpy as np

In [18]:
# reads the text from an image file using OCR (pytesseract package)
def read_text(filename):
    custom_config = r'-c preserve_interword_spaces=1 --oem 1 --psm 1 -l eng+ita'
    d = pytesseract.image_to_data(Image.open(os.path.join(OUTPUT_FOLDER, filename)), config=custom_config, output_type=Output.DICT)
    df = pd.DataFrame(d)

    # clean up blanks
    df1 = df[(df.conf != '-1') & (df.text != ' ') & (df.text != '')]
    # sort blocks vertically
    sorted_blocks = df1.groupby('block_num').first().sort_values('top').index.tolist()
    text = ''
    for block in sorted_blocks:
        curr = df1[df1['block_num'] == block]
        sel = curr[curr.text.str.len() > 3]
        char_w = (sel.width / sel.text.str.len()).mean()
        prev_par, prev_line, prev_left = 0, 0, 0
        
        for ix, ln in curr.iterrows():
            # add new line when necessary
            if prev_par != ln['par_num']:
                text += '\n'
                prev_par = ln['par_num']
                prev_line = ln['line_num']
                prev_left = 0
            elif prev_line != ln['line_num']:
                text += '\n'
                prev_line = ln['line_num']
                prev_left = 0

            added = 0  # num of spaces that should be added
            if ln['left'] / char_w > prev_left + 1:
                added = int((ln['left']) / char_w) - prev_left
                text += ' ' * added 
            text += ln['text'] + ' '
            prev_left += len(ln['text']) + added + 1
        text += '\n'
    return text

In [3]:
# reads the tables from an image file using OpenCV and OCR (table_ocr package), and creates an image without the tables 
def read_tables(image_filename):
    global tables_dict, tables_counter
    image_filepath = os.path.join(OUTPUT_FOLDER, image_filename)
    image_tables = table_ocr.extract_tables.main([image_filepath])
    print("Running {}".format(f"extract_tables.main([{image_filepath}])."))
    print("Extracted the following tables from the image:")
    print(image_tables)
    for image, tables in image_tables:
        print(f"Processing tables for {image}.")
        for table in tables:
            print(f"Processing table {table}.")
            cells = table_ocr.extract_cells.main(table)
            ocr = [
                table_ocr.ocr_image.main(cell, None)
                for cell in cells
            ]
            print("Extracted {} cells from {}".format(len(ocr), table))
            _, rows = table_ocr.ocr_to_csv.text_files_to_csv(ocr)
            if max(map(len, rows)) == len(rows[0]):
                tables_counter += 1
                sheet_name = 'Table' + str(tables_counter) + ' - Page' + image_filename.split('.')[0].split('_')[1]
                tables_dict[sheet_name] = pd.DataFrame(rows[1:], columns=list(rows[0]))

In [19]:
%%time
PDF_file = 'input_document.pdf' # the input document
FIRST_PAGE, LAST_PAGE = 3, 13 # pages to extract
OUTPUT_FOLDER = 'output' # folder containing the outputs
Path(OUTPUT_FOLDER).mkdir(exist_ok=True)

'''
Part #1 : Converting PDF to images
'''
  
# Store all the pages of the PDF in a variable
pages = convert_from_path(PDF_file, dpi=500, first_page=FIRST_PAGE, last_page=LAST_PAGE, thread_count=4)
  
# Iterate through all the pages stored above
for i, page in enumerate(pages):
    filename = 'page_' + str(i + FIRST_PAGE) + '.png'
    # Save the image of the page in system
    page.save(os.path.join(OUTPUT_FOLDER, filename), 'PNG')
  
'''
Part #2 - Recognizing text from the images using OCR
'''

# dictionary containing the key as the sheet name and the value as the contents of the table as a DataFrame
tables_dict, tables_counter = {}, 0 # the counter keeps count of the number of tables

# Creating a text file to write the output
outfile = os.path.join(OUTPUT_FOLDER, 'out_text.txt')
f = open(outfile, 'a')

for i in range(FIRST_PAGE, LAST_PAGE + 1):
    filename = 'page_' + str(i) + '.png'
    
    print('Processing', filename)
    
    print('Reading tables')
    read_tables(filename)     
    
    print('Reading text')
    text = read_text(filename.split('.')[0] + '_text.png')
    f.write(text)
    
f.close()

print(len(tables_dict), 'tables identified.')

# writing the contents of the tables in an Excel workbook called 'model_output.xlsx'
writer = pd.ExcelWriter(os.path.join(OUTPUT_FOLDER, 'model_output.xlsx'), engine='xlsxwriter')
for sheet_name, df in tables_dict.items():
    df.to_excel(writer, sheet_name=sheet_name, index=False)
writer.save()

Processing  page_3.png
Reading tables
Running extract_tables.main([output\page_3.png]).
Extracted the following tables from the image:
[('output\\page_3.png', ['output\\page_3\\table-000.png', 'output\\page_3\\table-001.png', 'output\\page_3\\table-002.png', 'output\\page_3\\table-003.png'])]
Processing tables for output\page_3.png.
Processing table output\page_3\table-000.png.
Extracted 27 cells from output\page_3\table-000.png
Processing table output\page_3\table-001.png.
Extracted 8 cells from output\page_3\table-001.png
Processing table output\page_3\table-002.png.
Extracted 36 cells from output\page_3\table-002.png
Processing table output\page_3\table-003.png.
Extracted 12 cells from output\page_3\table-003.png
Reading text
Processing  page_4.png
Reading tables
Running extract_tables.main([output\page_4.png]).
Extracted the following tables from the image:
[('output\\page_4.png', ['output\\page_4\\table-000.png', 'output\\page_4\\table-001.png'])]
Processing tables for output\page

In [20]:
# checking the text output of the model
f = open(os.path.join(OUTPUT_FOLDER, 'out_text.txt'), 'r+')
lines = [line for line in f.readlines()]
f.close()
lines

['\n',
 '                               PERMIT TO OPERATE \n',
 '                         AIR POLLUTION CONTROL FACILITY \n',
 '                              SPECIFIC CONDITIONS \n',
 '\n',
 '            Terra Nitrogen, LP                                   Permit No. 2011-006-TVR2 (M-8) \n',
 '            Verdigris Nitrogen Plant — Claremore Nitric Acid, Ammonia, and UAN Plants \n',
 '\n',
 '              The permittee is authorized to operate in conformity with the specifications submitted to Air \n',
 '              Quality on July 7, 2015, and supplemental information recetved September 3, 2015, April 28, \n',
 '              2016, and January 31, 2018. The Evaluation Memorandum    dated March  12, 2018, explains the \n',
 '              derivation of applicable permit requirements and estimates of emissions; however, it does not \n',
 '              contain limitations or permit requirements. Continuing operations under this permit constitutes \n',
 '              acceptance of, an