In [1]:
import easyocr

In [2]:
import cv2
import os 
from PIL import Image
import matplotlib.pyplot as plt 
import shutil


In [3]:
def convert_doc_to_images(doc_location):
    multi_read = cv2.imreadmulti(doc_location)
    
    num_pages = len(multi_read[1])
    
    images = []
    
    for i in range(num_pages):
        images.append(multi_read[1][i])
        
    return images 

def save_arrays_as_images(list_arrays):
    
    base_name = "saved_folder/image"
    
    folder_name = base_name.split('/')[0]
    
    if not os.path.isdir(folder_name):
        os.mkdir(folder_name)
        
    i = 1 
    names = []
    
    for arr in list_arrays:
        name = base_name + str(i) + ".png"
        res = cv2.imwrite(name, arr)
        if not res:
            raise Exception("Image Not Saved!")
            
        names.append(name)
        i+=1
        
    return names 
        

In [4]:

def save_pdf_from_images(img_locations, saved_name):
    
    folder_name = "saved_pdf"
    if not os.path.isdir(folder_name):
        os.mkdir(folder_name)
        
    images = []

    for img_loc in img_locations:
        image = Image.open(img_loc)
        image = image.convert('RGB')
        images.append(image)

    first_element = images.pop(0)
    
    location = folder_name + "/" + saved_name + ".pdf"

    first_element.save(location,save_all=True, append_images=images)
    
    print("pdf saved!")
    return location



##  Load Sample Doc 

In [5]:
# get numpy arrays of docs 
images = convert_doc_to_images("sample_doc.tif")

In [6]:
# get locations of saved images 
locations = save_arrays_as_images(images)

# 1. Easy OCR 

In [7]:
# create an object 
reader = easyocr.Reader(['en'], gpu=False) # this needs to run only once to load the model into memory



Using CPU. Note: This module is much faster with a GPU.


In [44]:
easyocr_result = []

for image_loc in locations:
    result = reader.readtext(image_loc, detail = 0)
    result = "\n".join(result)
    easyocr_result.append(result)

In [46]:
type(easyocr_result)

list

In [47]:
print(easyocr_result[0])

MISSOURI DIVISION OF EMPLOYMENT SECURITY
EAUA
UNEMPLOYMENT INSURANCE TAX
QUARTERLY CONTRIBUTION
MISSOURI EMPLOYER ACCOUNT NO
AUDIT
AND WAGE REPORT
BLOCK
01-40259-0-00
(DO NOT
File onllne al www uinteract labor mO gOv
USE)
3
CALENDAR QUARTER MEAR
YEAR
2021
Dale Paid
EMPLOYER NAME AND ADDRESS
Ist
2nd
X
3rd
4th
Central
MO
Professlonal
Services
Mis#AVEADuNaWN
RERO
322855 . 97
CIO
ATTN
278973.18
2500
E 
McCarty
St _
43882 . 79
Jefferson City
MO
65101
TAXES DUE (Multiply Item 6 by Your Rate)
14
FEDERAL ID NUMBER
43-1800354
InteREST ASSESSMENT DUE TO
00
maling, relum Ihls page with remitlance (o
FEDERAL ADVANCES
Dlvlslon ot Employment Securlty
InTEREST CHARGES OF PER
P0
Oox 888
Jelterson Clty, MO 65102-0888
MONTH IF PAID AFTER
Make check payable lo Division of Employmenl Secunty or
10
LATE REPORT PENALTY CHARGES
pay onlne al www uinleract labor mo gov
See ltem 15 1o Ihe Let))
573 751 1995
11
OUTSTANDING AMOUNTS AS OF
15
THIS REPORT IS DUE BY
07/31/2021
GREATER OF 10% OR $100 PENALTY AFTER
12


In [48]:
print(easyocr_result[1])

MISSOURI DEPARTMENT OF LABOR AND INDUSTRIAL RELATIONS
EAU 10B
DIVISION OF EMPLOYMENT SECURITY
QUARTERLY WAGE REPORT
P0
Box 888
CONTINUATION SHEET
Jefferson City, MO 65102-0888
Type or print_In_ink:
Print employer'& name and account number a5 shown on
CALENDAR QUARTER AND YEAR
Form MODES-4 Quarlerly Contribullon and Wage Report
Central
MO
Professional
Services
1st
2nd
X
3rd
4th
01-40259-0-00
Year
2021
16 Soclal Securlty No
17 Flret name
Mlddle
Last name
18 Totel Wages
19
20
Probatlonary
Probatlonary
Inltlal
Multi-
Proba-
Start Dete
End date
State
Ilonary
498-50-0899
KEITH
M
BRICKEY
23250.00
492-58-4800
DARREL
SMITH
16130
0o1
493-78-9576
DERRICK
F
VETTER
13631
26
489-86-4328
DANNY
KLIETHERMES
8643
13
491-94-5093
RANDY
HOLTMEYER
11734
00
497-80-6308
CURTIS
F
BAX
16227
20
496-84-3728
BRIAN
K
MCMILLIAN
27942
94
494-88-2893
MATTHEW
F
SEALS
9122.50
486-80-6502
GREGORY
HENKE
19053
76
498-78-1879
JOHN
B
ROCKWELL
15472
001
486-66-5548
RANDALL
KIRCHNER
12424
38
493-19-8902
Noah
Kesel
4221
00
491-

### Saving Result

In [49]:
# create a directory if not already 
if not os.path.isdir("saved_results"):
        os.mkdir("saved_results")

In [51]:
# save the result
def save_result_in_txt(name, string):
    with open("saved_results/" + name + ".txt", "w") as text_file:
        text_file.write(string)
        print("Saved")

In [54]:
final_result = "\n".join(easyocr_result)

save_result_in_txt("easy_ocr", final_result)

Saved


# 2. Pytesseract 

In [55]:
import pytesseract

In [56]:
pytesseract.image_to_string(locations[0])

'MISSOURI DIVISION OF EMPLOYMENT SECURITY EAU4\nUNEMPLOYMENT INSURANCE TAX\n\n     \n          \n   \n  \n      \n  \n    \n\nQUARTERLY CONTRIBUTION 2 MISSOURI EMPLOYER ACCOUNT NO AUDIT\nAND WAGE REPORT BLOCK\n\n01-40259-0-00 eonor\n\n3 CALENDAR QUARTER /YEAR YEAR 2021\nyarn? Eee By a eo\ni +H aa\n\na |\n322855.97 +\n\n278973.18 =\n\nRe h i i\n7 TES DUE nl Item 6 Shy: Your Rate) Tot |\nINTEREST ASSESSMENT DUE TO | oof\nFEDERAL ADVANCES\n9 INTEREST CHARGES OF PER | oof |\nMONTH IF PAID AFTER\n10 LATE REPORT PENALTY CHARGES | oof\n‘See Item 15 to the Left\n\n11 OUTSTANDING AMOUNTS AS OF\n\n  \n\nFile online al www uinteract labor mo gov\n\n1 EMPLOYER NAME AND ADDRESS.\nCentral MO Professional Services\n\n   \n \n\ncio\nATTN\n\n2500 E. McCarty St.\n\n  \n\nJefferson City MO 65101\n14 FEDERAL ID NUMBER 43-1800354\n\n \n  \n    \n\n  \n   \n \n    \n\nIf matting, return this page with remittance to\n\nDivision of Employment Security\nPO Box 888\nJetterson City, MO 65102-0888\n\nMake check p

In [61]:
pytesseract_result = []

for image_loc in locations:
    result = pytesseract.image_to_string(image_loc)
    pytesseract_result.append(result)

### Save result 

In [62]:
final_result = "\n".join(pytesseract_result)

save_result_in_txt("pytesseract", final_result)

Saved


# 3. pyocr 

In [63]:
import pyocr 

In [64]:
tools = pyocr.get_available_tools()
if len(tools) == 0:
    print("No OCR tool found")
    sys.exit(1)
# The tools are returned in the recommended order of usage
tool = tools[0]
print("Will use tool '%s'" % (tool.get_name()))
# Ex: Will use tool 'libtesseract'

langs = tool.get_available_languages()
print("Available languages: %s" % ", ".join(langs))
lang = langs[0]
print("Will use lang '%s'" % (lang))
# Ex: Will use lang 'fra'
# Note that languages are NOT sorted in any way. Please refer
# to the system locale settings for the default language
# to use.


Will use tool 'Tesseract (sh)'
Available languages: eng, osd, snum
Will use lang 'eng'


In [73]:
pyocr_result = []
for image_loc in locations:
    txt = tool.image_to_string(
        Image.open(image_loc),
        lang=lang,
        builder=pyocr.builders.TextBuilder()
    )
    pyocr_result.append(txt)


In [74]:
pyocr_result

['MISSOURI DIVISION OF EMPLOYMENT SECURITY\n\n \n\n \n\n \n\n   \n\n \n\n \n\n \n\n \n\n \n\n \n\nEAU4\nUNEMPLOYMENT INSURANCE TAX\nQUARTERLY CONTRIBUTION 2 MISSOURI EMPLOYER ACCOUNT NO AUDIT\nAND WAGE REPORT BLOCK\n01-40259-0-00 (D0 NOT\nFile online al www uinteract labor mo gov USE)\n3 CALENDAR QUARTER /YEAR YEAR 2021] Date Pad\n1 EMPLOYER NAME AND ADDRESS\nIst and X 3rd 4th\nCentral MO Professional Services OT e ry TT\ncio\nATTN\n_—_ 4 278973.18\n2500 E. McCarty St. — 7\n43882.79\n. +\nJefferson City MO 65101 7 TAXES DUE (Multiply Item 6 by Your Rate)\n14 FEDERALIDNUMBER 431800354 +00\n8 INTEREST ASSESSMENT DUE TO\nIf matting, return this page with remittance to FEDERAL ADVANCES | 90\nDivision of Employment Security 9 INTEREST CHARGES OF PER\nOx -00\nJetterson Clty, MO 65102-0888 MONTH IF PAID AFTER\nMake check payable to Division of Employment Security or 10 LATE REPORT PENALTY CHARGES 00\npay online at www umnteract labor mo gov {See Item 15 to the Left) :\n573 751 1995 11. OUTSTA

### Saved result

In [75]:
final_result = "\n".join(pyocr_result)

save_result_in_txt("pyocr", final_result)

Saved


# 4.Camelot 

In [81]:
import camelot

In [16]:
# convert images to pdf 

pdf = save_pdf_from_images(locations, "doc")

pdf saved!


In [9]:
pdf

'saved_pdf/doc.pdf'

In [82]:
tables = camelot.read_pdf(pdf)



In [83]:
tables

<TableList n=0>

# 5. Keras-ocr (kernel dies in Mac) 

In [90]:
! pip install tensorflow

Collecting tensorflow
  Downloading tensorflow-2.7.0-cp39-cp39-macosx_10_11_x86_64.whl (207.1 MB)
[K     |████████████████████████████████| 207.1 MB 20 kB/s  eta 0:00:017   |████▎                           | 27.4 MB 2.6 MB/s eta 0:01:09eta 0:00:52
[?25hCollecting tensorflow-estimator<2.8,~=2.7.0rc0
  Downloading tensorflow_estimator-2.7.0-py2.py3-none-any.whl (463 kB)
[K     |████████████████████████████████| 463 kB 2.0 MB/s eta 0:00:01
[?25hCollecting protobuf>=3.9.2
  Using cached protobuf-3.19.1-cp39-cp39-macosx_10_9_x86_64.whl (1.0 MB)
Collecting termcolor>=1.1.0
  Using cached termcolor-1.1.0-py3-none-any.whl
Collecting wrapt>=1.11.0
  Downloading wrapt-1.13.3-cp39-cp39-macosx_10_9_x86_64.whl (33 kB)
Collecting google-pasta>=0.1.1
  Using cached google_pasta-0.2.0-py3-none-any.whl (57 kB)
Collecting keras-preprocessing>=1.1.1
  Using cached Keras_Preprocessing-1.1.2-py2.py3-none-any.whl (42 kB)
Collecting gast<0.5.0,>=0.2.1
  Using cached gast-0.4.0-py3-none-any.whl (9.8 kB)
C

In [None]:
!pip install git+https://github.com/faustomorales/keras-ocr.git#egg=keras-ocr


In [None]:
!pip install keras-ocr

In [None]:
import keras_ocr

In [88]:
pipeline = keras_ocr.pipeline.Pipeline()


NameError: name 'keras_ocr' is not defined

In [None]:
prediction_groups = pipeline.recognize(images)

# 6. Tabula

In [24]:
import tabula

In [27]:
from tabula.io import read_pdf

In [28]:
pdf

'saved_pdf/doc.pdf'

In [31]:
df = read_pdf(pdf)

'pages' argument isn't specified.Will extract only from page 1 by default.
Error from tabula-java:
The operation couldn’t be completed. Unable to locate a Java Runtime.
Please visit http://www.java.com for information on installing Java.





CalledProcessError: Command '['java', '-Djava.awt.headless=true', '-Dfile.encoding=UTF8', '-jar', '/Users/hardikkamboj/anaconda3/envs/ocr_env/lib/python3.9/site-packages/tabula/tabula-1.0.5-jar-with-dependencies.jar', '--guess', '--format', 'JSON', 'saved_pdf/doc.pdf']' returned non-zero exit status 1.