In [1]:
%%capture
%pip install PyMuPDF
%pip install pytesseract
%pip install pyocr
%pip install torch torchvision torchaudio
%pip install easyocr

from PIL import Image
import time # << Helper for tracking how long processes take
import cv2
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure


# Model 1: PyTesseract
import pytesseract

# Model 2: PyOCR (Tesseract)
import pyocr
import pyocr.builders  
# Ask PyOCR which OCR backends it can find on this machine and use the first
# one returned. Fail early with an actionable message instead of the bare
# IndexError that `tools[0]` raises when nothing is installed.
tools = pyocr.get_available_tools()
if not tools:
    raise RuntimeError(
        "PyOCR found no OCR tool; install Tesseract (or another supported "
        "engine) and make sure it is on PATH."
    )
tool = tools[0]

# Model 3: EasyOCR
import easyocr
# Build the EasyOCR reader once at module level; text_from_easyocr() reuses
# this single instance on every call. ['en'] is the language list passed to
# the reader and gpu=False asks it not to use a GPU.
reader = easyocr.Reader(['en'], gpu = False)

### Model Functions to return Text

In [2]:
def text_from_pytesseract(img):
    """Run PyTesseract on ``img`` and return the cleaned text with timing.

    Args:
        img: Image object handed straight to ``pytesseract.image_to_string``.

    Returns:
        A ``(text, seconds)`` tuple: the recognised text with every newline
        replaced by a space and surrounding whitespace stripped, and the
        wall-clock time spent (OCR plus cleanup).
    """
    began = time.time()

    raw_output = pytesseract.image_to_string(img)
    # Flatten the output onto one line, then trim the edges.
    single_line = raw_output.replace('\n', ' ')
    cleaned = single_line.strip()

    return cleaned, time.time() - began

def _bgr_to_pil(img):
    """Convert an OpenCV-style image array into a PIL image for PyOCR.

    The array is assumed to be in BGR channel order (the constant used below
    is ``COLOR_BGR2RGB``), so the channels are swapped to RGB first. If
    ``cv2.cvtColor`` rejects the input (for example an image that is not
    3- or 4-channel), the array is handed to PIL unchanged, which keeps the
    original best-effort behaviour.

    Only conversion failures are tolerated. The previous bare ``except:``
    also swallowed ``KeyboardInterrupt`` and ``SystemExit``.
    """
    try:
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    except (cv2.error, TypeError):
        # Not convertible as BGR; fall through with the image as-is.
        pass
    return Image.fromarray(img)


def text_from_pyocr(img):
    """Run PyOCR's text builder on ``img`` and return cleaned text with timing.

    Args:
        img: Image array; converted to a PIL image by ``_bgr_to_pil``.

    Returns:
        A ``(text, seconds)`` tuple: the recognised text with newlines
        replaced by spaces and surrounding whitespace stripped, and the
        wall-clock time spent (conversion, OCR and cleanup).
    """
    start = time.time()

    im_pil = _bgr_to_pil(img)
    text = tool.image_to_string(im_pil, lang='eng', builder=pyocr.builders.TextBuilder())
    text = text.replace('\n', ' ').strip()
    return text, time.time() - start


def number_from_pyocr(img):
    """Run PyOCR's digit builder on ``img`` and return its output with timing.

    Unlike ``text_from_pyocr`` the result is returned exactly as PyOCR
    produced it (no newline or whitespace cleanup), as before.

    Args:
        img: Image array; converted to a PIL image by ``_bgr_to_pil``.

    Returns:
        A ``(text, seconds)`` tuple of the raw recognised text and the
        wall-clock time spent (conversion and OCR).
    """
    start = time.time()

    im_pil = _bgr_to_pil(img)
    text = tool.image_to_string(im_pil, lang='eng', builder=pyocr.builders.DigitBuilder())

    return text, time.time() - start

def text_from_easyocr(img):
    """Run EasyOCR on ``img`` and return the first detected text with timing.

    Args:
        img: Image passed straight to ``reader.readtext``.

    Returns:
        A ``(text, seconds)`` tuple. ``text`` is the text of the first
        detection, or ``''`` when nothing was detected or the OCR call
        failed. ``seconds`` is the wall-clock time spent.
    """
    start = time.time()

    text = ''
    try:
        # The original passed paragraph="False": a non-empty string, which is
        # truthy, so paragraph mode was effectively ON. Passing the real
        # boolean keeps that behaviour and removes the misleading literal.
        # NOTE(review): switch to paragraph=False only after re-running the
        # accuracy comparison, since the reported results used paragraph mode.
        detections = reader.readtext(img, paragraph=True)
        if detections:
            # Each detection carries its text at index 1; only the first
            # detection is used, as before.
            text = detections[0][1]
    except Exception:
        # Deliberate best-effort: an OCR failure counts as "no text found".
        text = ''

    return text, time.time() - start

## Simple Comparison of Single Cell Text/Number Detection

In [9]:
import os

def process(model, file, answer):
    """OCR one image file with the chosen model and score it against ``answer``.

    Args:
        model: One of ``'pytesseract'``, ``'pyocr'`` or ``'easyocr'``.
        file: Path of the image to load with ``cv2.imread``.
        answer: Expected text for the image.

    Returns:
        ``(text, speed, accuracy)`` where ``text`` is the model output,
        ``speed`` the seconds reported by the model function and ``accuracy``
        a bool that is True only on an exact match with ``answer``.
        Returns ``None`` when ``model`` is not one of the known names.

    Raises:
        OSError: if ``cv2.imread`` cannot load ``file``. Previously the
            ``None`` image was passed on to the model, which with easyocr was
            silently scored as a wrong answer.
    """
    # Built at call time so the lookup always sees the current definitions.
    extractors = {
        'pytesseract': text_from_pytesseract,
        'pyocr': text_from_pyocr,
        'easyocr': text_from_easyocr,
    }
    extractor = extractors.get(model)
    if extractor is None:
        return None

    img = cv2.imread(file)
    if img is None:
        # cv2.imread signals a missing or undecodable file by returning None.
        raise OSError(f"Could not read image file: {file!r}")

    text, speed = extractor(img)
    accuracy = text == answer
    return text, speed, accuracy


def get_stats(model, directory='test-images/'):
    """Run ``model`` over every labelled image in ``directory``.

    A file is treated as a labelled test image when its name contains ``=``.
    The expected answer is everything after the first ``=``, minus a trailing
    ``.png``.

    Args:
        model: Model name forwarded to ``process``.
        directory: Folder holding the test images. Defaults to the
            ``'test-images/'`` folder the notebook originally hard-coded.

    Returns:
        Four parallel lists ``(answers, results, speeds, accuracy)`` with one
        entry per labelled image, in sorted filename order.

    Raises:
        ValueError: if ``process`` does not recognise ``model`` (previously an
            opaque unpacking ``TypeError``).
    """
    answers = []
    results = []
    speeds = []
    accuracy = []

    # os.listdir returns entries in arbitrary order; sort so that the report
    # is reproducible from run to run.
    for filename in sorted(os.listdir(directory)):
        if '=' not in filename:
            continue

        # Split on the first '=' only, so answers that contain '=' survive.
        answer = filename.split('=', 1)[1]
        # Drop the extension only at the end, not wherever '.png' appears.
        if answer.endswith('.png'):
            answer = answer[:-len('.png')]

        outcome = process(model, os.path.join(directory, filename), answer)
        if outcome is None:
            raise ValueError(f"Unknown OCR model: {model!r}")
        text, speed, acc = outcome

        answers.append(answer)
        results.append(text)
        speeds.append(speed)
        accuracy.append(acc)

    return answers, results, speeds, accuracy


def get_report(model):
    """Print a plain-text summary of ``get_stats(model)``.

    The report lists the expected answers, the model's answers, the mean
    per-image time and the percentage of exact matches, with a dashed rule
    between sections. Nothing is returned.
    """
    expected, recognised, timings, hits = get_stats(model)
    divider = "--------------------------------------------------------------------------------------------------"

    def emit(*parts):
        # Print one report section followed by the dashed rule.
        print(*parts)
        print(divider)

    emit("RESULTS: ", model)
    emit('O-Answers: ', expected)
    emit('R-Answers:', recognised)
    emit('Avg Speed:', np.mean(timings))
    # The last section has no trailing rule.
    print('Accuracy:', np.mean(hits) * 100)

## Results Breakdown

Below are the results of running each model on the 30 single-cell test images that I captured during my preprocessing step.

#### PyTesseract

The model is fast, but it struggles with very short cells: every two-digit value in the test set (and 'AIT') came back empty. I also had to add line-break and surrounding-whitespace removal to clean the returned text.

In [10]:
get_report('pytesseract')

RESULTS:  pytesseract
--------------------------------------------------------------------------------------------------
O-Answers:  ['NGB 23B', '17', '02', '30', '19', 'NGB', 'NGB', '2016', '05', 'AIT', 'ARMY Active Duty', '2011', '26', '01', 'DEP', 'PRESENT', '05', '26', '2011', '17', '18', '64', 'USAR Active', '2012', '13', '2012', '05', '28', '2011', 'DD4']
--------------------------------------------------------------------------------------------------
R-Answers: ['NGB 23B', '', '', '', '', 'NGB', 'NGB', '2016', '', '', 'ARMY Active Duty', '2011', '', '', 'DEP', 'PRESENT', '', '', '2011', '', '', '', 'USAR Active', '2012', '', '2012', '', '', '2011', 'DD4']
--------------------------------------------------------------------------------------------------
Avg Speed: 0.06674017906188964
--------------------------------------------------------------------------------------------------
Accuracy: 46.666666666666664


#### PyOCR

The model is very fast and also very accurate. I will continue to test it on full documents later, but I am very happy with its performance. In addition to the pre-processing, I had to add line-break and surrounding-whitespace removal to clean the text after inference, and with that in place it's working like a champ!

In [11]:
get_report('pyocr')

RESULTS:  pyocr
--------------------------------------------------------------------------------------------------
O-Answers:  ['NGB 23B', '17', '02', '30', '19', 'NGB', 'NGB', '2016', '05', 'AIT', 'ARMY Active Duty', '2011', '26', '01', 'DEP', 'PRESENT', '05', '26', '2011', '17', '18', '64', 'USAR Active', '2012', '13', '2012', '05', '28', '2011', 'DD4']
--------------------------------------------------------------------------------------------------
R-Answers: ['NGB 23B', '17', '02', '30', '19', 'NGB', 'NGB', '2016', '05', 'AIT', 'ARMY Active Duty', '2011', '26', '01', 'DEP', 'PRESENT', '05', '26', '2011', '17', '18', '64', 'USAR Active', '2012', '13', '2012', '05', '28', '2011', 'DD4']
--------------------------------------------------------------------------------------------------
Avg Speed: 0.06562470595041911
--------------------------------------------------------------------------------------------------
Accuracy: 100.0


#### EasyOCR

This model is just amazing for what I need. It has reliably achieved nearly 100% accuracy in all previous testing, even with minimal preprocessing. That's largely thanks to the built-in image processing it performs on its own via CRAFT.

The model is slow, but not so slow that I will neglect using it. Additionally, the team has mentioned increasing its speed in versions to come.

In [12]:
get_report('easyocr')

[W NNPACK.cpp:51] Could not initialize NNPACK! Reason: Unsupported hardware.


RESULTS:  easyocr
--------------------------------------------------------------------------------------------------
O-Answers:  ['NGB 23B', '17', '02', '30', '19', 'NGB', 'NGB', '2016', '05', 'AIT', 'ARMY Active Duty', '2011', '26', '01', 'DEP', 'PRESENT', '05', '26', '2011', '17', '18', '64', 'USAR Active', '2012', '13', '2012', '05', '28', '2011', 'DD4']
--------------------------------------------------------------------------------------------------
R-Answers: ['NGB 23B', '17', '02', '30', '19', 'NGB', 'NGB', '2016', '05', 'AIT', 'ARMY Active Duty', '2011', '26', '01', 'DEP', 'PRESENT', '05', '26', '2011', '17', '18', '64', 'USAR Active', '2012', '13', '2012', '05', '28', '2011', 'DD4']
--------------------------------------------------------------------------------------------------
Avg Speed: 0.4933610995610555
--------------------------------------------------------------------------------------------------
Accuracy: 100.0
