# Synopsis

Extracting information from scanned files.

# Words to remember

**OCR**

**denoising**

**blurring**

# Read libraries

In [None]:
# Load IPython's autoreload extension and set it to mode 2, so that imported
# modules are reloaded before each cell is executed.
%load_ext autoreload
%autoreload 2
# Draw matplotlib figures inside the notebook, below the cell that made them.
%matplotlib inline

from colorama import Back, Fore, Style
from copy import copy, deepcopy
from pathlib import Path


In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np

from matplotlib.gridspec import GridSpec
from matplotlib.patches import Circle
from pylab import imread, imshow, imsave
from scipy.stats import pearsonr
from skimage import img_as_float, img_as_ubyte
from skimage.color import rgb2gray
from skimage.filters import rank, threshold_otsu, gaussian
from skimage.measure import find_contours
from skimage.morphology import ( disk, binary_dilation, binary_erosion, 
                                 binary_closing, binary_opening, 
                                 remove_small_holes, remove_small_objects,
                                 flood_fill, )
from skimage.util import random_noise

from skimage.transform import estimate_transform, warp

from module_libraries.my_stats import half_frame
from module_libraries.image_lib import display_all_channels, grayscale_zoom

# Font size setting, presumably meant for figure text (it is not referenced by
# any of the cells in this notebook).
my_fontsize = 15

# Input scans and generated results are both looked up relative to the
# directory the notebook is run from.
data_folder = Path.cwd().joinpath( 'Data', 'Scanned_Images' )
results_folder = Path.cwd().joinpath( 'Results' )


## Special requirements

This notebook requires a special environment in which `pytesseract` is installed.

In [None]:
import pytesseract


# Load images

We load the images we saved earlier.  Recall that `graph_box` was a full color image and that we were focusing on the green channel, whereas `text_box` was a grayscale image.  

When they are saved as `PNG` files, extra channels get added.  For simplicity, we just retrieve the green channel.


In [None]:
# Graph panel: read the saved PNG, keep only the green channel (index 1), then
# rescale so the brightest pixel becomes 255 and store the result as uint8.
graph_box = imread( results_folder / 'graph_box_clean.png' )
graph_box = graph_box[:, :, 1]
graph_box = ( graph_box * 255 / graph_box.max() ).astype( np.uint8 )
print( graph_box.shape, graph_box.max(), graph_box.min() )

# Text panel: same treatment.
text_box = imread( results_folder / 'text_box_clean.png' )
text_box = text_box[:, :, 1]
text_box = ( text_box * 255 / text_box.max() ).astype( np.uint8 )
print( text_box.shape, text_box.max(), text_box.min() )


In [None]:
# One row split into five columns: the text panel takes the first column and
# the graph panel the remaining four.  Both are drawn on a fixed 0-255 gray
# scale so their brightness is directly comparable.
fig = plt.figure( figsize = (10, 10) )
gs = fig.add_gridspec(1, 5)
ax = [ fig.add_subplot( gs[0, 0] ), fig.add_subplot( gs[0, 1:] ) ]

ax[0].imshow( text_box, cmap = 'gray', vmin = 0, vmax = 255 )
ax[1].imshow( graph_box, cmap = 'gray', vmin = 0, vmax = 255 )

plt.tight_layout()
plt.show()


# OCR

`Tesseract` is a great package for OCR. We will first use it on a sample image distributed with it.


In [None]:
# Read the sample image from the data folder and take a first look at it.
plate_for_ocr = imread( data_folder.joinpath( '1_python-ocr.jpg' ) )

plt.imshow( plate_for_ocr )
plt.show()

In [None]:
# Cast the pixel data to unsigned 8-bit integers before handing it to
# pytesseract.
plate_for_ocr = plate_for_ocr.astype( np.uint8 )

# Ask for the per-detection table as a dict of equally long lists
# ('left', 'top', 'width', 'height', 'conf', 'text', ...).
results = pytesseract.image_to_data(
    plate_for_ocr,
    output_type = pytesseract.Output.DICT,
)


In [None]:
# Draw every detection with a positive confidence on top of the sample image:
# a green bounding box plus a label with the recognised text and confidence.
fig = plt.figure( figsize = (15, 10))
ax = fig.add_subplot(111)

ax.imshow(plate_for_ocr, cmap = 'gray')
for x, y, w, h, text, raw_conf in zip( results['left'], results['top'],
                                       results['width'], results['height'],
                                       results['text'], results['conf'] ):
    # Depending on the pytesseract / Tesseract versions, the confidence can be
    # an int, a float, or a string such as '-1' or '96.5'.  int() alone raises
    # ValueError on a decimal string, so parse through float() first.
    conf = int(float(raw_conf))

    if conf > 0:
        # Box edges: two horizontal lines (top, bottom) and two vertical
        # lines (left, right).
        ax.hlines([y, y+h], x, x+w, color = 'g')
        ax.vlines([x, x+w], y, y+h, color = 'g')
        # Label placed 10 pixels above the top-left corner of the box.
        ax.text(x, y-10, f"text: {text} ({conf}%)" )

plt.show()

**So cool!**

Let us now check if it works as well for our image...

In [None]:
# Run OCR on the clean text panel and collect the detection table as a dict
# of equally long lists.
results = pytesseract.image_to_data( text_box, 
                                     output_type = pytesseract.Output.DICT )


# Confidence level <0 means that it is likely trash
#
print(results['conf'])
print(results['left'])
print(results['top'])
print(results['text'])

fig = plt.figure()
ax = fig.add_subplot(111)

ax.imshow(text_box[:,:], cmap = 'gray')
for x, y, w, h, text, raw_conf in zip( results['left'], results['top'],
                                       results['width'], results['height'],
                                       results['text'], results['conf'] ):
    # Mark the top-left corner of every reported box.
    ax.plot(x, y, 'o', markersize = 10, alpha = 0.5)

    # Depending on the pytesseract / Tesseract versions, the confidence can be
    # an int, a float, or a string such as '-1' or '96.5'.  int() alone raises
    # ValueError on a decimal string, so parse through float() first.
    conf = int(float(raw_conf))

    # Outline every box, but only label the ones with positive confidence.
    ax.hlines([y, y+h], x, x+w, color = 'g')
    ax.vlines([x, x+w], y, y+h, color = 'g')
    if conf > 0: 
        ax.text(x, y-10, f"text: {text} ({conf}%)" )

plt.show()

So, we can only get one match, and even that one is not great because the text is rotated.

Perhaps because the text does not all have the same orientation, this *confuses* the choices of the algorithm...

Perhaps if we remove that high confidence vertical text, the algorithm will be able to detect the smaller horizontal pieces of text...

**To test this idea, we first remove high-confidence text and replace it with white pixels, and then repeat the prior analysis.**

In [None]:
# Remove high-confidence text 
#
# Select the detection with the highest confidence rather than hard-coding its
# position in the results lists, which depends on what Tesseract happened to
# detect.  Confidences are parsed through float() because they may arrive as
# ints, floats, or strings such as '-1' or '96.5'.
confidences = [ int(float(c)) for c in results['conf'] ]
i = confidences.index( max(confidences) )
x = results['left'][i]
y = results['top'][i]
w = results['width'][i]
h = results['height'][i]
print(w, h)

# Work on a copy so the original panel stays available for comparison.
text_box_1 = text_box.copy()
print(text_box.max())
print(text_box_1.shape)
print(text_box_1[y:y+h, x:x+w].shape)
# Paint the box white.  Assigning the scalar broadcasts to whatever size the
# slice really has; an explicit (h, w) array would fail to broadcast if the
# box were clipped by the image border.
text_box_1[y:y+h, x:x+w] = 255

plt.imshow(text_box_1, cmap = 'gray')
plt.show()

In [None]:
# Re-analyze image
#
# Same OCR call as before, now on the copy in which the high-confidence text
# has been painted white.
results = pytesseract.image_to_data( text_box_1, 
                                     output_type = pytesseract.Output.DICT )


# Confidence level <0 means that it is likely trash
#
print(results['conf'])
print(results['left'])
print(results['top'])
print(results['text'])

fig = plt.figure()
ax = fig.add_subplot(111)

# NOTE(review): the detections come from text_box_1 but are drawn over the
# original text_box -- confirm that this is intended.
ax.imshow(text_box, cmap = 'gray')
for x, y, w, h, text, raw_conf in zip( results['left'], results['top'],
                                       results['width'], results['height'],
                                       results['text'], results['conf'] ):
    # Mark the top-left corner of every reported box.
    ax.plot(x, y, 'o', markersize = 10, alpha = 0.5)

    # Depending on the pytesseract / Tesseract versions, the confidence can be
    # an int, a float, or a string such as '-1' or '96.5'.  int() alone raises
    # ValueError on a decimal string, so parse through float() first.
    conf = int(float(raw_conf))

    # Outline every box, but only label the ones with positive confidence.
    ax.hlines([y, y+h], x, x+w, color = 'g')
    ax.vlines([x, x+w], y, y+h, color = 'g')
    if conf > 0: 
        ax.text(x, y-10, f"text: {text} ({conf}%)" )

plt.show()

## Refactoring code for extracting all text from 'noiseless' image 

You can imagine how we could go in turn, removing each highest confidence detected text one at a time...

**Time to implement this idea!**

# What if there is noise in the image?

If there is noise in the image, then `Tesseract` will not work as well.  

To learn how to address this, we will load below a version of our image with some noise added. 


## Useful plotting function

In [None]:
# Arguments passed to plot_sections() throughout the rest of the notebook.
zoom_factor = 2            # forwarded to grayscale_zoom
x_c, y_c = 300, 300        # point forwarded to grayscale_zoom (presumably the zoom centre)
x_lim, y_lim = 500, 600    # column / row limits of the un-zoomed crop

def plot_sections(my_image, zoom_factor, x_c, y_c, x_lim, y_lim):
    """Show a crop of an image next to a zoomed view of the same image.

    The figure has one row split into three columns.  The first column shows
    the top-left crop ``my_image[:y_lim, :x_lim]``; the other two columns show
    the image returned by ``grayscale_zoom`` for the point (``x_c``, ``y_c``)
    and ``zoom_factor``.  Both panels use the gray colormap.  The figure is
    displayed immediately and nothing is returned.

    Parameters
    ----------
    my_image : array
        Image to display; it is sliced with two indices, so it needs at
        least two dimensions.
    zoom_factor, x_c, y_c :
        Forwarded unchanged to ``grayscale_zoom`` (presumably the
        magnification and the zoom centre -- see that helper).
    x_lim, y_lim : int
        Exclusive column and row limits of the crop in the left panel.
    """
    fig = plt.figure( figsize = (10, 8) )
    grid = fig.add_gridspec(1, 3)

    # Left third: plain crop of the top-left corner.
    crop_ax = fig.add_subplot( grid[0, 0] )
    crop_ax.imshow( my_image[:y_lim, :x_lim], cmap = 'gray' )

    # Right two thirds: zoomed view.  grayscale_zoom returns three values;
    # only the image is needed here.
    zoom_ax = fig.add_subplot( grid[0, 1:] )
    zoomed_image, _, _ = grayscale_zoom( zoom_ax, my_image, x_c, y_c, zoom_factor )
    zoom_ax.imshow( zoomed_image, cmap = 'gray' )

    plt.tight_layout()
    plt.show()
    
    


In [None]:
# Graph panel (noisy version): read the saved PNG, keep only the green channel
# (index 1), then rescale so the brightest pixel becomes 255 and store the
# result as uint8.  This replaces the clean `graph_box` loaded earlier.
graph_box = imread( results_folder / 'graph_box_noisy.png' )
graph_box = graph_box[:, :, 1]
graph_box = ( graph_box * 255 / graph_box.max() ).astype( np.uint8 )
print( graph_box.shape, graph_box.max(), graph_box.min() )

# Text panel (noisy version): same treatment; replaces the clean `text_box`.
text_box = imread( results_folder / 'text_box_noisy.png' )
text_box = text_box[:, :, 1]
text_box = ( text_box * 255 / text_box.max() ).astype( np.uint8 )
print( text_box.shape, text_box.max(), text_box.min() )

In [None]:
# One row split into five columns: the text panel takes the first column and
# the graph panel the remaining four.  Both are drawn on a fixed 0-255 gray
# scale so their brightness is directly comparable.
fig = plt.figure( figsize = (10, 10) )
gs = fig.add_gridspec(1, 5)
ax = [ fig.add_subplot( gs[0, 0] ), fig.add_subplot( gs[0, 1:] ) ]

ax[0].imshow( text_box, cmap = 'gray', vmin = 0, vmax = 255 )
ax[1].imshow( graph_box, cmap = 'gray', vmin = 0, vmax = 255 )

plt.tight_layout()
plt.show()


## Try OCR on noisy image

In [None]:
# Run OCR on the noisy text panel and collect the detection table as a dict
# of equally long lists.
results = pytesseract.image_to_data( text_box, 
                                     output_type = pytesseract.Output.DICT )


# Confidence level <0 means that it is likely trash
#
print(results['conf'])
print(results['left'])
print(results['top'])
print(results['text'])

fig = plt.figure()
ax = fig.add_subplot(111)

ax.imshow(text_box[:,:], cmap = 'gray')
for x, y, w, h, text, raw_conf in zip( results['left'], results['top'],
                                       results['width'], results['height'],
                                       results['text'], results['conf'] ):
    # Mark the top-left corner of every reported box.
    ax.plot(x, y, 'o', markersize = 10, alpha = 0.5)

    # Depending on the pytesseract / Tesseract versions, the confidence can be
    # an int, a float, or a string such as '-1' or '96.5'.  int() alone raises
    # ValueError on a decimal string, so parse through float() first.
    conf = int(float(raw_conf))

    # Outline every box, but only label the ones with positive confidence.
    ax.hlines([y, y+h], x, x+w, color = 'g')
    ax.vlines([x, x+w], y, y+h, color = 'g')
    if conf > 0: 
        ax.text(x, y-10, f"text: {text} ({conf}%)" )

plt.show()

## Denoising


In [None]:
# Look at the noisy text panel: crop on the left, zoomed view on the right.
plot_sections( my_image = text_box,
               zoom_factor = zoom_factor,
               x_c = x_c, y_c = y_c,
               x_lim = x_lim, y_lim = y_lim )


We first binarize the image with a high threshold so we don't lose any foreground...

In [None]:
# Binarize: pixels strictly brighter than the threshold become True and all
# others False.  The threshold sits close to the 255 maximum so that only
# near-white pixels end up True.
threshold = 250
binary_mask = text_box > threshold

plot_sections(binary_mask, zoom_factor, x_c, y_c, x_lim, y_lim)

It seems that the way to go is to remove small dots (which, because they have low pixel values, are holes)...

**We can use `remove_small_holes` and check what we get.**


In [None]:
# Fill the small False regions ("holes") of the mask; 15 is the size limit
# handed to remove_small_holes.
binary_wo_small_holes = remove_small_holes( binary_mask, 15 )

plot_sections( my_image = binary_wo_small_holes,
               zoom_factor = zoom_factor,
               x_c = x_c, y_c = y_c,
               x_lim = x_lim, y_lim = y_lim )

Not bad.  Now we just need to plug those holes in the characters (which, because they have high pixel values, are objects)...

**We can use `remove_small_objects`**


In [None]:
# Remove the small True regions ("objects") of the mask; 40 is the size limit
# handed to remove_small_objects.
binary_clean = remove_small_objects( binary_wo_small_holes, 40 )

plot_sections( my_image = binary_clean,
               zoom_factor = zoom_factor,
               x_c = x_c, y_c = y_c,
               x_lim = x_lim, y_lim = y_lim )

**Not bad!!!**

We can now use `gaussian` filtering to smooth the characters.

In [None]:
# Print the documentation of skimage's `gaussian` filter to review its
# parameters before using it in the next cell.
help(gaussian)

In [None]:
# Blur the cleaned binary mask with a Gaussian filter to smooth the character
# outlines.  `sigma` is the standard deviation of the Gaussian kernel.
clean_box = gaussian( binary_clean, sigma = 1.53 )

plot_sections( clean_box, zoom_factor, x_c, y_c, x_lim, y_lim)

In [None]:
# Scale the smoothed image by 255 and cast it to unsigned 8-bit integers
# (assumes clean_box holds values in [0, 1] -- TODO confirm).
clean_text = np.multiply( clean_box, 255 ).astype( np.uint8 )
print(f"The array in clean_text is of type {clean_text.dtype}.\n")


In [None]:
# Run OCR on the denoised, smoothed text panel and collect the detection
# table as a dict of equally long lists.
results = pytesseract.image_to_data( clean_text, 
                                     output_type = pytesseract.Output.DICT )


# Confidence level <0 means that it is likely trash
#
print(results['conf'])
print(results['left'])
print(results['top'])
print(results['text'])

fig = plt.figure()
ax = fig.add_subplot(111)

ax.imshow(clean_text[:,:], cmap = 'gray')
for x, y, w, h, text, raw_conf in zip( results['left'], results['top'],
                                       results['width'], results['height'],
                                       results['text'], results['conf'] ):
    # Mark the top-left corner of every reported box.
    ax.plot(x, y, 'o', markersize = 10, alpha = 0.5)

    # Depending on the pytesseract / Tesseract versions, the confidence can be
    # an int, a float, or a string such as '-1' or '96.5'.  int() alone raises
    # ValueError on a decimal string, so parse through float() first.
    conf = int(float(raw_conf))

    # Outline every box; label those whose confidence is not negative
    # (zero-confidence detections are included here).
    ax.hlines([y, y+h], x, x+w, color = 'g')
    ax.vlines([x, x+w], y, y+h, color = 'g')
    if conf >= 0: 
        ax.text(x, y-10, f"text: {text} ({conf}%)" )

plt.show()

**Pretty cool, right?!?!?!**

# Re-factoring code for denoising text_boxes



# Next lesson

[click here](nb_06_Extract_data_from_PDFs.ipynb)