# Synopsis

Extracting information from scanned files.

# Words to remember

**warping**

**OCR**

**denoising**

**blurring**

# Read libraries

In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

from colorama import Back, Fore, Style
from copy import copy, deepcopy
from pathlib import Path
from sys import path

path.append( str(Path.cwd().parent) )

In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np

import pytesseract

from matplotlib.gridspec import GridSpec
from matplotlib.patches import Circle
from pylab import imread, imshow, imsave
from scipy.stats import pearsonr
from skimage import img_as_float, img_as_ubyte
from skimage.color import rgb2gray
from skimage.filters import rank, threshold_otsu, gaussian
from skimage.measure import find_contours
from skimage.morphology import ( disk, binary_dilation, binary_erosion, 
                                 binary_closing, binary_opening, 
                                 remove_small_holes, remove_small_objects,
                                 flood_fill, )
from skimage.util import random_noise

from skimage.transform import estimate_transform, warp

from Amaral_libraries.my_stats import half_frame
from Amaral_libraries.my_image_library import grayscale_zoom

In [None]:
# Notebook-wide settings: the font size used for plot labels, and the input /
# output folders, both resolved relative to the current working directory.
my_fontsize = 15
data_folder = Path.cwd().joinpath('Data', 'Scanned_Images')
results_folder = Path.cwd().joinpath('Generated_data')

# Load images

We load all images but select a single one for further analysis.


In [None]:
# Collect the PNG scans in a deterministic order: `Path.glob` yields entries in
# arbitrary, filesystem-dependent order, and later cells pick images by their
# position in this list (`my_images[i]`).
my_images = sorted( data_folder.glob('*.png') )
print(f"There are {len(my_images)} images in the folder.\n\n")

# Print only the file names.  Slicing a fixed number of characters off the
# absolute path is only correct for one particular directory location.
for image_path in my_images:
    print(image_path.name)


In [None]:
# Select one scan by its position in `my_images` and load it as an array.
i = 1
plate = imread(my_images[i])
print(f"Image '{i}' has shape {plate.shape}.\n")

# Display the image; the trailing semicolon keeps the notebook from echoing
# the object returned by `imshow`.
imshow(plate);

# Print the top-left 10x10 patch of channel index 3 for inspection.
print(f"That fourth channel is just ones:\n{plate[:10, :10, 3]}")

We do not need the fourth channel, so we will get rid of it.

We will also want to work with a grayscale version of the image.  The question is: 

> **Which grayscale version should we use?**

Let's look at each channel separately, alongside a grayscale conversion of the color image...

In [None]:
# 2x2 grid: one grayscale panel per colour channel, plus a fourth panel with
# the `rgb2gray` conversion of the first three channels.
fig = plt.figure(figsize=(10, 5))
rgb = ['red', 'green', 'blue']
ax = []

for i, channel_name in enumerate(rgb):
    panel = fig.add_subplot(2, 2, i + 1)
    # Label each panel with the channel name, drawn in that colour.
    panel.text(600, -20, channel_name, color=channel_name,
               fontsize=my_fontsize)
    panel.imshow(plate[:, :, i], cmap='gray')
    ax.append(panel)

gray_panel = fig.add_subplot(2, 2, 4)
gray_panel.text(600, -20, 'gray', fontsize=my_fontsize)
gray_panel.imshow(rgb2gray(plate[:, :, :3]), cmap='gray')
ax.append(gray_panel)

plt.tight_layout()


Not surprisingly, as the bars are green, the **green channel** seems to be the one where the text information and the boxes with the data we want to extract are most clearly visible.

From now on, we will focus on this channel.

In [None]:
# Show channel index 1 (the green channel) in grayscale, then echo its raw
# values as the cell output.
imshow(plate[:,:,1], cmap = 'gray' );
plate[:,:,1]

As before, we will transform to ubytes in order to save resources.

In [None]:
print( f"Maximum of green channel is {plate[:,:,1].max():.3f}, "
       f" minimum is {plate[:,:,1].min():.3f}\n")

# Will call it plate_b for best
#
# Round before casting: `astype(np.uint8)` truncates toward zero, so a float
# product that lands a hair below an integer (e.g. 254.99998) would otherwise
# lose a whole gray level.
plate_b = np.rint(255 * plate[:,:,1]).astype( np.uint8 )

print( f"Maximum of green channel is {plate_b.max()}, "
       f" minimum is {plate_b.min()}\n")

# Display the 8-bit green channel.
fig = plt.figure( figsize = (12, 10) )
plt.imshow( plate_b, cmap = 'gray' );

# Correct image perspective

This involves two steps.  First, we will get the coordinates of the 4 corners of the blue screen as accurately as possible.  To this end, we will magnify the region around each corner one at a time, and adjust the center of the zoomed in region until the red dot is located precisely at the corner.

Next, we use the `transform` package to correct the perspective of the image.  To this end, we need to provide new coordinates for the corners of the blue screen.

## Specify coordinates of corners of blue screen

We will use a gray scale version of the image since the zoom in function only operates with gray scale images.  
 

In [None]:
# For distorted image, I know that corners are at: 
#    [[0,0], [900, 30], [950, 460], [40, 400]]

# Corner coordinates of the region to rectify in the distorted image.
# NOTE(review): presumably [x, y] pixel pairs, since they are later used as
# `Circle` centres and as source points for `estimate_transform` -- confirm.
points_interest = [[0,0], [900, 30], [950, 460], [40, 400]]

#If we did not know, then we would start with empty list
#points_interest = [[], [], [], []]

print(points_interest)

**If you do not know the location of the points of interest (corners)**, you can uncomment the code in the next cell and run it repeatedly until you have all four sets of coordinates.

**Change the value of `k` when determining the coordinates of the points of interest with index `k`.** 

In [None]:
# fig = plt.figure( figsize = (10, 6))
# ax = fig.add_subplot(111)

# zoom_factor = 8
# k = 2
# x = 3546
# y = 2788
# zoomed_image, x0, y0 = grayscale_zoom(plate_b, x, y, zoom_factor)


# ax.imshow( zoomed_image, cmap = 'gray', vmin = 0, vmax = 255 )
# ax.plot([zoom_factor*(x-x0)], [zoom_factor*(y-y0)], 'ro');

# # Update coordinates of corner k
# #
# points_interest[k] = [x, y]
# print(points_interest)

## Correct perspective

We specify the desired coordinates for the corners of the blue screen in such a way that its size and location are approximately preserved.

In order to accomplish this, we **maintain the coordinates of the first corner** and pick the **coordinates of the opposite corner using the largest values of the coordinates from the other corners**.

We then use the original and desired corner coordinates to define a matrix transformation using `transform.estimate_transform`.

Finally, we apply `transform.warp` to correct the perspective of the image. 


In [None]:
# Source corners (measured on the distorted image) and the corners we want
# them mapped to after the correction.
print(points_interest)
transformed_points = [[0,0], [1000, 0], [1000, 450], [0, 450]]
print(transformed_points)

# Estimate the projective transform taking the source corners to the targets.
source_corners = np.array(points_interest)
target_corners = np.array(transformed_points)
tform = estimate_transform('projective', source_corners, target_corners)

# Warp both the 8-bit green channel and the colour image with the inverse
# transform.  The grayscale result is scaled by 255 and stored as uint8; the
# colour result is kept as returned by `warp`.
plate_warp = 255 * warp(plate_b, tform.inverse)
plate_warp = plate_warp.astype(np.uint8)
color_plate_warp = warp(plate[:, :, :3], tform.inverse)

In [None]:
fig = plt.figure(figsize=(12, 12))

# Left panel: the original image with the measured corners marked in red.
ax1 = fig.add_subplot(121)
ax1.imshow(plate)
for point in points_interest:
    marker = Circle(point, 10, facecolor='r')
    ax1.add_patch(marker)

# Right panel: the warped grayscale image with the target corners marked.
ax2 = fig.add_subplot(122)
ax2.imshow(plate_warp, cmap='gray', vmin=0, vmax=255)
for point in transformed_points:
    marker = Circle(point, 10, facecolor='r')
    ax2.add_patch(marker)

**Looking good!!!**

In [None]:
# Give the warped images their final names.  These are plain assignments, so
# each new name refers to the same array as the corresponding `*_warp` name.
plate_corrected = plate_warp
color_plate_corrected = color_plate_warp

# Show the corrected colour image (the grayscale alternative is kept below,
# commented out).
fig = plt.figure( figsize = (12, 10) )
# plt.imshow( plate_corrected, cmap = 'gray', vmin = 0, vmax = 255 );
plt.imshow( color_plate_corrected );

## Clean up

In [None]:
# Drop names that are no longer needed.  `color_plate_corrected` was assigned
# from `color_plate_warp`, so that array remains reachable under the new name.
del plate
del color_plate_warp

In [None]:
# Report the dtypes before and after converting the colour image to uint8,
# so both corrected images end up with the same dtype.
print( color_plate_corrected.dtype, plate_corrected.dtype )

# Scale by 255 and cast; `astype(np.uint8)` truncates the fractional part.
color_plate_corrected = (255 * color_plate_corrected).astype( np.uint8 )

print( color_plate_corrected.dtype, plate_corrected.dtype )

# Extract boxes with data

The figure we are processing has one graph box. 

The graph has a grid, but we will use a Gaussian filter to remove it so that we can isolate the graph box and then identify its contour.


## Remove grid lines from graph boxes


In [None]:
# Zoom settings passed to `grayscale_zoom` for both panels below.
zoom_factor = 2
x = 450
y = 650

fig = plt.figure(figsize = (12, 8))
ax1 = fig.add_subplot(121)

# Left panel: zoomed view of the corrected grayscale plate.
zoomed_image, x0, y0 = grayscale_zoom( plate_corrected, x, y, zoom_factor )
ax1.imshow( zoomed_image, cmap = 'gray')

ax2 = fig.add_subplot(122)

# Smooth the plate with a Gaussian filter (same sigma on both axes);
# `preserve_range = True` keeps the output on the input's value scale.
sigma = 3
img2 = gaussian( plate_corrected, sigma = (sigma, sigma), 
                 truncate = 3.5, preserve_range = True )

# Binarize the smoothed image at its Otsu threshold; the result is a boolean
# array that is True where the smoothed value exceeds the threshold.
plate_for_boxes = img2 > threshold_otsu(img2)
print(f"The array plate_for_boxes is of type {plate_for_boxes.dtype}.\n")

# Right panel: the same zoomed region of the binarized image.
zoomed_image, x0, y0 = grayscale_zoom( plate_for_boxes, x, y, zoom_factor )
ax2.imshow( zoomed_image, cmap = 'gray' );

plt.tight_layout()

del zoomed_image

In [None]:
# Show the full binarized image.
fig = plt.figure( figsize = (12, 10) )
plt.imshow(plate_for_boxes, cmap = 'gray');

Pretty cool, don't you think?

## Contours

We can now identify contours and eliminate all that are small.


In [None]:
# Trace every contour in the binarized image.
contours = find_contours(plate_for_boxes)
print(f"The algorithm found {len(contours)} contours.\n")

# Keep only the long contours: anything with fewer than 3000 points is
# discarded as too small to be a box outline.
contours = [outline for outline in contours if len(outline) >= 3000]

print(f"There are {len(contours)} good contours.\n" )


In [None]:
fig = plt.figure(figsize=(12, 10))
ax = fig.add_subplot(111)

ax.imshow(plate_for_boxes, cmap='gray')

# Overlay each retained contour.  Column 1 is plotted on the horizontal axis
# and column 0 on the vertical axis.
for contour in contours:
    ax.plot(contour[:, 1], contour[:, 0], linewidth=2)

# Find coordinates of corners of boxes: per-contour maximum and minimum of
# each coordinate column.
box_max = [contour.max(axis=0) for contour in contours]
box_min = [contour.min(axis=0) for contour in contours]

del contours

We now store the sections of the image with the graph box and with the corresponding text.

In [None]:

# Integer bounds of the first box: (first-axis, second-axis) minima and maxima.
row_start, col_start = (int(value) for value in box_min[0])
row_stop, col_stop = (int(value) for value in box_max[0])

# The graph box is the colour region inside those bounds.
graph_box = color_plate_corrected[row_start:row_stop, col_start:col_stop, :]

# The text box is everything before the box's second-axis start, over the
# same first-axis range extended by 50 pixels at the end.
text_box = plate_corrected[row_start:row_stop + 50, :col_start]

print(f"The array in graph_box is of type {graph_box.dtype}.\n")
print(f"The array in text_box is of type {text_box.dtype}.\n")

In [None]:
# Drop the names of the full-size working arrays.
# NOTE(review): `graph_box` and `text_box` were produced by slicing
# `color_plate_corrected` / `plate_corrected`; basic NumPy slices are views,
# so the underlying pixel buffers likely stay alive after these `del`s.
del plate_for_boxes
del plate_corrected
del color_plate_corrected

In [None]:
# One row split into five columns: the text box takes the first column and
# the graph box the remaining four.
fig = plt.figure(figsize=(10, 10))
gs = fig.add_gridspec(1, 5)

text_panel = fig.add_subplot(gs[0, 0])
text_panel.imshow(text_box, cmap='gray', vmin=0, vmax=255)

graph_panel = fig.add_subplot(gs[0, 1:])
graph_panel.imshow(graph_box)

ax = [text_panel, graph_panel]

plt.tight_layout()


## Save graph box

In [None]:
# Make sure the output folder exists before writing; saving into a missing
# directory would otherwise fail.
results_folder.mkdir( parents = True, exist_ok = True )
imsave( results_folder / 'graph_box.png', graph_box )
    

# OCR

`Tesseract` is  a great package for OCR. We will first use it on a sample image distributed with it.


In [None]:
# Load the sample OCR image from the data folder and display it.
plate_for_ocr = imread( data_folder / '1_python-ocr.jpg')

imshow(plate_for_ocr)

In [None]:
# Make sure the image is 8-bit before handing it to Tesseract.
plate_for_ocr = plate_for_ocr.astype( np.uint8 )

# Run OCR and get the per-entry results as a dict of parallel lists; the
# cells below read the 'left', 'top', 'width', 'height', 'text' and 'conf'
# entries.
results = pytesseract.image_to_data( plate_for_ocr, 
                                     output_type = pytesseract.Output.DICT )


In [None]:
fig = plt.figure( figsize = (15, 10))
ax = fig.add_subplot(111)

ax.imshow(plate_for_ocr, cmap = 'gray')

# Draw a green rectangle and a label for every OCR entry with positive
# confidence.
for i, text in enumerate(results['text']):
    x, y = results['left'][i], results['top'][i]
    w, h = results['width'][i], results['height'][i]

    # Go through float first: depending on the Tesseract / pytesseract
    # versions, the confidence may arrive as a float or as a string such as
    # '96.5', which a bare int() rejects.
    conf = int(float(results['conf'][i]))

    if conf > 0:
        ax.hlines([y, y+h], x, x+w, color = 'g')
        ax.vlines([x, x+w], y, y+h, color = 'g')
        ax.text(x, y-10, f"text: {text} ({conf}%)" )
    

**So cool!**

Let us now check if it works as well for our image...

In [None]:
# Run OCR on the text box cropped from the corrected plate.
results = pytesseract.image_to_data( text_box,
                                     output_type = pytesseract.Output.DICT )


# Confidence level <0 means that it is likely trash
#
print(results['conf'])
print(results['left'])
print(results['top'])
print(results['text'])

fig = plt.figure()
ax = fig.add_subplot(111)

ax.imshow(text_box[:,:], cmap = 'gray')

# Mark every OCR entry with a dot at its top-left corner and a green
# rectangle; only entries with positive confidence get a text label.
for i, text in enumerate(results['text']):
    x, y = results['left'][i], results['top'][i]

    ax.plot(x, y, 'o', markersize = 10, alpha = 0.5)

    w, h = results['width'][i], results['height'][i]

    # Go through float first: depending on the Tesseract / pytesseract
    # versions, the confidence may arrive as a float or as a string such as
    # '96.5', which a bare int() rejects.
    conf = int(float(results['conf'][i]))

    ax.hlines([y, y+h], x, x+w, color = 'g')
    ax.vlines([x, x+w], y, y+h, color = 'g')
    if conf > 0:
        ax.text(x, y-10, f"text: {text} ({conf}%)" )
    

So, we can only get one match, and even that one is not great because of being rotated.

Let's see whether replacing the high-confidence text with white pixels and repeating the analysis changes anything for the rest of the text...

In [None]:
# Bounding box of the OCR entry to blank out.
# NOTE(review): the index 4 is hard-coded; presumably it is the
# high-confidence match from the previous OCR run -- confirm if the input
# image changes.
i = 4
x = results['left'][i]
y = results['top'][i]
w = results['width'][i]
h = results['height'][i]
print(w, h)

# Work on a copy so the original text box is left untouched.
text_box_1 = copy(text_box)
print(text_box.max())
print(text_box_1.shape)
print(text_box_1[y:y+h, x:x+w].shape)

# Paint the region white.  A scalar fill needs no temporary (h, w) float
# array and still works if the slice is clipped by the image border.
text_box_1[y:y+h, x:x+w] = 255

imshow(text_box_1, cmap = 'gray')

In [None]:
# Repeat the OCR on the copy in which the earlier match was painted white.
results = pytesseract.image_to_data( text_box_1,
                                     output_type = pytesseract.Output.DICT )


# Confidence level <0 means that it is likely trash
#
print(results['conf'])
print(results['left'])
print(results['top'])
print(results['text'])

fig = plt.figure()
ax = fig.add_subplot(111)

# NOTE(review): the overlay is drawn on the unmasked `text_box` although OCR
# ran on `text_box_1` -- confirm this is intended.
ax.imshow(text_box, cmap = 'gray')

# Mark every OCR entry with a dot at its top-left corner and a green
# rectangle; only entries with positive confidence get a text label.
for i, text in enumerate(results['text']):
    x, y = results['left'][i], results['top'][i]

    ax.plot(x, y, 'o', markersize = 10, alpha = 0.5)

    w, h = results['width'][i], results['height'][i]

    # Go through float first: depending on the Tesseract / pytesseract
    # versions, the confidence may arrive as a float or as a string such as
    # '96.5', which a bare int() rejects.
    conf = int(float(results['conf'][i]))

    ax.hlines([y, y+h], x, x+w, color = 'g')
    ax.vlines([x, x+w], y, y+h, color = 'g')
    if conf > 0:
        ax.text(x, y-10, f"text: {text} ({conf}%)" )
    

## Refactoring code for extracting all text from 'noiseless' image 

# What if there is noise in the image?

If there is noise in the image, then `Tesseract` will not work as well.  

To learn how to address this, we will load below a version of our image with some noise added. 


## Useful plotting function

In [None]:
# Default arguments used by the `plot_sections` calls in the cells below.
# zoom_factor, x_c and y_c are forwarded to `grayscale_zoom`.
zoom_factor = 2
x_c = 300
y_c = 300

# Upper bounds of the crop `my_image[:y_lim, :x_lim]` shown by `plot_sections`.
y_lim = 600
x_lim = 500

def plot_sections(my_image, zoom_factor, x_c, y_c, x_lim, y_lim):
    """Show a crop of an image next to a zoomed view of it.

    The figure has one row split into three columns.  The first column shows
    ``my_image[:y_lim, :x_lim]``; the other two columns show the first value
    returned by ``grayscale_zoom(my_image, x_c, y_c, zoom_factor)``.  Both
    panels use the gray colormap.  Nothing is returned.

    Parameters
    ----------
    my_image : 2-D array
        Image to display.
    zoom_factor, x_c, y_c
        Forwarded unchanged to ``grayscale_zoom`` (presumably the
        magnification and the centre of the zoomed region -- see that
        helper for the exact meaning).
    x_lim, y_lim : int
        Exclusive upper bounds of the crop along the second and first axes.
    """
    figure = plt.figure(figsize=(10, 8))
    grid = figure.add_gridspec(1, 3)

    # Narrow panel: the cropped corner of the image.
    crop_panel = figure.add_subplot(grid[0, 0])
    crop_panel.imshow(my_image[:y_lim, :x_lim], cmap='gray')

    # Wide panel: the zoomed view; the two extra return values are not needed.
    zoom_panel = figure.add_subplot(grid[0, 1:])
    zoomed_image, _, _ = grayscale_zoom(my_image, x_c, y_c, zoom_factor)
    zoom_panel.imshow(zoomed_image, cmap='gray')

    plt.tight_layout()
    


In [None]:
# Load a different scan (position 2 in `my_images`) and rebind `plate` to it.
i = 2
plate = imread(my_images[i])
print(f"Image '{i}' has shape {plate.shape}.\n")

# Display the image; the trailing semicolon suppresses the cell echo.
imshow(plate);

We will quickly repeat all steps up to getting a text box.

In [None]:
# Green channel as 8-bit.  Round before casting: `astype(np.uint8)` truncates
# toward zero, so a float product that lands a hair below an integer would
# otherwise lose a whole gray level.
plate_b = np.rint(255 * plate[:,:,1]).astype( np.uint8 )

# Smooth with a Gaussian filter, then binarize at the Otsu threshold.
sigma = 3
img2 = gaussian( plate_b, sigma = (sigma, sigma),
                 truncate = 3.5, preserve_range = True )

plate_for_boxes = img2 > threshold_otsu(img2)
contours = find_contours(plate_for_boxes)
print(f"The algorithm found {len(contours)} contours.\n")

# Keep only contours with at least 3000 points.
contours = [contour for contour in contours if len(contour) >= 3000]

print(f"There are {len(contours)} good contours.\n" )

# Find coordinates of corners of boxes
#
box_max = [np.max(contour, axis = 0) for contour in contours]
box_min = [np.min(contour, axis = 0) for contour in contours]

del img2
del contours
del plate_for_boxes
# del plate_corrected
# del color_plate_corrected

# Text box: everything before the first box's second-axis start, over the
# box's first-axis range extended by 50 pixels at the end.
text_box = plate_b[int(box_min[0][0]):int(box_max[0][0])+50,
                   :int(box_min[0][1])]

print(f"The array in text_box is of type {text_box.dtype}.\n")

imshow( text_box, cmap = 'gray');

## Try OCR on noisy image

In [None]:
# Run OCR on the text box taken from the noisy image.
results = pytesseract.image_to_data( text_box,
                                     output_type = pytesseract.Output.DICT )


# Confidence level <0 means that it is likely trash
#
print(results['conf'])
print(results['left'])
print(results['top'])
print(results['text'])

fig = plt.figure()
ax = fig.add_subplot(111)

ax.imshow(text_box[:,:], cmap = 'gray')

# Mark every OCR entry with a dot at its top-left corner and a green
# rectangle; only entries with positive confidence get a text label.
for i, text in enumerate(results['text']):
    x, y = results['left'][i], results['top'][i]

    ax.plot(x, y, 'o', markersize = 10, alpha = 0.5)

    w, h = results['width'][i], results['height'][i]

    # Go through float first: depending on the Tesseract / pytesseract
    # versions, the confidence may arrive as a float or as a string such as
    # '96.5', which a bare int() rejects.
    conf = int(float(results['conf'][i]))

    ax.hlines([y, y+h], x, x+w, color = 'g')
    ax.vlines([x, x+w], y, y+h, color = 'g')
    if conf > 0:
        ax.text(x, y-10, f"text: {text} ({conf}%)" )
    

## Denoising


In [None]:
# Inspect the noisy text box: cropped corner plus zoomed view.
plot_sections(text_box, zoom_factor, x_c, y_c, x_lim, y_lim)

We first binarize the image with a high threshold so we don't lose any foreground...

In [None]:
# Binarize with a high cut-off: the mask is True only where the pixel value
# exceeds the threshold, and False everywhere else.
threshold = 250
binary_mask = text_box > threshold

plot_sections(binary_mask, zoom_factor, x_c, y_c, x_lim, y_lim)

It seems that the way to go is to remove small dots (which because they have low pixel values are holes)...

**We can use `remove_small_holes` and check what we get.**


In [None]:
# Fill in small holes in the mask; 15 is the size threshold passed to
# `remove_small_holes`.
binary_wo_small_holes = remove_small_holes( binary_mask, 15 )

plot_sections(binary_wo_small_holes, zoom_factor, x_c, y_c, x_lim, y_lim)

Not bad.  Now we just need to plug those holes in the characters (which, because they have high pixel values are objects)...

**We can use `remove_small_objects`**


In [None]:
# Drop small True regions from the mask; 40 is the size threshold passed to
# `remove_small_objects`.
binary_clean = remove_small_objects( binary_wo_small_holes, 40 )

plot_sections(binary_clean, zoom_factor, x_c, y_c, x_lim, y_lim)

**Not bad!!!**

We can now use `gaussian` filtering to smooth the characters.

In [None]:
# Print the documentation of `gaussian` to look up its parameters.
help(gaussian)

In [None]:
# Smooth the cleaned boolean mask with a Gaussian filter; 1.53 is passed as
# the filter's second positional argument (sigma).
clean_box = gaussian( binary_clean, 1.53 )

plot_sections( clean_box, zoom_factor, x_c, y_c, x_lim, y_lim)

In [None]:
# Scale the smoothed image by 255 and cast to 8-bit for OCR; `astype`
# truncates the fractional part.
clean_text = (255 * clean_box).astype( np.uint8 )
print(f"The array in clean_text is of type {clean_text.dtype}.\n")


In [None]:
# Run OCR on the denoised, smoothed text image.
results = pytesseract.image_to_data( clean_text,
                                     output_type = pytesseract.Output.DICT )


# Confidence level <0 means that it is likely trash
#
print(results['conf'])
print(results['left'])
print(results['top'])
print(results['text'])

fig = plt.figure()
ax = fig.add_subplot(111)

ax.imshow(clean_text[:,:], cmap = 'gray')

# Mark every OCR entry with a dot at its top-left corner and a green
# rectangle; entries with non-negative confidence also get a text label.
for i, text in enumerate(results['text']):
    x, y = results['left'][i], results['top'][i]

    ax.plot(x, y, 'o', markersize = 10, alpha = 0.5)

    w, h = results['width'][i], results['height'][i]

    # Go through float first: depending on the Tesseract / pytesseract
    # versions, the confidence may arrive as a float or as a string such as
    # '96.5', which a bare int() rejects.
    conf = int(float(results['conf'][i]))

    ax.hlines([y, y+h], x, x+w, color = 'g')
    ax.vlines([x, x+w], y, y+h, color = 'g')
    if conf >= 0:
        ax.text(x, y-10, f"text: {text} ({conf}%)" )
    

# Re-factoring code for denoising text_boxes

