# Synopsis

Extracting information from scanned files.

# Words to remember

**warping**

**OCR**

**denoising**

**blurring**

# Read libraries

In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

from colorama import Back, Fore, Style
from copy import copy, deepcopy
from pathlib import Path
from sys import path

path.append( str(Path.cwd().parent) )

## Installing  Tesseract for OCR

To use Tesseract, you have to install it on your computer and also install the `Python` package that provides an interface to it.

You can find details of how to do it [here](https://builtin.com/data-science/python-ocr).

In [None]:
#conda install pyTesseract

In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np
import pytesseract

from matplotlib.gridspec import GridSpec
from matplotlib.patches import Circle
from pylab import imread, imshow, imsave
from scipy.stats import pearsonr
from skimage import img_as_float, img_as_ubyte
from skimage.color import rgb2gray
from skimage.filters import rank, threshold_otsu, gaussian
from skimage.measure import find_contours
from skimage.morphology import ( disk, binary_dilation, binary_erosion, 
                                 binary_closing, binary_opening, 
                                 remove_small_holes, remove_small_objects,
                                 flood_fill, )

from skimage.transform import estimate_transform, warp

# from skimage import ( transform, color, 
#                       restoration )


from Amaral_libraries.my_stats import half_frame
from Amaral_libraries.my_image_library import grayscale_zoom

In [None]:
my_fontsize = 15
data_folder = Path.cwd() / 'Data' / 'Scanned_Images'
results_folder = Path.cwd() / 'Generated_data'

# Load images

We load all images but select a single one for further analysis.


In [None]:
# Collect every entry found in the scanned-images folder.
# NOTE(review): glob() does not promise a sorted order, so the file that ends
# up at index 1 may differ between machines — confirm before relying on it.
my_images = list( data_folder.glob('*') )
print(f"There are {len(my_images)} images in the folder.")
print()

# Select a single image; the rest of the notebook works on this one only.
plate = imread(my_images[1])
print(f"Selected image has shape {plate.shape}.\n")

imshow(plate);

# Print the top-left 10x10 patch of the fourth channel (index 3) to inspect it.
print(f"That fourth channel is just ones:\n{plate[:10, :10, 3]}")

We do not need the fourth channel, so we will get rid of it.

We will also want to work with a grayscale version of the image.  The question is: 

> **Which grayscale version should we use?**

Let's look at each channel separately, alongside a grayscale conversion of the color image...

In [None]:
fig = plt.figure(figsize=(10, 8))
rgb = ['red', 'green', 'blue']
ax = []

# One panel per color channel, each labelled with the channel name.
for channel, label in enumerate(rgb):
    panel = fig.add_subplot(2, 2, channel + 1)
    panel.text(2000, -100, label, fontsize=my_fontsize)
    panel.imshow(plate[:, :, channel], cmap='gray')
    ax.append(panel)

# Fourth panel: grayscale conversion of the first three channels together.
panel = fig.add_subplot(2, 2, 4)
panel.text(2000, -100, 'gray', fontsize=my_fontsize)
panel.imshow(rgb2gray(plate[:, :, :3]), cmap='gray')
ax.append(panel)

plt.tight_layout()


Interestingly, the **blue channel** seems to be the one where the text information and the boxes with the data we want to extract are most clearly visible.

From now on, we will focus on this channel.

In [None]:
plate[:,:,2]

As before, we will transform to ubytes in order to save resources.

In [None]:
# Report the value range of the blue channel before rescaling it.
print( f"Maximum of blue channel is {plate[:,:,2].max():.3f}, "
       f" minimum is {plate[:,:,2].min():.3f}\n")

# Scale by 255, not 256: a pixel at 1.0 would otherwise become 256, which does
# not fit in uint8 and would no longer be stored as white.
# (Assumes the channel holds floats in [0, 1], as the printout above shows.)
plate_b = (255 * plate[:,:,2]).astype( np.uint8 )

fig = plt.figure( figsize = (12, 10) )
plt.imshow( plate_b, cmap = 'gray', vmin = 0, vmax = 255 );

# Correct image perspective

This involves two steps.  First, we will get the coordinates of the 4 corners of the blue screen as accurately as possible.  To this end, we will magnify the region around each corner one at a time, and adjust the center of the zoomed in region until the red dot is located precisely at the corner.

Next, we use the `transform` package to correct the perspective of the image.  To this end, we need to provide new coordinates for the corners of the blue screen.

## Specify coordinates of corners of blue screen

We will use a gray scale version of the image since the zoom in function only operates with gray scale images.  

**We will do one corner at a time and change the value of `k` when determining the coordinates of the corner.**  

In [None]:
# For image[1] corners are at: 
#    [[614, 445], [3620, 548], [3546, 2788], [547, 2715]]

points_interest = [[614, 445], [3620, 548], [3546, 2788], [547, 2715]]
#points_interest = [[], [], [], []]

print(points_interest)

In [None]:
# fig = plt.figure( figsize = (10, 6))
# ax = fig.add_subplot(111)

# zoom_factor = 8
# k = 2
# x = 3546
# y = 2788
# zoomed_image, x0, y0 = grayscale_zoom(plate_b, x, y, zoom_factor)


# ax.imshow( zoomed_image, cmap = 'gray', vmin = 0, vmax = 255 )
# ax.plot([zoom_factor*(x-x0)], [zoom_factor*(y-y0)], 'ro');

# # Update coordinates of corner k
# #
# points_interest[k] = [x, y]
# print(points_interest)

## Correct perspective

We specify the desired coordinates for the corners of the blue screen in such a way that its size and location are approximately preserved.

In order to accomplish this, we **maintain the coordinates of the first corner** and pick the **coordinates of the opposite corner using the largest values of the coordinates from the other corners**.

We then use the original and desired corner coordinates to define a matrix transformation using `transform.estimate_transform`.

Finally, we apply `transform.warp` to correct the perspective of the image.


In [None]:
print(points_interest)
# Target corners form an axis-aligned rectangle: the first corner (614, 445)
# is kept and the opposite corner uses the largest coordinates (3620, 2788).
transformed_points = [[614,445], [3620, 445], [3620, 2788], [614, 2788]]
print(transformed_points)

# Estimate the projective transform that maps the measured corners onto the
# target corners.
tform = estimate_transform( 'projective', np.array(points_interest), 
                            np.array(transformed_points) )


# warp is given tform.inverse (presumably because it expects the mapping from
# output to input coordinates — see the skimage documentation).
# The grayscale result is scaled by 255 before the uint8 cast; this assumes
# warp returns floats in [0, 1] — TODO confirm.
plate_warp = (255 * warp(plate_b, tform.inverse)).astype( np.uint8 )
# The color version (first three channels only) is left as returned by warp.
color_plate_warp = warp(plate[:,:,:3], tform.inverse)

In [None]:
fig = plt.figure(figsize = (12, 12))

ax1 = fig.add_subplot(121)
ax1.imshow(plate)

for point in points_interest:
    ax1.add_patch(Circle(point, 10, facecolor = 'r'))

ax2 = fig.add_subplot(122)
ax2.imshow( plate_warp, cmap = 'gray', vmin = 0, vmax = 255 )
for point in transformed_points:
    ax2.add_patch(Circle(point, 10, facecolor = 'r'));

In [None]:
# Crop both warped images to the target rectangle: rows 445..2788 and
# columns 614..3620, the same numbers used in transformed_points.
plate_corrected = plate_warp[445:2788, 614:3620]
color_plate_corrected = color_plate_warp[445:2788, 614:3620]

fig = plt.figure( figsize = (12, 10) )
# plt.imshow( plate_corrected, cmap = 'gray', vmin = 0, vmax = 255 );
plt.imshow( color_plate_corrected );

## Clean up

In [None]:
del plate
del color_plate_warp

In [None]:
print( color_plate_corrected.dtype, plate_corrected.dtype )

color_plate_corrected = (255 * color_plate_corrected).astype( np.uint8 )

print( color_plate_corrected.dtype, plate_corrected.dtype )

# Extract boxes with data

The data to be extracted is plotted in three graph boxes. 

Annoyingly, the graphs have a grid, which makes our job harder.  We will again use a Gaussian filter to blur away the grid so that the graph boxes become uniform, and then remove small holes and small objects.

We will then use contours to identify the 3 graph boxes. 


## Remove grid lines from graph boxes


In [None]:
# Zoom settings used for both panels (zoom factor and the point to zoom on).
zoom_factor = 2
x = 450
y = 650

fig = plt.figure(figsize = (12, 8))
ax1 = fig.add_subplot(121)

# Left panel: zoomed view of the perspective-corrected grayscale image.
zoomed_image, x0, y0 = grayscale_zoom( plate_corrected, x, y, zoom_factor )
ax1.imshow( zoomed_image, cmap = 'gray')

ax2 = fig.add_subplot(122)

# Blur with the same sigma along both axes; preserve_range = True asks the
# filter to keep the input's value range instead of rescaling it.
sigma = 10
img2 = gaussian( plate_corrected, sigma = (sigma, sigma), 
                 truncate = 3.5, preserve_range = True )

# Binarize the blurred image at its Otsu threshold.
plate_for_boxes = img2 > threshold_otsu(img2)
print(f"The array plate_for_boxes is of type {plate_for_boxes.dtype}.\n")

# Right panel: the same zoomed region of the binarized image.
zoomed_image, x0, y0 = grayscale_zoom( plate_for_boxes, x, y, zoom_factor )
ax2.imshow( zoomed_image, cmap = 'gray' );

plt.tight_layout()

# The zoomed copy is only needed for display.
del zoomed_image

<br>

And now we can take out some of the smallish holes and objects...

In [None]:
plate_for_boxes = remove_small_objects( plate_for_boxes, 2000 )
plate_for_boxes = remove_small_holes( plate_for_boxes, 2000 )


fig = plt.figure( figsize = (12, 10) )
plt.imshow(plate_for_boxes, cmap = 'gray');

Pretty cool, don't you think?

## Contours

We can now identify contours and eliminate all that are small.


In [None]:
contours = find_contours(plate_for_boxes)
print(f"The algorithm found {len(contours)} contours.\n")

# Keep only the contours made of at least 5000 points, in their original order.
contours = [outline for outline in contours if len(outline) >= 5000]

print(f"There are {len(contours)} good contours.\n" )


In [None]:
fig = plt.figure(figsize=(12, 10))
ax = fig.add_subplot(111)
ax.imshow(plate_for_boxes, cmap='gray')

# Draw every remaining contour; column 1 is plotted as x and column 0 as y.
for outline in contours:
    ax.plot(outline[:, 1], outline[:, 0], linewidth=2)

# Per-contour extremes along each coordinate give the bounding-box corners.
box_max = [np.max(outline, axis=0) for outline in contours]
box_min = [np.min(outline, axis=0) for outline in contours]

del contours

We now store the sections of the image containing the graph boxes, and the sections containing the corresponding text, in two lists.

In [None]:
graph_boxes = []
text_boxes = []
# NOTE(review): the loop assumes at least three contours survived filtering,
# and the last statement assumes index 2 is the lowest box — confirm.
for i in range(3):
    # Graph box: bounding rectangle of contour i, cut from the color image
    # (first index is the row, second the column).
    temp = color_plate_corrected[int(box_min[i][0]):int(box_max[i][0]), 
                                 int(box_min[i][1]):int(box_max[i][1]), :]
    graph_boxes.append( temp )
    
    # Text box: same rows plus 50 extra rows at the bottom, but only the
    # columns to the left of the graph box, cut from the grayscale image.
    temp = plate_corrected[int(box_min[i][0]):int(box_max[i][0])+50, 
                           :int(box_min[i][1])]
    text_boxes.append( temp )
    
# Fourth text box: full-width strip starting 50 rows below the bottom of box 2.
text_boxes.append( plate_corrected[int(box_max[2][0])+50:, :] )

print(f"The array in graph_boxes is of type {graph_boxes[0].dtype}.\n")
print(f"The array in text_boxes is of type {text_boxes[0].dtype}.\n")

In [None]:
del plate_for_boxes
del plate_corrected
del color_plate_corrected

In [None]:
fig = plt.figure( figsize = (10, 10) )
gs = fig.add_gridspec(4, 5)
ax = []

for i in range(3):
    ax.append( fig.add_subplot( gs[i, 0] ) )
    ax[-1].imshow( text_boxes[i], cmap = 'gray', vmin = 0, vmax = 255 )
    
    ax.append( fig.add_subplot( gs[i, 1:] ) )
    ax[-1].imshow( graph_boxes[i] )
    

ax.append( fig.add_subplot( gs[3, :] ) )
ax[-1].imshow( text_boxes[3], cmap = 'gray', vmin = 0, vmax = 255 )

plt.tight_layout()


## Save graph boxes

In [None]:
# Make sure the output folder exists first; saving into a missing folder fails.
results_folder.mkdir(parents=True, exist_ok=True)

# Write each graph box to its own numbered PNG file.
for i, graph_box in enumerate(graph_boxes):
    imsave(results_folder / f"graph_boxes_{i}.png", graph_box)
    

**Since the morphology functions we introduced earlier assume a bright foreground and a dark background, we will invert our text boxes images too**.

In [None]:
print(text_boxes[0][:10,:10])

# Invert every text box (255 - value) so that the text becomes the bright part.
new_boxes = [(255 - box).astype(np.uint8) for box in text_boxes]

text_boxes = new_boxes

In [None]:
fig = plt.figure( figsize = (12, 7) )
gs = fig.add_gridspec(2, 3)
ax = []

for i in range(3):
    ax.append( fig.add_subplot( gs[0, i] ) )
    ax[-1].imshow( text_boxes[i], cmap = 'gray' )
    

ax.append( fig.add_subplot( gs[1, :] ) )
ax[-1].imshow( text_boxes[3], cmap = 'gray' )

plt.tight_layout()

Clearly, the first box is the least noisy. We will focus on it first.

# Clean sections of image with text 

We will use optical character recognition (OCR) to extract the text in the images.  `Tesseract` is a package that can be used for this purpose.  However, for it to work appropriately, the input image must have noise minimized.

## Useful plotting function

In [None]:
zoom_factor = 2
x_c = 300
y_c = 300

y_lim = 600
x_lim = 500

def plot_sections(my_image, zoom_factor, x_c, y_c, x_lim, y_lim):
    """
    Show a corner crop of an image next to a zoomed view of it.

    The left panel (one third of the figure) shows the rows up to `y_lim`
    and the columns up to `x_lim` of `my_image`.  The right panel (two
    thirds) shows the image returned by
    `grayscale_zoom(my_image, x_c, y_c, zoom_factor)`.

    Nothing is returned; the figure is drawn as a side effect.
    """
    fig = plt.figure(figsize=(10, 8))
    grid = fig.add_gridspec(1, 3)

    # Left: top-left crop of the image.
    crop_ax = fig.add_subplot(grid[0, 0])
    crop_ax.imshow(my_image[:y_lim, :x_lim], cmap='gray')

    # Right: zoomed view; the two extra return values are not needed here.
    zoom_ax = fig.add_subplot(grid[0, 1:])
    zoomed_image, _, _ = grayscale_zoom(my_image, x_c, y_c, zoom_factor)
    zoom_ax.imshow(zoomed_image, cmap='gray')

    plt.tight_layout()
    


## Denoising

We start by binarizing the image.

In [None]:
first_box = text_boxes[0]
h, w = first_box.shape

# Compute the Otsu threshold once, report it, then binarize with it.
otsu_level = threshold_otsu(first_box)
print(otsu_level)
binary_mask = first_box > otsu_level

plot_sections(binary_mask, zoom_factor, x_c, y_c, x_lim, y_lim)

Clearly, there are thin white columns breaking the background at regular intervals. Maybe we can identify them and get rid of them systematically.

Let's look at the average intensity by column.

In [None]:
column_average = binary_mask.mean( axis = 0 )

plt.plot(100*column_average);

Beautiful!!! We can see that wherever the average is greater than 80%, this is a spurious white column.

We can set all of them to zero! 

In [None]:
# Blank every column whose mean foreground fraction exceeds the cutoff.
# NOTE(review): the text above mentions 80%, but the cutoff used here is
# 0.4 (40%) — confirm which one is intended.
binary_mask[:, column_average > 0.4] = 0

plot_sections(binary_mask, zoom_factor, x_c, y_c, x_lim, y_lim)

Next, we use `remove_small_objects` to get rid of some left over noise.


In [None]:
binary_clean_objects = remove_small_objects( binary_mask, 50 )

plot_sections(binary_clean_objects, zoom_factor, x_c, y_c, x_lim, y_lim)

Looking good!

We can now use `binary_closing` to connect the segments in the text.

In [None]:
vicinity = disk(4)
clean_binary = binary_closing( binary_clean_objects, vicinity )

plot_sections( clean_binary, zoom_factor, x_c, y_c, x_lim, y_lim)

# Re-factoring code for cleaning text_boxes

We saw that the first step is binarizing, followed by removal of lines, removal of small objects, and then binary_closing. The final step is conversion to white background and black foreground.

In [None]:
def clean_for_ocr( my_image, column_threshold, object_threshold, radius):
    """
    Binarize and denoise a grayscale text image so it can be passed to OCR.

    The steps are: Otsu thresholding, blanking of columns that are mostly
    foreground, removal of small objects, and binary closing.  A new figure
    with three stacked panels shows the column averages and the two
    intermediate masks.

    Parameters
    ----------
    my_image : 2D array
        Grayscale image in which the text is brighter than the background.
    column_threshold : float
        Columns whose fraction of foreground pixels exceeds this value are
        set to background.
    object_threshold : int
        Size argument passed to `remove_small_objects`.
    radius : int
        Radius of the disk used as footprint for `binary_closing`.

    Returns
    -------
    2D uint8 array
        Image with 0 (black) where the cleaned mask is foreground and
        255 (white) everywhere else.
    """
    # Unpacking into two names also rejects inputs that are not 2D.
    _, width = my_image.shape

    fig = plt.figure( figsize = (10, 8) )

    # Binarize
    #
    # Report the threshold of the image being cleaned (not of some other
    # global image) and reuse it for the mask.
    otsu_level = threshold_otsu( my_image )
    print( f"The threshold recommended from Otsu algorithm is "
           f"{otsu_level}.\n" )
    binary_mask = my_image > otsu_level

    # Remove lines
    #
    ax_columns = fig.add_subplot(311)
    column_average = binary_mask.mean( axis = 0 )
    ax_columns.plot( column_average )
    ax_columns.hlines( [0.2, 0.4, 0.6, 0.8], 0, width, color = '0.2' )
    binary_mask[:, column_average > column_threshold] = 0

    # Remove small objects
    #
    ax_objects = fig.add_subplot(312)
    binary_clean_objects = remove_small_objects( binary_mask,
                                                 object_threshold )
    ax_objects.imshow( binary_clean_objects, cmap = 'gray' )

    # Use binary closing to connect font elements
    #
    ax_closed = fig.add_subplot(313)
    clean_binary = binary_closing( binary_clean_objects, disk(radius) )
    ax_closed.imshow( clean_binary, cmap = 'gray' )

    # Convert to black text on a white background.  `255 - clean_binary`
    # would treat True as 1 and produce 254/255, i.e. almost no contrast.
    plate_for_ocr = np.where( clean_binary, 0, 255 ).astype( np.uint8 )

    return plate_for_ocr


In [None]:
plate_for_ocr = clean_for_ocr( text_boxes[2], 0.6, 40, 4 )


In [None]:
fig = plt.figure( figsize = (10, 8) )
plt.imshow(plate_for_ocr, cmap = 'gray');

# OCR

Using `Tesseract` we can see that the string extraction is actually pretty terrible because the font does not seem to be recognized well even though the image is quite noiseless.


In [None]:
# NOTE(review): this rebuilds the OCR input from the global `clean_binary`
# (the cleaned first text box), replacing the value returned by
# clean_for_ocr above — confirm that this is the intended image.
#
# Map the boolean mask to black text (0) on white (255).  `255 - clean_binary`
# would treat True as 1 and give 254/255, which is nearly uniform white.
plate_for_ocr = np.where( clean_binary, 0, 255 ).astype( np.uint8 )

# Run Tesseract and collect the per-box results as a dict of lists.
results = pytesseract.image_to_data( plate_for_ocr, 
                                     output_type = pytesseract.Output.DICT )


In [None]:
fig = plt.figure( figsize = (15, 10))
ax = fig.add_subplot(111)

ax.imshow(plate_for_ocr, cmap = 'gray')

# Walk the parallel result lists together, one detected box at a time.
detections = zip( results['left'], results['top'], results['width'],
                  results['height'], results['text'], results['conf'] )

for x, y, w, h, text, conf in detections:
    # float() accepts ints as well as numeric strings such as '95.3' or '-1',
    # whereas int() would raise on a fractional string.
    if float(conf) > 0:
        # Outline the box and label it with the recognized string itself
        # (previously the literal word "text" was drawn).
        ax.hlines([y, y+h], x, x+w, color = 'g')
        ax.vlines([x, x+w], y, y+h, color = 'g')
        ax.text(x, y-10, text )
    