# Synopsis

Extracting information from scanned files.

# Words to remember

**warping**

**OCR**

**denoising**

**blurring**

# Read libraries

In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

from colorama import Back, Fore, Style
from copy import copy, deepcopy
from pathlib import Path


In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np
import pytesseract

from matplotlib.gridspec import GridSpec
from matplotlib.patches import Circle
from pylab import imread, imshow
from scipy.stats import linregress, mode, pearsonr
from skimage import img_as_float, img_as_ubyte
from skimage.color import rgb2gray
from skimage.feature import match_template
from skimage.filters import rank, threshold_otsu, gaussian
from skimage.measure import find_contours
from skimage.morphology import ( disk, binary_dilation, binary_erosion, 
                                 binary_closing, binary_opening, dilation, 
                                 remove_small_holes, remove_small_objects,
                                 flood_fill, )

from skimage.transform import estimate_transform, warp

from module_libraries.my_stats import half_frame
from module_libraries.image_lib import ( grayscale_zoom, 
                                         visualize_tesseract_results,
                                         rescaling_from_OCR_results, 
                                         threshold_for_data_extraction,
                                         cluster_infered_lines,
                                         correct_column_heights,
                                         rescaling_from_scan_results,
                                         infer_grid_lines,
                                         display_all_channels,
                                         )

# Font size handed to the axis-labelling helper in the plotting cells.
my_fontsize = 15
# Folder, relative to the current working directory, that is scanned for images.
data_folder = Path.cwd() / 'Results/'

# Load images

We load all images but select a single one for further analysis.


In [None]:
# Collect every entry of the data folder. Later cells pick images from this
# list by position, so the listing printed below is the key to those indices.
loaded_images = list( data_folder.glob('*') )
print(f"There are {len(loaded_images)} images in the folder.")
print()
# Show each file name next to its index in the list.
for i, image_path in enumerate(loaded_images):
    print(f"{i} - {image_path.name}")


We will start by considering the image from the Mueller 2012 paper. The reason is that this is a clean image that is well aligned and in which the fonts are common and clear.

In [None]:
# The index refers to the listing printed by the previous cell; adjust it if
# the folder contents (and therefore the listing) differ.
mueller_plate = imread(loaded_images[3])

print( f"Mueller 2012 image has shape {mueller_plate.shape}, and "
       f"has dtype {mueller_plate.dtype}.\n" )

# Mueller 2012 

## Selecting the channel

We will also want to work with a grayscale version of the image. Since this is already a grayscale image, any channel will work. 

In [None]:
fig = plt.figure( figsize = (8, 8) )
# Keep a single channel (index 1) as the 2-D working image used from here on.
plate = mueller_plate[:,:,1]
plt.imshow( plate, cmap = 'gray' )

plt.show()


And the image is already in a good format.

## Extracting the text

We will use Tesseract to extract the text.

In [None]:
# Run Tesseract on the working image. With Output.DICT the result is a dict;
# the cells below read its 'text', 'left', 'top', 'width', 'height' and
# 'conf' entries by a shared index.
results = pytesseract.image_to_data( plate, 
                                     output_type = pytesseract.Output.DICT )

fig = plt.figure( figsize = (15, 10))

# Project helper; presumably overlays the OCR boxes on the image -- confirm
# against module_libraries.image_lib.
visualize_tesseract_results(plate, results, fig)

plt.show()

As expected, **most of the text is extracted perfectly!**

Let's check how the elements are stored in results so that we can encode them adequately digitally.

In [None]:
# Tabulate bounding box, confidence and text of the OCR entries.
print( "Index\t X \t Y \t W \t H \t conf \t Text" )
print('-'*50)

# The entries of `results` are parallel lists, so walk them in lockstep.
ocr_rows = zip( results['left'], results['top'], results['width'],
                results['height'], results['conf'], results['text'] )

for i, (x, y, w, h, conf, text) in enumerate(ocr_rows):
    # Only entries with more than one character are listed.
    if len(text) > 1:
        print(f"{i:>3}  \t{x:>3}\t{y:>3}\t{w:>3}\t{h:>3}\t{conf:>4}%\t- {text} -")

## Identifying fixed points for value calculation

Below, we will try to locate the `x` and `y` coordinates of the data points. However, we are not really after those coordinates; what we want are the values of the data.

To this end, we will locate the coordinates of a couple of `x` (or `y`) values for which we can determine the actual value. Then it is just a matter of linear re-scaling.

For the **left `y` axis**, the good values are where the text equals **100** and **600**. 

In [None]:
# Positions, within the OCR results, of the two reference labels on the
# left y axis.
ocr_text = results['text']
i1, i2 = ocr_text.index('100'), ocr_text.index('600')

# The vertical centre of each label's box is taken as its y coordinate.
y1, y2 = ( int( results['top'][idx] + results['height'][idx] / 2 )
           for idx in (i1, i2) )

print(i1, y1)
print(i2, y2)

Good! Those values seem correct.

For the **`x` axis**, the good values are where the text equals **1880** and **1950**. 

In [None]:
# Positions, within the OCR results, of the two reference labels on the
# x axis.
ocr_text = results['text']
i1, i2 = ocr_text.index('1880'), ocr_text.index('1950')

# The horizontal centre of each label's box is taken as its x coordinate.
x1, x2 = ( int( results['left'][idx] + results['width'][idx] / 2 )
           for idx in (i1, i2) )

print(i1, x1)
print(i2, x2)

Good, those seem correct again!

We can integrate this code into a simple re-scaling function for getting data values given `x` and `y` coordinates.

In [None]:
def rescaling_from_OCR_results( x, y, x_values, y_values, results):
    """
    Convert image coordinates into data values by linear re-scaling.

    For each axis, two tick labels located by Tesseract act as fixed points:
    the centre of a label's bounding box gives its image coordinate and the
    label text gives the corresponding data value.

    inputs:
        x - int or np.array for x coordinate in image of data point(s)
        y - int or np.array for y coordinate in image of data point(s)
        x_values - two x-axis labels, as strings exactly as they appear in
                   results['text'] (integer text); not modified
        y_values - two y-axis labels, as strings exactly as they appear in
                   results['text'] (integer text); not modified
        results - Tesseract image_to_data output as a dict with 'text',
                  'left', 'top', 'width' and 'height' lists

    returns:
       tuple of int, float or np.array for x value of data point(s) and
                int, float or np.array for y value of data point(s)

    raises:
        ValueError - if a label is missing from results['text'] or is not
                     integer text
        ZeroDivisionError - if both labels of an axis have the same centre
                            coordinate
    """
    def _label_centre(label, start_key, size_key):
        # Centre of the label's bounding box along one image axis.
        idx = results['text'].index(label)
        return int( results[start_key][idx] + results[size_key][idx] / 2 )

    # Order the labels by numeric value on copies: sorting the strings
    # themselves would be lexicographic ('50' > '100') and sorting in place
    # would change the caller's lists.
    #
    x_labels = sorted(x_values, key = int)
    y_labels = sorted(y_values, key = int)

    # Get x and y fixed points
    #
    x1 = _label_centre(x_labels[0], 'left', 'width')
    x2 = _label_centre(x_labels[1], 'left', 'width')

    y1 = _label_centre(y_labels[0], 'top', 'height')
    y2 = _label_centre(y_labels[1], 'top', 'height')

    # Define transformation: data units per pixel along each axis
    #
    x_scale = (int(x_labels[1]) - int(x_labels[0])) / (x2 - x1)
    y_scale = (int(y_labels[1]) - int(y_labels[0])) / (y2 - y1)

    # The larger label of each axis is the anchor of the linear map
    #
    x_base = int( x_labels[1] )
    y_base = int( y_labels[1] )

    return ( x_base + (x - x2) * x_scale,
             y_base + (y - y2) * y_scale )


# Axis labels used as fixed points; each string is looked up verbatim in
# results['text'], so it must match the OCR output exactly.
x_values = ['1880', '1950']
y_values = ['100', '600']

# These are just test values
#
x, y = 200, 590

print( rescaling_from_OCR_results( x, y, x_values, y_values, results ) )

## Matching data points

### Using templates

A possible approach involves matching every point in the image to a template.

We must first extract the template by finding an array slice that contains the desired symbol. As an example, we will address the data shown using the empty circles.


In [None]:
# Copy a small patch of the plate (rows 590-607, columns 113-129) to serve
# as the template for matching.
template = copy( plate[590:608,113:130] )

plt.imshow(template, cmap = 'gray')

plt.show()


<br>

**You can change the slice coordinates above for obtaining a template for the other symbols.**

In [None]:
# Array of matching coefficients between the plate and the template.
match_result = match_template(plate, template)
print(f"The best match has a coefficient of {match_result.max():.3f} "
      f"(the maximum possible value is 1)." )
print(f"The worst match has a coefficient of {match_result.min():.3f}. ")

# Boolean mask that is True wherever the coefficient is BELOW the threshold,
# i.e. everywhere except at the candidate matches.
match_threshold = 0.81
result_mask = match_result < match_threshold 
print(f"The max of result_mask is {result_mask.max():.3f}.")
print(f"The min of result_mask is {result_mask.min():.3f}.")

fig = plt.figure( figsize = (8, 8))
# The mask is eroded with a disk(5) footprint for display only; result_mask
# itself is left unchanged.
plt.imshow(binary_erosion(result_mask, disk(5)), cmap = 'gray')

plt.show()


In [None]:
print(f"The array with the matching coefficients has shape "
      f"{match_result.shape}.\n")

# Find every position whose coefficient exceeds the threshold in a single
# vectorised pass instead of a per-pixel Python loop. Searching the transpose
# returns the hits ordered by column (x) first and by row (y) second.
hit_cols, hit_rows = np.nonzero( match_result.T > match_threshold )
circles_x = hit_cols.tolist()
circles_y = hit_rows.tolist()
circles_match = [ match_result[j, i] for i, j in zip(circles_x, circles_y) ]

print("The coordinates of the points with matching coefficients greater "
      f"than {match_threshold} are:\n")
print(circles_x)
print(circles_y)

print("\nThe matching coefficients are:\n")
print(circles_match)

<br>

**With these coordinates in hand, we can now calculate all their actual values.**

In [None]:
# Convert the image coordinates of the matches into data values, using the
# axis labels stored in x_values / y_values as fixed points.
x, y = rescaling_from_OCR_results( np.array(circles_x), np.array(circles_y), 
                                   x_values, y_values, results )

plt.plot(x, y, 'ro')

plt.show()

When plotting the data this way, we can see that one of the data points was split into two.  This is something that can be corrected by merging points very close to one another.

### Using contours

We will focus on the inside of the graph $-$ we will set appropriate conditions $-$ and look for contours that have the appropriate size. 

In [None]:
# Window of the image to search and the accepted contour length
# (number of points).
x_low, x_high = 100, 720
y_low, y_high = 10, 620
contour_min, contour_max = 25, 40

contours = find_contours( plate )

fig = plt.figure( figsize = (10, 10) )
plt.imshow( plate, cmap = 'gray' )

good_contours = []
for n, contour in enumerate(contours):
    # Discard contours with too few or too many points.
    n_points = len(contour)
    if not ( contour_min < n_points < contour_max ):
        continue

    # Discard contours whose first point lies outside the window; the
    # contour columns are used as (y, x).
    start_y, start_x = contour[0,0], contour[0,1]
    if not ( x_low < start_x < x_high and y_low < start_y < y_high ):
        continue

    plt.plot(contour[:, 1], contour[:, 0], linewidth = 2)
    good_contours.append( contour )

plt.show()

<br>

**The center of mass of a contour provides a good estimate of the location of a symbol in the graph.** 

Coupling it with a measure of similarity of the region of the image enclosed by the contour would then enable us to assign data points to data sets by template matching.


In [None]:
# Index of the contour to inspect.
k = 0

# Column 0 of a contour is used as y and column 1 as x.
selected = good_contours[k]
rows, cols = selected[:, 0], selected[:, 1]

# Centre of mass of the contour points.
x_c, y_c = np.mean(cols), np.mean(rows)
print(x_c, y_c)

# Bounding box of the contour, padded so the whole symbol is kept.
xl, xr = int( cols.min() ) - 1, int( cols.max() ) + 2
yb, yt = int( rows.min() ) - 1, int( rows.max() ) + 2

print(xr, xl, yt, yb)

symbol_plate = plate[yb:yt, xl:xr]

plt.imshow(symbol_plate, cmap = 'gray')

plt.show()

In [None]:
# Because of the x and y limits, this cell works only for k = 0 and k = 1

fig = plt.figure( figsize = (10, 10) )
plt.imshow(mueller_plate, cmap = 'gray')

# Overlay the centre of mass and the outline of contour k on the image.
plt.plot(x_c, y_c, 'ro', markersize = 5, zorder = 10)
plt.plot(good_contours[k][:, 1], good_contours[k][:, 0], 'r', linewidth = 2)

# Zoom into a fixed window; the y limits are given as (100, 0) so that the
# axis keeps running downwards as in the image.
plt.xlim(600,800)
plt.ylim(100, 0)

plt.show()

# Scanned image

Let's now address a rather more challenging case: the graph box considered in [this notebook](http://localhost:8888/notebooks/Module_Image_Processing/nb_04_Text_and_data_in_images.ipynb).

Each vertical grid line corresponds to the middle point of a one-minute block. Each horizontal grid line corresponds to an increment of 5 miles/h.

We could, in principle, get this information from the OCR of the relevant text box.

We can use that information later for calculating data values from the image coordinates.

In [None]:
# The index refers to the file listing printed earlier in the notebook.
screen_plate = imread(loaded_images[0])
print(f"Selected image has shape {screen_plate.shape}.\n")

plt.imshow(screen_plate)

plt.show()

# Print the top-left 10 x 10 corner of the fourth channel (index 3).
print(f"That fourth channel is just ones:\n{screen_plate[:10, :10, 3]}")

We do not need the fourth channel, so we will get rid of it.

We will also want to work with a grayscale version of the image.  The question is: 

> **Which grayscale version should we use?**

Let's look at each channel separately, alongside a grayscale conversion of the color image...

In [None]:
# Project helper; presumably plots each channel next to a grayscale
# conversion -- confirm against module_libraries.image_lib.
display_all_channels( screen_plate )


Since the bars are green, the **green channel** is the one where the bars are most clearly visible.

From now on, we will focus on this channel.

In [None]:
# Display the raw values of channel 1 (the green channel).
screen_plate[:,:,1]

As before, we will transform to ubytes in order to save resources.

In [None]:
print( f"Maximum of green channel is {screen_plate[:,:,1].max():.3f}, "
       f" minimum is {screen_plate[:,:,1].min():.3f}\n")

# Convert with img_as_ubyte, which maps the float range [0, 1] onto
# [0, 255]. Multiplying by 256 and casting would turn a value of exactly
# 1.0 into 256, which does not fit in an unsigned byte.
plate_g = img_as_ubyte( screen_plate[:,:,1] )

fig = plt.figure( figsize = (12, 10) )
plt.imshow( plate_g, cmap = 'gray')

plt.show()

## Find outline of bars and coordinates of grid lines

For simplicity, we first binarize the image. 

In [None]:
# Point of interest (x, y) in image coordinates and the zoom to apply.
x, y, zoom_factor = 200, 350, 2
threshold_for_white = 0

# Binarize: True wherever the green channel is above the threshold.
plate_for_data = plate_g > threshold_for_white

fig = plt.figure( figsize = (6, 6) )
ax = fig.add_subplot(111)
# Project helper; returns the zoomed image plus an offset (x0, y0) that is
# used below to place the marker -- exact semantics to be confirmed against
# module_libraries.image_lib.
zoomed_image, x0, y0 = grayscale_zoom(ax, plate_for_data, x, y, zoom_factor)

ax.imshow( zoomed_image, cmap = 'gray')
# Mark the point of interest in the coordinates of the zoomed image.
ax.plot([zoom_factor * (x-x0)], [zoom_factor * (y-y0)], 'ro')

plt.show()


We can place the relevant code in a function and automate the plotting.

In [None]:
# Same point of interest, zoom and threshold as in the manual version above.
x, y, zoom_factor = 200, 350, 2
threshold_for_white = 0

fig = plt.figure( figsize = (10, 8) )
ax1 = fig.add_subplot(211)
ax2 = fig.add_subplot(212)

# Project helper that receives the green channel and both axes; its return
# value replaces plate_for_data and is what the following cells analyse.
# NOTE(review): presumably the binarized image -- confirm against
# module_libraries.image_lib.
plate_for_data = threshold_for_data_extraction( screen_plate[:,:,1], ax1, ax2, 
                                                threshold_for_white, 
                                                (x, y), zoom_factor )

plt.tight_layout()
plt.show()
    


<br>

The image above suggests a strategy for finding the coordinates of the data. For each column (i.e., `x` value), we can start from the bottom and keep moving up until we find a `0` (i.e., a black pixel).

Note that **starting from the bottom** in a column means **starting from a `y` coordinate of `y_max`**, which can be extracted from the array's shape.

We have, however, to remember that the grid lines will extend all the way to the top.


In [None]:
# Array shape is (rows, columns), i.e. (y extent, x extent).
y_max, x_max = plate_for_data.shape
x_min = 0
y_min = 0

print(x_max, y_max)

Get coordinates of horizontal grid lines.

In [None]:
# A row whose run of True pixels from the left edge is longer than this
# many pixels is recorded as a grid line.
level_line_threshold = 10
level_lines, levels = [], []

print(y_min, y_max)
for i in range(y_min, y_max):
    # Walk right along row i while the pixels are True, stopping at the
    # first False pixel or at the last column.
    j = 0
    while j < x_max - 1 and plate_for_data[i, j]:
        j += 1

    levels.append(j)

    # Check if this is grid line
    #
    if j > level_line_threshold:
        level_lines.append(i)

print(level_lines)

<br>

We can generalize the code to handle vertical and horizontal grid lines.  


In [None]:
# Array shape is (rows, columns), i.e. (y extent, x extent).
y_max, x_max = plate_for_data.shape
x_min = 0
y_min = 0

# Use these two lines to adjust how much of the image you visualize
#
# NOTE(review): as written, the example sets y_max below y_min -- the two
# values look swapped; confirm before uncommenting.
# y_max = 600
# y_min = 2000

# Project helper called with direction flag 0 and threshold 10; it returns
# the row coordinates flagged as grid lines and a per-row profile.
# NOTE(review): argument semantics inferred from this call only -- confirm
# against module_libraries.image_lib.
level_lines, levels = infer_grid_lines( 0, y_min, y_max, x_max, 10, 
                                       plate_for_data )

# Profile in blue; detected grid-line rows as red dots at a fixed height.
plt.plot(range(y_min, y_max), levels, 'b-')
plt.plot(level_lines, [200]*len(level_lines), 'r.')

plt.show()

Get heights of bars and placement of vertical grid lines.

In [None]:
# Array shape is (rows, columns), i.e. (y extent, x extent).
y_max, x_max = plate_for_data.shape
x_min = 0
y_min = 0

# Use these two lines to adjust how much of the image you visualize
#
# NOTE(review): as written, the example sets x_max below x_min -- the two
# values look swapped; confirm before uncommenting.
# x_max = 200
# x_min = 2000

# Project helper called with direction flag 1 and threshold 450; it returns
# the column coordinates flagged as grid lines and a per-column profile
# (used below as the bar heights).
# NOTE(review): argument semantics inferred from this call only -- confirm
# against module_libraries.image_lib.
time_lines, heights = infer_grid_lines( 1, x_min, x_max, y_max, 450, 
                                        plate_for_data )

# Profile in blue; detected grid-line columns as red dots at a fixed height.
plt.plot(range(x_min, x_max), heights, 'b-')
plt.plot(time_lines, [400]*len(time_lines), 'r.')

plt.show()

## Cluster coordinates for determining position of grid lines

We cluster coordinates for level and time grid lines and estimate the factor for mapping `y` coordinate to actual values and `x` coordinate to time.

We first use the function for time grid lines.

In [None]:
# Project helper returning three values, used below as: a single reference
# spacing (drawn as the red line), the individual spacings (blue dots) and
# the list of clustered line positions.
# NOTE(review): meanings inferred from usage -- confirm against
# module_libraries.image_lib.
block_delta_t, t_deltas, t_mapping = cluster_infered_lines( time_lines ) 

plt.plot(t_deltas, 'bo')
plt.hlines(block_delta_t, 0, len(t_mapping), 'r')

plt.show()

Next, we use this function for the speed grid lines.

In [None]:
# Same clustering helper, now applied to the horizontal (level) grid lines.
# NOTE(review): meanings of the three return values inferred from usage --
# confirm against module_libraries.image_lib.
block_delta_y, y_deltas, y_mapping = cluster_infered_lines( level_lines ) 

plt.plot(y_deltas, 'bo')
plt.hlines(block_delta_y, 0, len(y_mapping), 'r')

plt.show()

In [None]:
# Inspect the clustered positions of the horizontal grid lines.
print(f"There are {len(y_mapping)} values for y scale.")
print(y_mapping)

When you zoom in on the value bars, you will notice that at the edges of the bars there are intermediate values, even though the signal is supposed to be constant during the one minute block.  Additionally, the vertical grid lines prevent us from extracting the correct value at those positions.

We next correct for that.

In [None]:
# Project helper; returns one value per time block (y_values) and a
# corrected height profile (new_heights).
# NOTE(review): exact correction rule not visible here -- confirm against
# module_libraries.image_lib. Note that y_values is rebound here and no
# longer holds the axis labels used in the Mueller section.
y_values, new_heights = correct_column_heights( heights, block_delta_t, 
                                                t_mapping, x_min )

# Original profile in blue, corrected profile in red.
fig = plt.figure( figsize = (20, 4) )
plt.plot(range(x_min, x_max), heights, 'b-')
plt.plot(range(x_min, x_max), new_heights, 'r')
plt.ylim(0, 550)

plt.show()

In [None]:
# Inspect the time-line positions and the value extracted for each block.
print(f"There are {len(t_mapping)} values for time events.")
print(t_mapping)
      
print(f"\nThere are {len(y_values)} values for their values.")
print(y_values)

The first value is a mistake. We will drop it.

In [None]:
# Remove the first entry of both lists in place. Re-running this cell
# removes one more entry each time, so run it only once after the cells
# that build the lists.
t_mapping.pop(0)
y_values.pop(0)

## Rescale coordinates

In [None]:
# Speed values assigned to the horizontal grid lines, and the minute index
# of each of the 20 bars.
y_scale = [15, 10, 5, 0]
t_scale = list(range(1, 21))

# Project helper that turns the extracted coordinates into data values.
# NOTE(review): meaning of the leading 0 and of y_max not visible here --
# confirm against module_libraries.image_lib.
data_values = rescaling_from_scan_results( 0, y_values, y_scale, y_mapping, 
                                           y_max )

## Plot data and compare to image

In [None]:
fig = plt.figure( figsize = (9, 5))
fig.set_facecolor('0.8')
ax = []


# Top panel: bar chart rebuilt from the extracted values, styled with a
# black background and grey grid lines.
ax.append( fig.add_subplot(211) )
half_frame(ax[-1], 'Time [min]', 'Speed\n[miles/h]', font_size = my_fontsize)
ax[-1].set_facecolor('0')

ax[-1].set_ylim(0, 15)
ax[-1].set_yticks(range(0, 16, 5))
ax[-1].set_xlim(0.5, 22)
ax[-1].set_xticks(range(2, 22, 2))

# Grid lines drawn by hand.
ax[-1].vlines(range(0, 21), 0, 15,  lw = 2, color = '0.6', )
ax[-1].hlines([5, 10, 15], 0, 22,  lw = 2, color = '0.6', )

# zorder keeps the bars in front of the grid lines.
ax[-1].bar(t_scale, data_values, color = 'g', zorder = 10)

# Bottom panel: the original scanned image for visual comparison.
ax.append( fig.add_subplot(212) )
ax[-1].imshow(screen_plate)

plt.tight_layout()
plt.show()

## Check accuracy of extracted data

In [None]:
# Reference values, one per bar, entered by hand.
actual_values = [3, 4, 5, 6, 7, 6, 5, 3, 5, 6, 7, 5,
                 3, 5, 6, 7, 6, 5, 4, 3]

fig = plt.figure( figsize = (6, 4))
fig.set_facecolor('0.8')
ax = fig.add_subplot(111)

half_frame(ax, 'Actual values', 'Estimated values', font_size = my_fontsize)

# Extracted values against the reference values.
ax.plot(actual_values, data_values, 'ro')
# Guide lines: a horizontal line at 5 and the identity line y = x.
ax.plot([2, 8], [5, 5])
ax.plot([2, 8], [2, 8])

plt.show()

# Pearson correlation between reference and extracted values.
result = pearsonr(actual_values, data_values)
print(result)