In [117]:
# Path of the colour-annotated scan of the form that the notebook processes.
scanned_form_image_template_path = "data/ScannedForm_For_TextExtraction_Template.jpg"

# Ground truth used to score the OCR results: printed label text -> expected
# field value. An OCR result only counts as a match when both the label and
# the value are identical to an entry here, so keys must be spelled exactly
# as they are printed on the form.
expected_mapped_dictionary_for_text_extract = {
    "Form":"W-9",
    "Name (as shown on your income tax return)":"The Game Changer",
    "Business name, if different from above":"Robot Process Automation Organisation",
    "Individual/Sole proprietor":"False",
    "Corporation":"False",
    "Partnership":"False",
    "Other":"True-Innovation",
    "Exempt from backup withholding":"True",
    "Address (number, street, and apt. or suite no.)":"Best Place To Work For",
    "City, state, and ZIP code":"Dream Land 786",
    "List account number(s) here (optional)":"123456789abcd, abc786786",
    # The form prints "Requester's" (the previous key "Request's" could never match).
    "Requester's name and address (optional)":"Amandeep Rukhaya r/o House No-1x2y3z Sector9a0b, Gurugram, Haryana 122003",
    "Social security number":"26789ab0o",
    "Employer identification number":"BPL3145CA",
    "Date":"8/30/05 5 P.M."
}

In [118]:
%config IPCompleter.greedy=True # To enable intellisense in Jupyter Notebook
# Matplotlib backend: `%matplotlib inline` embeds static figures in the notebook,
# while `%matplotlib notebook` (selected below) makes the figures interactive as well.
%matplotlib notebook

In [119]:
# Import the necessary packages

import os
import pprint

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pytesseract as pytt

In [120]:
# Utility Methods

def show_high_resolution_picture_in_two_parts(image_in_numpy_format, image_name):
    """Show the top half of an image, scaled to 40%, in an OpenCV window.

    Parameters
    ----------
    image_in_numpy_format : array with a ``shape`` of at least two axes
        Image to preview; both single-channel and multi-channel arrays work.
    image_name : str
        Prefix of the window title (the window is named "<image_name> Part-1").

    Only the first part (rows ``0 .. height//2``) is displayed; showing the
    bottom half ("Part-2") is currently disabled.
    """
    height, _ = image_in_numpy_format.shape[:2]

    # Slicing only the row axis keeps every column (and every channel when
    # present), so grayscale masks and colour images share one code path.
    top_half = image_in_numpy_format[:height // 2]
    scaled_top_half = cv2.resize(top_half, (0, 0), fx=0.4, fy=0.4)

    cv2.imshow(image_name + " Part-1", scaled_top_half)
    cv2.waitKey()  # This is required for showing opencv images

    # Tip: drawing with matplotlib (plt.imshow) instead offers zoom-in/zoom-out,
    # which is more convenient for analysis.

In [121]:
# Read input Form

img = cv2.imread(scanned_form_image_template_path)

# cv2.imread does not raise when the file is missing or cannot be decoded; it
# returns None, which would otherwise surface as an obscure cvtColor error.
if img is None:
    raise OSError(
        "Could not read the scanned form image (missing, unreadable or not an image): {!r}".format(
            scanned_form_image_template_path
        )
    )

## convert to hsv
hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)

## mask of green; a wider range (36, 0, 0) ~ (70, 255, 255) was tried earlier,
## the narrower, more saturated range below is the one in use
mask1 = cv2.inRange(hsv, (45, 100, 100), (75, 255,255))

## mask of yellow (15,0,0) ~ (36, 255, 255)
mask2 = cv2.inRange(hsv, (15,0,0), (36, 255, 255))

# define range of blue color in HSV
lower_blue = np.array([110,50,50])
upper_blue = np.array([130,255,255])
# Threshold the HSV image to get only blue colors
mask3 = cv2.inRange(hsv, lower_blue, upper_blue)

# define range of red color in HSV
# NOTE(review): only hues 0..10 are covered; reds on the other side of the hue
# wrap-around would not be matched -- confirm this is sufficient for the template.
lower_red = np.array([0, 100, 100])
upper_red = np.array([10, 255, 255])
# Threshold the HSV image to get only red colors
mask4 = cv2.inRange(hsv, lower_red, upper_red)

# Visual sanity check of the masks (the yellow mask preview is disabled).
show_high_resolution_picture_in_two_parts(mask1, "Green")
#show_high_resolution_picture_in_two_parts(mask2, "Yellow")
show_high_resolution_picture_in_two_parts(mask3, "Blue")
show_high_resolution_picture_in_two_parts(mask4, "Red")

In [122]:
def find_all_regions_of_interest(mask, mask_name, min_width=20, min_height=20):
    """Return the bounding boxes of the sufficiently large blobs in a mask.

    Parameters
    ----------
    mask : image accepted by ``cv2.findContours``
        Binary mask in which the regions are searched.
    mask_name : str
        Name used in the progress message that is printed.
    min_width, min_height : int, optional
        A box is kept only when its width is greater than ``min_width`` and
        its height is greater than ``min_height`` (both default to 20 px),
        which filters out small specks.

    Returns
    -------
    list of (x, y, w, h) tuples
        One entry per retained external contour, in the order OpenCV
        reported the contours.
    """
    # cv2.findContours returns (contours, hierarchy) in OpenCV 2.x/4.x but
    # (image, contours, hierarchy) in 3.x; the contours are always the
    # second-to-last item, so index from the end to support every version.
    contours = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)[-2]

    print("Number of Contours Received for ({}): ".format(mask_name), len(contours))

    bounding_boxes = (cv2.boundingRect(contour) for contour in contours)
    return [
        (x, y, w, h)
        for (x, y, w, h) in bounding_boxes
        if h > min_height and w > min_width
    ]

In [123]:
# Locate the highlighted boxes per colour: green marks the field values,
# red marks the field labels and blue marks the boundary enclosing a pair.
coordinates_for_label_values = find_all_regions_of_interest(mask1, "GREEN")
coordinates_for_labels = find_all_regions_of_interest(mask4, "RED")
coordinates_of_enclosing_boundary = find_all_regions_of_interest(mask3, "BLUE")

# Report how many boxes survived the size filter for each colour.
for colour_name, kept_regions in (
    ("GREEN", coordinates_for_label_values),
    ("RED", coordinates_for_labels),
    ("BLUE", coordinates_of_enclosing_boundary),
):
    print("Valid Contours Count ({}): ".format(colour_name), len(kept_regions))


Number of Contours Received for (GREEN):  3
Number of Contours Received for (RED):  7
Number of Contours Received for (BLUE):  131
Valid Contours Count (GREEN):  3
Valid Contours Count (RED):  3
Valid Contours Count (BLUE):  3


In [124]:
def return_extreme_x_and_y(points_tuple):
    """Convert an ``(x, y, width, height)`` box into its two corner points.

    Parameters
    ----------
    points_tuple : indexable
        Box whose first four items are the top-left x, top-left y, width and
        height.

    Returns
    -------
    tuple
        ``(left, top, right, bottom)`` where ``right = left + width`` and
        ``bottom = top + height``.
    """
    left, top, box_width, box_height = (points_tuple[position] for position in range(4))
    return (left, top, left + box_width, top + box_height)

In [125]:

def _lies_strictly_inside(inner_box, outer_box):
    """Tell whether both corners of ``inner_box`` fall strictly within ``outer_box``.

    Both boxes are ``(left, top, right, bottom)`` tuples; touching an edge of
    the outer box does not count as being inside.
    """
    inner_left, inner_top, inner_right, inner_bottom = inner_box
    outer_left, outer_top, outer_right, outer_bottom = outer_box
    return (
        outer_left < inner_left < outer_right
        and outer_top < inner_top < outer_bottom
        and outer_left < inner_right < outer_right
        and outer_top < inner_bottom < outer_bottom
    )


# Pair every label box with every value box that shares an enclosing boundary.
# Each entry is (label_corners, value_corners), both as (left, top, right, bottom).
coordinates_for_labels_with_values = [
    (label_box, value_box)
    for label_box in map(return_extreme_x_and_y, coordinates_for_labels)
    for value_box in map(return_extreme_x_and_y, coordinates_for_label_values)
    for boundary_box in map(return_extreme_x_and_y, coordinates_of_enclosing_boundary)
    if _lies_strictly_inside(value_box, boundary_box)
    and _lies_strictly_inside(label_box, boundary_box)
]
                
                

# print("######## Values with labels:: ", sep="")
# print(len(coordinates_for_labels_with_values))

In [139]:

def _crop(box, margin=0):
    """Cut ``box`` (left, top, right, bottom) out of ``img``, shrunk by ``margin`` px per side."""
    left, top, right, bottom = box
    return img[top + margin:bottom - margin, left + margin:right - margin, :]


# field index -> (label crop, value crop); the stored crops are trimmed by
# 3 px on every side, whereas the previews show the untrimmed boxes.
my_mapped_dictionary = {}

for index, (label_box, value_box) in enumerate(coordinates_for_labels_with_values):
    cv2.imshow("Field Number [Labels]: ({})".format(index), _crop(label_box))
    cv2.waitKey()
    cv2.imshow("Field Number [Value]: ({})".format(index), _crop(value_box))
    cv2.waitKey()
    my_mapped_dictionary[index] = (_crop(label_box, margin=3), _crop(value_box, margin=3))




In [140]:
# OCR the raw crops: recognised label text -> recognised value text.
# Fields whose labels are recognised as the same text overwrite each other.
my_mapped_dictionary_for_text_using_tesseract_without_any_preprocessing = {}

for label_image, value_image in my_mapped_dictionary.values():
    recognised_label = pytt.image_to_string(label_image)
    recognised_value = pytt.image_to_string(value_image)
    my_mapped_dictionary_for_text_using_tesseract_without_any_preprocessing[recognised_label] = recognised_value


In [141]:
# Display the label -> value text recognised from the unprocessed crops.
pprint.pprint(my_mapped_dictionary_for_text_using_tesseract_without_any_preprocessing)

{'Form': '',
 'Name (as shown on your income tax return)': 'The Game Changer',
 'uusmess name, II Gmerem Tl’Om above': 'Robot Process Automation Organisation'}


In [138]:
# A field counts as matched only when both its label and its value are
# identical to an expected (label, value) pair.
recognised_pairs = set(my_mapped_dictionary_for_text_using_tesseract_without_any_preprocessing.items())
matched_successfully = recognised_pairs.intersection(expected_mapped_dictionary_for_text_extract.items())

print("Number of Labels matched with values[Without preprocessing]:: ", len(matched_successfully))

pprint.pprint(matched_successfully)

Number of Labels matched with values[Without preprocessing]::  1
{('Name (as shown on your income tax return)', 'The Game Changer')}


In [102]:
# Collects recognised label text -> recognised value text for the preprocessed crops.
my_mapped_dictionary_for_text_using_tesseract_after_preprocessing = {}
# Scale factor for the resize step.
# NOTE(review): only referenced by the commented-out cv2.resize call in preprocess_the_image.
size_multiplier = 2
# Threshold and replacement value passed to cv2.threshold.
threshold_value=127
max_value=255
# Running number used by get_counter_value() to name the saved debug images.
counter = 1
# Thresholding type passed to cv2.threshold (adding THRESH_TRIANGLE is disabled).
thresholding_style=cv2.THRESH_BINARY #+ cv2.THRESH_TRIANGLE
# Interpolation for the resize step.
# NOTE(review): only referenced by the commented-out cv2.resize call in preprocess_the_image.
interpolation_for_resize=cv2.INTER_LINEAR
# When True, preprocess_the_image writes an image file after each stage.
save_images = True


def get_counter_value():
    """Return the next numbered file prefix and advance the global counter.

    The prefix has the form ``"temp/NNN. "`` where ``NNN`` is the current
    value of the module-level ``counter`` left-padded with zeros to three
    characters. ``counter`` is incremented by one on every call.
    """
    global counter
    prefix = "temp/" + str(counter).zfill(3) + ". "
    counter += 1
    return prefix

def _save_debug_image(stage_suffix, image):
    """Write ``image`` to the next numbered debug file, creating its folder if needed."""
    output_path = get_counter_value() + stage_suffix
    output_directory = os.path.dirname(output_path)
    if output_directory:
        # cv2.imwrite does not create missing directories; without this the
        # debug images would be lost when the folder does not exist yet.
        os.makedirs(output_directory, exist_ok=True)
    cv2.imwrite(output_path, image)


def preprocess_the_image(region_of_interest):
    """Prepare a cropped colour region for OCR by converting it to a binary image.

    Parameters
    ----------
    region_of_interest : image accepted by ``cv2.cvtColor`` with ``COLOR_BGR2GRAY``
        The cropped label or value region.

    Returns
    -------
    The thresholded grayscale image, produced by ``cv2.threshold`` with the
    module-level ``threshold_value``, ``max_value`` and ``thresholding_style``.

    When the module-level ``save_images`` flag is true, the image is written
    to a numbered file after every stage (four files per call), which
    advances the global file counter.
    """
    if save_images:
        _save_debug_image("BeforecvtColor.png", region_of_interest)

    region_of_interest = cv2.cvtColor(region_of_interest, cv2.COLOR_BGR2GRAY)
    if save_images:
        _save_debug_image("AftercvtColor.png", region_of_interest)

    # The resize step is currently disabled, so the "AfterResize" image is
    # identical to the grayscale one; it is still written to keep the file
    # numbering unchanged.
    # region_of_interest = cv2.resize(region_of_interest, None, fx=size_multiplier, fy=size_multiplier, interpolation=interpolation_for_resize)
    if save_images:
        _save_debug_image("AfterResize.png", region_of_interest)

    _, region_of_interest = cv2.threshold(region_of_interest, threshold_value, max_value, thresholding_style)
    # Alternative that was tried:
    # region_of_interest = cv2.adaptiveThreshold(region_of_interest,255,cv2.ADAPTIVE_THRESH_GAUSSIAN_C,cv2.THRESH_BINARY,11,2)
    if save_images:
        _save_debug_image("AfterThreshold.png", region_of_interest)

    return region_of_interest

# OCR the preprocessed crops: recognised label text -> recognised value text.
# Both crops are preprocessed (label first, then value) before either is read.
for label_image, value_image in my_mapped_dictionary.values():
    binarised_label = preprocess_the_image(label_image)
    binarised_value = preprocess_the_image(value_image)

    recognised_label = pytt.image_to_string(binarised_label)
    recognised_value = pytt.image_to_string(binarised_value)
    my_mapped_dictionary_for_text_using_tesseract_after_preprocessing[recognised_label] = recognised_value
    


In [96]:
# Display the label -> value text recognised from the preprocessed crops.
pprint.pprint(my_mapped_dictionary_for_text_using_tesseract_after_preprocessing)

{'Business name. if different from above': 'Robot Process Automation '
                                           'Organisation',
 'City, state, and ZIP code': 'Dream Land 786',
 'Corporation': '',
 'Date': '',
 'Employer identification number': 'BIPLL|3|1I4|51C|A',
 'Exempt from backup\nwithholding': '',
 'Form': 'W-9',
 'HGGreSS (numoer, street. ana aDI. OI" SUITE no.)': 'Best Place To Work For',
 'List account number(s) here (optional)': '123456789abcd, abc786786',
 'Name (as shown on VOUI’ Income tax return)': 'The Game Changer',
 'Other': 'Innovation',
 'Partnership:': '',
 'Reauester’s name and address (ootional)': 'Amandeep Rukhaya r/o House\n'
                                            'No-1x2y32 Sector-9a0b, Gurugram,\n'
                                            'Haryana 122003',
 'Somal security number': '2|6|7¢3|9¢aybioto',
 'lndividual/\nSole proprietor': ''}


In [101]:
# A field counts as matched only when both its label and its value are
# identical to an expected (label, value) pair.
recognised_pairs = set(my_mapped_dictionary_for_text_using_tesseract_after_preprocessing.items())
matched_successfully = recognised_pairs.intersection(expected_mapped_dictionary_for_text_extract.items())

print("Number of Labels matched with values[after preprocessing]:: ", len(matched_successfully))

pprint.pprint(matched_successfully)

Number of Labels matched with values[after preprocessing]::  3
{('City, state, and ZIP code', 'Dream Land 786'),
 ('Form', 'W-9'),
 ('List account number(s) here (optional)', '123456789abcd, abc786786')}
