# Creating the Streamlit Pipeline

This notebook is really just to help me get the previous notebooks' functions into a streamlined document processor. Once finished, I can take the pipeline and put it in Streamlit for "drag-and-drop" testing.

In [13]:
%%capture
%pip install PyMuPDF
%pip install pyocr
%pip install torch torchvision torchaudio
%pip install easyocr

# I will need to use both PIL and cv2 for different models/reasons. I like them both =] 
from PIL import Image
from PIL import ImageChops
import cv2

# Gotta have these for working with the numbers
import numpy as np
import pandas as pd

# Working with the pdfs (might not use them all)
import glob, sys, fitz

# pyocr: ask the library which OCR tools it can find and keep the first one.
# `tool` is the module-level handle used by find_word, text_from_pyocr and
# number_from_pyocr below.
# NOTE(review): tools[0] raises IndexError when pyocr reports no available
# tool — confirm an OCR backend is installed before running.
import pyocr
import pyocr.builders  
tools = pyocr.get_available_tools()
tool = tools[0]

# easyocr: one shared English reader, created with gpu = False.
# `reader` is the module-level handle used by text_from_easyocr below.
import easyocr
reader = easyocr.Reader(['en'], gpu = False)

import time

In [12]:
def remove_lines(form_image):
    """Return ``form_image`` with its long ruling lines whitened out.

    Long vertical and horizontal strokes are isolated by eroding and then
    dilating the inverted binary image with thin kernels whose length is
    1/80th of the image width. The two line images are blended into one
    mask, and that mask is added onto the input so line pixels saturate
    to white.

    Parameters
    ----------
    form_image : image array handed directly to ``cv2.threshold``.
        Presumably a single-channel 8-bit image (the pipeline loads it with
        ``cv2.imread(path, 0)``) — confirm for other callers.

    Returns
    -------
    The result of ``cv2.addWeighted(form_image, 1, line_mask, 1, 0)``.
    """
    # Binarise with the Otsu flag set, then flip so that ink becomes white.
    _, binary = cv2.threshold(form_image, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
    ink = 255 - binary

    # Kernel length scales with the image width.
    line_len = np.array(ink).shape[1] // 80

    # (1 x line_len) keeps vertical strokes, (line_len x 1) keeps horizontal ones.
    vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, line_len))
    horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (line_len, 1))
    square_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))

    # Erode then dilate: only strokes at least as long as the kernel survive.
    vertical_lines = cv2.dilate(
        cv2.erode(ink, vertical_kernel, iterations=3),
        vertical_kernel,
        iterations=3,
    )
    horizontal_lines = cv2.dilate(
        cv2.erode(ink, horizontal_kernel, iterations=3),
        horizontal_kernel,
        iterations=3,
    )

    # Equal-weight blend of the two line images.
    vertical_weight = 0.5
    horizontal_weight = 1.0 - vertical_weight
    grid = cv2.addWeighted(vertical_lines, vertical_weight, horizontal_lines, horizontal_weight, 0.0)

    # Invert, erode with the 3x3 kernel, and re-binarise the blended grid.
    grid = cv2.erode(~grid, square_kernel, iterations=2)
    _, grid = cv2.threshold(grid, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)

    # Flip back so the lines are white, then add onto the input image.
    line_mask = cv2.bitwise_not(grid)
    return cv2.addWeighted(form_image, 1, line_mask, 1, 0)

def find_word(img, word):
    """Locate the first OCR word box whose content equals ``word`` exactly.

    Parameters
    ----------
    img : array accepted by ``Image.fromarray``.
    word : text to match against each box's ``content`` (exact comparison).

    Returns
    -------
    The ``position`` of the first matching box, or ``None`` when no box
    matches. ``crop_periods_of_service`` unpacks that position as
    ``((left, top), (right, bottom))``.
    """
    pil_image = Image.fromarray(img)
    # One box per recognised word, each carrying its content and position.
    boxes = tool.image_to_string(pil_image, lang="eng", builder=pyocr.builders.WordBoxBuilder())
    return next((candidate.position for candidate in boxes if candidate.content == word), None)


def crop_periods_of_service(img):
    """Crop the region below and to the right of the word "PERIODS".

    The crop window is derived from the size of the OCR box found for
    "PERIODS": it starts 7 box-heights below the top of the word, ends 69
    box-heights below it, and runs from roughly the horizontal middle of
    the word (minus 10 px) to the right edge of the image.

    Parameters
    ----------
    img : two-dimensional image array (``img.shape`` is unpacked into
        exactly two values).

    Returns
    -------
    The cropped slice of ``img``, or ``None`` when ``find_word`` does not
    locate "PERIODS".
    """
    # Only the width is needed; unpacking two values also rejects non-2-D input.
    _, width = img.shape

    anchor = find_word(img, "PERIODS")
    if not anchor:
        return None

    (left, top), (right, bottom) = anchor
    word_height = bottom - top
    word_width = right - left

    row_start = top + int(word_height * 7)
    row_stop = top + int(word_height * 69)
    col_start = int(left + word_width // 2 - 10)

    return img[row_start:row_stop, col_start:width]
    
'''
Wrappers for predicition
'''

def text_from_pyocr(img):
    """Run the module-level pyocr ``tool`` on ``img`` with ``TextBuilder``.

    Parameters
    ----------
    img : image array. A BGR-to-RGB conversion is attempted first; if
        OpenCV rejects the input, the array is used unchanged.

    Returns
    -------
    Whatever ``tool.image_to_string`` returns for a ``TextBuilder``
    (the recognised text).
    """
    try:
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    except cv2.error:
        # Only OpenCV's own rejection of the input is tolerated; a bare
        # ``except`` would also hide KeyboardInterrupt and real bugs.
        pass
    im_pil = Image.fromarray(img)
    return tool.image_to_string(im_pil, lang='eng', builder=pyocr.builders.TextBuilder())

def number_from_pyocr(img):
    """Run the module-level pyocr ``tool`` on ``img`` with ``DigitBuilder``.

    Parameters
    ----------
    img : image array. A BGR-to-RGB conversion is attempted first; if
        OpenCV rejects the input, the array is used unchanged.

    Returns
    -------
    Whatever ``tool.image_to_string`` returns for a ``DigitBuilder``.
    ``analyze_cells`` treats an empty string as "nothing recognised" and
    falls back to easyocr.
    """
    try:
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    except cv2.error:
        # Only OpenCV's own rejection of the input is tolerated; a bare
        # ``except`` would also hide KeyboardInterrupt and real bugs.
        pass
    im_pil = Image.fromarray(img)
    return tool.image_to_string(im_pil, lang='eng', builder=pyocr.builders.DigitBuilder())

def text_from_easyocr(img):
    """Return the text of the first easyocr result for ``img``, or ``''``.

    This is a best-effort wrapper: any exception raised by the reader is
    swallowed and reported as an empty string, as is an empty result list.

    Parameters
    ----------
    img : image passed straight to ``reader.readtext``.

    Returns
    -------
    The second element of the first result entry (its text), or ``''``.
    """
    try:
        # Paragraph mode is ON. The previous code passed the *string*
        # "False", which is truthy, so results were already being merged
        # into paragraphs; the real boolean keeps that behaviour and makes
        # it visible.
        results = reader.readtext(img, paragraph=True)
    except Exception:
        # Deliberate best-effort: an OCR failure yields an empty cell.
        return ''

    if not results:
        return ''
    try:
        return results[0][1]
    except (IndexError, TypeError):
        return ''

'''
Utility for the cell detection below
'''

def image_is_blank(img):
    """Return True when every pixel of ``img`` is brighter than 200.

    Pixels above 200 are treated as background noise; a single value at or
    below 200 in any channel makes the image non-blank. This is the same
    decision the earlier PIL threshold/invert/getbbox sequence made for
    3-channel input, but it also accepts single-channel (2-D) arrays, which
    is what ``remove_margins`` hands over in ``analyze_cells``.

    Parameters
    ----------
    img : array-like image, either 2-D (grey) or 3-D (colour). A fourth
        channel, if present, is ignored, as it was dropped by the earlier
        BGR-to-RGB conversion.

    Returns
    -------
    bool
    """
    pixels = np.asarray(img)
    if pixels.ndim == 3 and pixels.shape[2] == 4:
        # Ignore alpha; only the colour channels decide blankness.
        pixels = pixels[:, :, :3]
    return bool(np.all(pixels > 200))
    
def is_checked(img):
    """Heuristically decide whether a checkbox cell contains a mark.

    The image is cropped to the bounding box of its dark pixels (< 128).
    A window starting at the middle of that box and extending 30% of its
    width and height to the right and down is then inspected. The cell
    counts as checked when that window holds at least two pixels whose
    value is exactly 0.

    Parameters
    ----------
    img : image array. A BGR-to-grey conversion is attempted first; if
        OpenCV rejects the input, the array is used unchanged.

    Returns
    -------
    bool. ``False`` when the image has no dark pixels at all.
    """
    try:
        img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    except cv2.error:
        # Input OpenCV will not convert is used as it is.
        pass

    ink_mask = 255 * (img < 128).astype(np.uint8)  # dark pixels become white
    coords = cv2.findNonZero(ink_mask)
    if coords is None:
        # No dark pixels: nothing to bound, so nothing is ticked.
        return False

    x, y, w, h = cv2.boundingRect(coords)
    rect = img[y:y + h, x:x + w]  # crop is taken from the grey image, not the mask

    x2 = w // 2
    y2 = h // 2
    center = rect[y2:int(y2 + h * 0.3), x2:int(x2 + w * 0.3)]

    number_of_black_pix = np.sum(center == 0)
    return bool(number_of_black_pix >= 2)


def remove_margins(img):
    """Crop ``img`` to its dark content and re-pad it with a 25 px white border.

    The bounding box of all pixels darker than 128 is located, widened by
    one pixel on each side (clamped to the image), cropped out, and
    surrounded by a constant border of value 255.

    Parameters
    ----------
    img : image array. A BGR-to-grey conversion is attempted first; if
        OpenCV rejects the input, the array is used unchanged.

    Returns
    -------
    The cropped and padded image. When the image contains no dark pixels
    the whole image is padded and returned instead of raising.
    """
    try:
        img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    except cv2.error:
        # Input OpenCV will not convert is used as it is.
        pass

    ink_mask = 255 * (img < 128).astype(np.uint8)  # dark pixels become white
    coords = cv2.findNonZero(ink_mask)

    if coords is None:
        # Nothing dark to crop around; keep the full image.
        rect = img
    else:
        x, y, w, h = cv2.boundingRect(coords)
        # Clamp at 0: a negative start index would wrap around to the far
        # edge and produce an empty or wrong crop when ink touches the
        # top or left border.
        top = max(y - 1, 0)
        left = max(x - 1, 0)
        rect = img[top:y + h + 1, left:x + w + 1]

    rect = cv2.copyMakeBorder(rect, 25, 25, 25, 25, cv2.BORDER_CONSTANT, None, value = 255)

    return rect
    
'''
The following block is a modification of the work found here on GitHub.

>> https://gist.github.com/huks0/e48d604fc9dd91731bc687d6e3933db4

The author created a quick and efficient script for breaking images up into dataframe-like squares.
for which you can then parse into an array. 
Parts of it work really well, but I had to make some changes to it for it work better for me.
Also, the author had their's very "script" like, where I will need more function-like writing.
'''    

def sort_contours(cnts, method="left-to-right"):
    """Sort contours by the x or y coordinate of their bounding boxes.

    Parameters
    ----------
    cnts : sequence of contours accepted by ``cv2.boundingRect``.
    method : one of ``"left-to-right"`` (default), ``"right-to-left"``,
        ``"top-to-bottom"`` or ``"bottom-to-top"``. The two vertical
        methods sort on y, the others on x; the two "reverse" methods sort
        in descending order. Any other value behaves like
        ``"left-to-right"``.

    Returns
    -------
    ``(contours, bounding_boxes)`` as two tuples in matching sorted order.
    Empty input returns ``((), ())``.
    """
    reverse = method in ("right-to-left", "bottom-to-top")
    # Index into the (x, y, w, h) bounding box: 0 sorts on x, 1 on y.
    axis = 1 if method in ("top-to-bottom", "bottom-to-top") else 0

    boundingBoxes = [cv2.boundingRect(c) for c in cnts]
    if not boundingBoxes:
        # zip(*[]) yields nothing to unpack, so handle "no contours" here.
        return ((), ())

    (cnts, boundingBoxes) = zip(*sorted(zip(cnts, boundingBoxes),
                                        key=lambda pair: pair[1][axis],
                                        reverse=reverse))
    return (cnts, boundingBoxes)

def locate_cells(img):
    """Detect the table grid in ``img`` and arrange its cells by row and column.

    Based on https://gist.github.com/huks0/e48d604fc9dd91731bc687d6e3933db4.

    Steps:
      1. Binarise and invert the image, then isolate long vertical and
         horizontal strokes with thin kernels (1/100th of the image width).
      2. Blend both line images into a grid image and find its contours.
      3. Keep bounding boxes with ``w < 1000`` and ``h < 500``.
      4. Walk the boxes top to bottom and start a new row whenever a box
         begins more than half the mean box height below the previous one.
      5. Take the column centres from the row with the most boxes and
         assign every box to its nearest centre.

    Side effects: writes ``img_vh.jpg`` and ``bitnot.jpg`` to the current
    working directory. The input image itself is not modified.

    Parameters
    ----------
    img : image array handed to ``cv2.threshold``. Presumably a
        single-channel 8-bit image — confirm for other callers.

    Returns
    -------
    finalboxes : list of rows; each row is a list of ``countcol`` lists
        holding the ``[x, y, w, h]`` boxes assigned to that column.
    bitnot : inverse of ``img`` XOR the grid image.
    countrow : number of rows found.
    countcol : largest number of boxes found in any one row.

    When no usable box is found the result is ``([], bitnot, 0, 0)``.
    """
    # Binarise and invert so that ink is white.
    _, img_bin = cv2.threshold(img, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
    img_bin = 255 - img_bin

    # Kernel length is 1/100th of the width; never let it collapse to 0.
    kernel_len = max(1, np.array(img).shape[1] // 100)
    ver_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, kernel_len))
    hor_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (kernel_len, 1))
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))

    # Erode then dilate keeps only strokes at least as long as the kernel.
    vertical_lines = cv2.dilate(
        cv2.erode(img_bin, ver_kernel, iterations=3), ver_kernel, iterations=3
    )
    horizontal_lines = cv2.dilate(
        cv2.erode(img_bin, hor_kernel, iterations=3), hor_kernel, iterations=3
    )

    # Equal-weight blend, then invert, erode and re-binarise the grid.
    img_vh = cv2.addWeighted(vertical_lines, 0.5, horizontal_lines, 0.5, 0.0)
    img_vh = cv2.erode(~img_vh, kernel, iterations=2)
    _, img_vh = cv2.threshold(img_vh, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)

    cv2.imwrite("img_vh.jpg", img_vh)

    bitxor = cv2.bitwise_xor(img, img_vh)
    bitnot = cv2.bitwise_not(bitxor)

    cv2.imwrite("bitnot.jpg", bitnot)

    # Contours of the grid image are the candidate cells.
    contours, _ = cv2.findContours(img_vh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
    if len(contours) == 0:
        return [], bitnot, 0, 0

    contours, boundingBoxes = sort_contours(contours, method="top-to-bottom")

    # Mean height over *all* detected boxes; used as the row-break tolerance.
    mean = np.mean([bounds[3] for bounds in boundingBoxes])

    # Drop over-sized boxes. Nothing is drawn on ``img``, so the caller's
    # array stays untouched.
    box = [[x, y, w, h] for (x, y, w, h) in boundingBoxes if w < 1000 and h < 500]
    if not box:
        return [], bitnot, 0, 0

    # Group the boxes into rows.
    row = []
    column = []
    previous = None
    for current in box:
        if previous is None or current[1] <= previous[1] + mean / 2:
            column.append(current)
        else:
            row.append(column)
            column = [current]
        previous = current
    # Always flush the row being built, including a final single-box row.
    if column:
        row.append(column)

    # The widest row defines both the column count and the column centres.
    widest = max(row, key=len)
    countcol = len(widest)
    center = np.array([int(x + w / 2) for (x, _y, w, _h) in widest])
    center.sort()

    # Assign each box to the nearest column centre.
    finalboxes = []
    for cells in row:
        lis = [[] for _ in range(countcol)]
        for cell in cells:
            # NOTE(review): the probe point is x + w/4 while the centres use
            # x + w/2; kept as in the upstream gist — confirm it is intended.
            diff = abs(center - (cell[0] + cell[2] / 4))
            lis[int(np.argmin(diff))].append(cell)
        finalboxes.append(lis)

    countrow = len(row)

    return finalboxes, bitnot, countrow, countcol


def analyze_cells(img1, finalboxes, bitnot):
    """Read every located cell and return the results as a flat array.

    Cells are visited row by row, column by column. Each box is cropped
    from ``img1``, padded, upscaled 4x, lightly dilated and eroded, trimmed
    with ``remove_margins``, and then handled according to its column
    index ``j``:

      * ``j == 0``: skipped; contributes an empty string.
      * ``j`` in 2..7: checkbox columns, ``'X'`` or ``' '`` via ``is_checked``
        on the raw crop.
      * ``j`` in 8..15: ``number_from_pyocr``, falling back to
        ``text_from_easyocr`` when that returns an empty string.
      * ``j`` in (1, 16): ``text_from_easyocr``.
      * any other ``j``: contributes an empty reading.

    Parameters
    ----------
    img1 : image the cells are cropped from (indexed ``[y:y+h, x:x+w]``).
    finalboxes : rows of columns of ``[x, y, w, h]`` boxes, as returned by
        ``locate_cells``.
    bitnot : unused; kept so existing callers keep working.

    Returns
    -------
    ``np.ndarray`` of strings with one entry per (row, column) slot. A slot
    with no box yields ``' '``.
    """
    checkbox_columns = (2, 3, 4, 5, 6, 7)
    numeric_columns = (8, 9, 10, 11, 12, 13, 14, 15)
    text_columns = (1, 16)

    # The same small kernel is used for every cell, so build it once.
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 1))

    outer = []
    for row_cells in finalboxes:
        for j, cell_boxes in enumerate(row_cells):
            if len(cell_boxes) == 0:
                outer.append(' ')
                continue

            inner = ''
            # Column 0 is not read at all.
            if j > 0:
                for x, y, w, h in cell_boxes:
                    finalimg1 = img1[y:y + h, x:x + w]

                    # White border, 4x upscale, then a light dilate/erode.
                    border = cv2.copyMakeBorder(finalimg1, 2, 2, 2, 2, cv2.BORDER_CONSTANT, value=[255, 255])
                    resizing = cv2.resize(border, None, fx=4, fy=4, interpolation=cv2.INTER_CUBIC)
                    dilation = cv2.dilate(resizing, kernel, iterations=1)
                    erosion = cv2.erode(dilation, kernel, iterations=2)
                    demargin = remove_margins(erosion)

                    if image_is_blank(demargin):
                        inner = inner + " "
                        continue

                    # Start from an empty reading so an unhandled column
                    # index can neither raise nor reuse the previous cell's
                    # text.
                    out = ''
                    if j in checkbox_columns:
                        out = 'X' if is_checked(finalimg1) else ' '
                    elif j in numeric_columns:
                        out = number_from_pyocr(demargin)
                        if out == "":
                            out = text_from_easyocr(demargin)
                    elif j in text_columns:
                        out = text_from_easyocr(demargin)

                    inner = inner + " " + out

            outer.append(inner)

    return np.array(outer)

In [18]:
start = time.time()

# 1. Receive the pdf and convert it into an image.
# >>> I should consider finding a better way to do this

# To get better resolution
zoom_x = 2.0  # horizontal zoom
zoom_y = 2.0  # vertical zoom
mat = fitz.Matrix(zoom_x, zoom_y)  # zoom factor 2 in each dimension

# The context manager closes the document once every page is rendered.
with fitz.open('form.pdf') as doc:
    for page in doc:
        pix = page.get_pixmap(matrix=mat)  # render page to an image
        pix.save("form-page-%i.png" % page.number)  # store image as a PNG

end = time.time()
print("pdf to image: ", end - start)


# 2. Process the image and crop the Periods of Service data.

# Save this as the original for cropping the cells
original_form = cv2.imread('form-page-0.png', 0)
# Use this version to remove lines and create a line-less copy
img = cv2.imread('form-page-0.png', 0)

# cv2.imread returns None instead of raising when the file cannot be read.
if original_form is None or img is None:
    raise FileNotFoundError("form-page-0.png could not be read")


start = time.time()

# Line-less copy
image_without_borders = remove_lines(img)
# cv2.imwrite("image_without_borders.png", image_without_borders)

periods_without_borders = crop_periods_of_service(image_without_borders)
# cv2.imwrite("periods_without_borders.png", periods_without_borders)

periods = crop_periods_of_service(original_form)
# cv2.imwrite("periods.png", periods)

# crop_periods_of_service returns None when the anchor word is not found;
# fail here with a clear message rather than deep inside OpenCV.
if periods_without_borders is None or periods is None:
    raise ValueError('Could not locate the word "PERIODS" on the form')


end = time.time()
print("crop to periods: ", end - start)

start = time.time()
# Retrieve the box locations, the bitnot image, and the row/column counts
boxes, bitnot, countrow, countcol = locate_cells(periods)
end = time.time()
print("locate cells: ", end - start)

start = time.time()
arr = analyze_cells(periods_without_borders, boxes, bitnot)
end = time.time()
print("cells ocr: ", end - start)


# Dataframe stuff
# The 17 names below must match countcol, otherwise pandas raises on assignment.
dataframe = pd.DataFrame(arr.reshape(countrow, countcol))
dataframe.columns = ['Line #', 'SERVICE', 'ENL', 'WO', 'COM', 'PAY', 'AD', 'NONE', 
                     'FROM: YR.', 'FROM: MO.', 'FROM: DAYS', 'TO: YR.', 'TO: MO.', 'TO: DAYS',
                    'POINTS', 'LOST TIME', 'SOURCE DOCUMENT']
dataframe.index += 1  # number the lines from 1
dataframe = dataframe.drop(columns=['Line #'])
print("output: \n")
dataframe

pdf to image:  0.08826494216918945
crop to periods:  2.1427900791168213
locate cells:  0.0169219970703125
cells ocr:  14.262386798858643
output: 



Unnamed: 0,SERVICE,ENL,WO,COM,PAY,AD,NONE,FROM: YR.,FROM: MO.,FROM: DAYS,TO: YR.,TO: MO.,TO: DAYS,POINTS,LOST TIME,SOURCE DOCUMENT
1,DEP,X,,,,,X,2010.0,3.0,19.0,2010.0,09,30.0,,,DD4
2,USAR Active,X,,,X,X,,2010.0,10.0,1.0,2016.0,03,6.0,,,DD 214
3,USAR Inactive,X,,,X,,,2016.0,3.0,7.0,2019.0,08,22.0,,,DD4
4,USAR Active,X,,,X,X,,2019.0,8.0,23.0,2020.0,08,1.0,,,DD 214
5,USAR Inactive,X,,,X,,,2020.0,8.0,2.0,2021.0,04,27.0,,,DD4
6,ARMY Active Duty,X,,,X,X,,2021.0,4.0,28.0,,PRESENT,,,,DD4
7,,,,,,,,,,,,,,,,
8,,,,,,,,,,,,,,,,
9,,,,,,,,,,,,,,,,
10,,,,,,,,,,,,,,,,


![Periods](periods.png)

# Summary

The pipeline for Streamlit is pretty close now. At the very least, I can use this notebook for troubleshooting and isolating future problems as they come up. I'll be putting this into a .py file and creating a Streamlit UI now. After that's been created, I will be making my own "is_checked()" model for evaluating the ENL, WO, COM, PAY, AD, and NONE columns.