# Image Processor

This is the image processor. It extracts the text of a given image.<br/>
To increase efficiency, it uses OpenCV's EAST (Efficient and Accurate Scene Text Detector) to focus the OCR (optical character recognition) on just the text. <br/>


In [14]:
from imutils.object_detection import non_max_suppression
import numpy as np
import time
import cv2
import os
import pytesseract
from google.cloud import vision
from PIL import Image, ImageEnhance, ImageFilter
from enum import Enum

In [15]:
class OCR(Enum):
    """OCR backends a detected text region can be sent to."""

    # Local recognition through the pytesseract wrapper.
    TESSERACT = 1
    # Remote recognition through the Google Cloud Vision client.
    GOOGLE_CLOUD_VISION = 2

In [16]:
# Detection scores below this value are skipped by decode_predictions.
min_confidence = 0.75
# Path of the image that the pipeline loads with cv2.imread.
input_image = "data/Raw/2.jpg"
# Height and width (in pixels) the image is resized to before it is fed
# to the text detector.
default_height = 320
default_width = 320
# Fraction of a box's width/height used as padding around it before OCR.
default_padding = 0.15
# OCR backend applied to every detected text region.
ocr = OCR.GOOGLE_CLOUD_VISION

In [3]:
def decode_predictions(scores, geometry, min_conf=None):
    """Turn the detector's raw output volumes into boxes and confidences.

    Args:
        scores: 4-D array; ``scores[0, 0, y, x]`` is the score of
            feature-map cell ``(x, y)``.
        geometry: 4-D array; for each cell, channels 0 and 2 sum to the
            box height, channels 1 and 3 sum to the box width, and
            channel 4 is the rotation angle (radians, as passed to
            ``np.cos``/``np.sin``).
        min_conf: Cells scoring below this value are dropped. ``None``
            (the default) falls back to the module-level
            ``min_confidence``, which is what this function always used
            before the parameter existed.

    Returns:
        A tuple ``(rects, confidences)``: ``rects`` is a list of
        ``(startX, startY, endX, endY)`` integer tuples in the coordinate
        system of the detector's input image, and ``confidences`` holds
        the matching scores in the same order.
    """
    threshold = min_confidence if min_conf is None else min_conf

    num_rows = scores.shape[2]
    rects = []
    confidences = []

    for y in range(num_rows):
        # per-row views of the scores and of the five geometry channels
        row_scores = scores[0, 0, y]
        dist0 = geometry[0, 0, y]
        dist1 = geometry[0, 1, y]
        dist2 = geometry[0, 2, y]
        dist3 = geometry[0, 3, y]
        angles = geometry[0, 4, y]

        # Select the surviving columns in one pass.  ``~(s < t)`` keeps
        # exactly the cells that a per-cell ``if s < t: continue`` would
        # not skip; ``tolist()`` yields plain Python ints so the float
        # arithmetic below is unchanged.
        kept_columns = np.flatnonzero(~(row_scores < threshold)).tolist()

        for x in kept_columns:
            # the feature maps are 4x smaller than the input image
            offset_x = x * 4.0
            offset_y = y * 4.0

            angle = angles[x]
            cos = np.cos(angle)
            sin = np.sin(angle)

            box_h = dist0[x] + dist2[x]
            box_w = dist1[x] + dist3[x]

            # the end corner is derived from the rotated offsets; the
            # start corner is then found by subtracting the box size
            end_x = int(offset_x + (cos * dist1[x]) + (sin * dist2[x]))
            end_y = int(offset_y - (sin * dist1[x]) + (cos * dist2[x]))
            start_x = int(end_x - box_w)
            start_y = int(end_y - box_h)

            rects.append((start_x, start_y, end_x, end_y))
            confidences.append(row_scores[x])

    return (rects, confidences)

In [4]:
def change_contrast_and_brightness(image, alpha=1.0, beta=0):
    """Apply ``alpha * pixel + beta`` to every pixel, clipped to [0, 255].

    Args:
        image: NumPy image array. It is modified in place.
        alpha: Multiplicative gain (contrast).
        beta: Additive offset (brightness).

    Returns:
        The same ``image`` object that was passed in, after modification.
    """
    # Do the arithmetic in float64 so that integer ``alpha``/``beta``
    # cannot wrap around in the image's own integer dtype before the
    # result is clipped.
    adjusted = np.clip(alpha * image.astype(np.float64) + beta, 0, 255)
    # Write back into the caller's buffer; values are cast to the image's
    # dtype on assignment, exactly as a per-pixel assignment would do.
    image[...] = adjusted
    return image

In [10]:
def enhance_image(image, size=(300, 300), show=True):
    """Pre-process a cropped region before it is handed to OCR.

    The pipeline is: resize to ``size``, run
    ``change_contrast_and_brightness`` with its default arguments,
    convert from BGR to grayscale, then apply a 5x5 Gaussian blur.

    Alternatives that were tried and are currently disabled: non-local
    means denoising, box and median blur, and fixed / Otsu thresholding.

    Args:
        image: Colour image as a NumPy array (converted with
            ``cv2.COLOR_BGR2GRAY``, so BGR channel order is expected).
        size: Target size handed to ``cv2.resize``. Defaults to the
            previously hard-coded ``(300, 300)``.
        show: When true (the default, matching the earlier behaviour) the
            result is displayed in a "Text Detection" window and the call
            blocks until a key is pressed. Pass ``False`` for unattended
            runs.

    Returns:
        The processed single-channel image.
    """
    resized = cv2.resize(image, size)
    adjusted = change_contrast_and_brightness(resized)
    gray = cv2.cvtColor(adjusted, cv2.COLOR_BGR2GRAY)
    blurred = cv2.GaussianBlur(gray, (5, 5), 0)

    if show:
        # debugging preview; waitKey(0) blocks until a key press
        cv2.imshow("Text Detection", blurred)
        cv2.waitKey(0)
        cv2.destroyAllWindows()

    return blurred

In [6]:
def extract_text_tesseract(image):
    """Run Tesseract on ``image`` and return the recognised text.

    The image is first passed through ``enhance_image``. The Tesseract
    configuration selects English and whitelists the characters
    ``0123456789``, so the function is aimed at digit strings.

    Args:
        image: Cropped colour region to recognise.

    Returns:
        The string produced by ``pytesseract.image_to_string``.
    """
    # Point pytesseract at the Windows installation of the tesseract binary.
    pytesseract.pytesseract.tesseract_cmd = 'C:/Program Files/Tesseract-OCR/tesseract'
    # NOTE: a local variable named TESSDATA_PREFIX used to be assigned
    # here. As a plain local it was never read and had no effect, so it
    # was removed. If Tesseract cannot find its language data, set the
    # TESSDATA_PREFIX *environment variable* instead.

    prepared = enhance_image(image)

    config = "-l eng --psm 11 --oem 0 -c tessedit_char_whitelist=0123456789"
    return pytesseract.image_to_string(prepared, config=config)

In [7]:
def extract_text_gcv(image):
    """Send ``image`` to Google Cloud Vision text detection.

    Args:
        image: Image as a NumPy array that OpenCV can encode as JPEG.

    Returns:
        ``None`` when the response contains no text annotations;
        otherwise the descriptions of every annotation after the first,
        joined with ``'_'``.

    Raises:
        ValueError: If OpenCV fails to JPEG-encode the image.
    """
    client = vision.ImageAnnotatorClient()

    # Encode to JPEG in memory. This replaces a write/read/delete round
    # trip through a fixed "tmp.jpg" in the working directory, which left
    # the file behind whenever the API call raised and could collide with
    # another run using the same directory.
    encoded_ok, jpeg_buffer = cv2.imencode(".jpg", image)
    if not encoded_ok:
        raise ValueError("could not encode the image as JPEG")
    content = jpeg_buffer.tobytes()

    request_image = vision.types.Image(content=content)
    response = client.text_detection(image=request_image)

    annotations = response.text_annotations
    if len(annotations) == 0:
        return None

    # The first annotation is skipped on purpose; presumably it holds the
    # whole detected text while the rest are the individual pieces --
    # confirm against the Cloud Vision response documentation.
    return '_'.join(d.description for d in annotations[1:])

In [18]:
# End-to-end pipeline: detect text regions with EAST, OCR each region with
# the configured backend, then display every result on the original image.

# load the input image and grab the image dimensions
image = cv2.imread(input_image)
if image is None:
    # cv2.imread reports a missing or unreadable file by returning None
    # rather than raising, so fail here with a clear message
    raise FileNotFoundError("could not read input image: " + str(input_image))
orig = image.copy()
(origH, origW) = image.shape[:2]

# set the new width and height and then determine the ratio in change
# for both the width and height
(newW, newH) = (default_width, default_height)
rW = origW / float(newW)
rH = origH / float(newH)

# resize the image and grab the new image dimensions
image = cv2.resize(image, (newW, newH))
(H, W) = image.shape[:2]

# define the two output layer names for the EAST detector model that
# we are interested in -- the first is the output probabilities and the
# second can be used to derive the bounding box coordinates of text
layerNames = ["feature_fusion/Conv_7/Sigmoid", "feature_fusion/concat_3"]

# load the pre-trained EAST text detector
print("[INFO] loading EAST text detector...")
net = cv2.dnn.readNet("frozen_east_text_detection.pb")

# construct a blob from the image and then perform a forward pass of
# the model to obtain the two output layer sets
blob = cv2.dnn.blobFromImage(image, 1.0, (W, H), (123.68, 116.78, 103.94), swapRB=True, crop=False)
net.setInput(blob)
(scores, geometry) = net.forward(layerNames)

# decode the predictions, then apply non-maxima suppression to
# suppress weak, overlapping bounding boxes
(rects, confidences) = decode_predictions(scores, geometry)
boxes = non_max_suppression(np.array(rects), probs=confidences)

# initialize the list of results
results = []

print("[INFO] processing text boxes...")
# loop over the bounding boxes
for (startX, startY, endX, endY) in boxes:
    # scale the bounding box coordinates back to the original image
    # size using the respective ratios
    startX = int(startX * rW)
    startY = int(startY * rH)
    endX = int(endX * rW)
    endY = int(endY * rH)

    # in order to obtain a better OCR of the text we apply a bit of
    # padding surrounding the bounding box -- here we are computing the
    # deltas in both the x and y directions
    dX = int((endX - startX) * default_padding)
    dY = int((endY - startY) * default_padding)

    # apply the padding, clamped to the image bounds; note that the end
    # corner is extended by twice the delta while the start corner is
    # moved by one delta
    startX = max(0, startX - dX)
    startY = max(0, startY - dY)
    endX = min(origW, endX + (dX * 2))
    endY = min(origH, endY + (dY * 2))

    # extract the actual padded ROI
    roi = orig[startY:endY, startX:endX]

    # hand the ROI to the configured OCR backend; the engine-specific
    # settings live inside the respective extract_text_* function
    if ocr == OCR.TESSERACT:
        text = extract_text_tesseract(roi)
    elif ocr == OCR.GOOGLE_CLOUD_VISION:
        text = extract_text_gcv(roi)
    else:
        # without this branch `text` would be unbound on the first box or
        # silently reuse the previous box's text
        raise ValueError("unsupported OCR backend: {!r}".format(ocr))

    # add the bounding box coordinates and OCR'd text to the list
    # of results (a backend returns None when it found no text)
    if text is not None:
        results.append(((startX, startY, endX, endY), text))

# sort the results bounding box coordinates from top to bottom
results = sorted(results, key=lambda r: r[0][1])

# loop over the results
for ((startX, startY, endX, endY), text) in results:
    # strip out non-ASCII text so we can draw the text on the image
    # using OpenCV, then draw the text and a bounding box surrounding
    # the text region of the input image
    text = "".join(c for c in text if ord(c) < 128).strip()
    output = orig.copy()
    cv2.rectangle(output, (startX, startY), (endX, endY), (0, 0, 255), 2)
    cv2.putText(output, text, (startX, startY - 20), cv2.FONT_HERSHEY_SIMPLEX, 1.2, (0, 0, 255), 3)

    # show the output image; waitKey(0) blocks until a key is pressed
    cv2.imshow("Text Detection", output)
    cv2.waitKey(0)
    cv2.destroyAllWindows()
print("[INFO] Done")

[INFO] loading EAST text detector...
[INFO] processing text boxes...
[INFO] Done
