In [1]:
#from vision import VisionApi

In [25]:
# %load vision.py
import base64
import os
import sys
import math
from PIL import Image
from PIL import ImageDraw
from PIL import ImageFont
from difflib import SequenceMatcher

import cv2
import numpy as np
import matplotlib.pyplot as plt

from googleapiclient import discovery 
from googleapiclient import errors
from oauth2client.client import GoogleCredentials

DISCOVERY_URL = 'https://{api}.googleapis.com/$discovery/rest?version={apiVersion}'  # noqa
os.environ['GOOGLE_APPLICATION_CREDENTIALS']='saurabh_google_api_credential.json'


class VisionApi:
    """Construct and use the Google Vision API service.
	----- Usage
	vision = VisionApi()
	vision.detect_text_and_output_cropped_image('language_images/written_word_2_ball.jpg', 'ball', 'correct_new.jpg')
	or
	vision.detect_text_and_output_cropped_image('language_images/written_word_2_ball.jpg', 'ball')
    """
    def __init__(self):
        self.credentials = GoogleCredentials.get_application_default()
        self.service = discovery.build(
            'vision', 
            'v1', 
            credentials=self.credentials,
            discoveryServiceUrl=DISCOVERY_URL
        )

    def detect_text(self, input_filenames, num_retries=3, max_results=6):
        """Uses the Vision API to detect text in the given file.
        """
        images = {}
        for filename in input_filenames:
            print(filename)
            assert(os.path.exists(filename))
            with open(filename, 'rb') as image_file:
                images[filename] = image_file.read()

        batch_request = []
        for filename in images:
            batch_request.append({
                'image': {
                    'content': base64.b64encode(images[filename]).decode('UTF-8')
                },
                'features': [{
                    'type': 'TEXT_DETECTION',  #Tells Vision API that we're making a request to do OCR, else 'LABEL_DETECTION'
                    'maxResults': max_results,
                }]
            })
        request = self.service.images().annotate(body={'requests': batch_request})

        try:
            responses = request.execute(num_retries=num_retries)
            if 'responses' not in responses:
                return {}
            text_response = {}
            for filename, response in zip(images, responses['responses']):
                if 'error' in response:
                    print("API Error for %s: %s" % (
                            filename,
                            response['error']['message']
                            if 'message' in response['error']
                            else ''))
                    continue
                if 'textAnnotations' in response:
                    text_response[filename] = response['textAnnotations']
                else:
                    text_response[filename] = []
            return text_response
        except errors.HttpError as e:
            print("Http Error for %s: %s" % (filename, e))
        except KeyError as e2:
            print("Key error: %s" % e2)

    def detect_text_from_image(self, input_image, num_retries=3, max_results=6):
        """Uses the Vision API to detect text in the given file.
        """
        batch_request = []
        batch_request.append({
            'image': {
                'content': base64.b64encode(input_image).decode('UTF-8')
            },
            'features': [{
                'type': 'TEXT_DETECTION',  #Tells Vision API that we're making a request to do OCR, else 'LABEL_DETECTION'
                'maxResults': max_results,
            }]
        })
        request = self.service.images().annotate(body={'requests': batch_request})

        try:
            responses = request.execute(num_retries=num_retries)
            if 'responses' not in responses:
                return {}
            text_response = []
            for response in responses['responses']:
                if 'error' in response:
                    print("API Error : %s" % (
                            response['error']['message']
                            if 'message' in response['error']
                            else ''))
                    continue
                if 'textAnnotations' in response:
                    text_response.append(response['textAnnotations'])
                else:
                    text_response.append([])
            return text_response
        except errors.HttpError as e:
            print("Http Error for %s: %s" % (filename, e))
        except KeyError as e2:
            print("Key error: %s" % e2)        

    def detect_label(self, input_filenames, num_retries=3, max_results=10):
        """Uses the Vision API to detect text in the given file.
        """
        images = {}
        for filename in input_filenames:
            print(filename)
            assert(os.path.exists(filename))
            with open(filename, 'rb') as image_file:
                images[filename] = image_file.read()

        batch_request = []
        for filename in images:
            batch_request.append({
                'image': {
                    'content': base64.b64encode(images[filename]).decode('UTF-8')
                },
                'features': [{
                    'type': 'LABEL_DETECTION',  #Tells Vision API that we're making a request to do OCR, else 'LABEL_DETECTION'
                    'maxResults': max_results,
                }]
            })
        request = self.service.images().annotate(body={'requests': batch_request})

        try:
            responses = request.execute(num_retries=num_retries)
            if 'responses' not in responses:
                return {}
            label_response = {}
            for filename, response in zip(images, responses['responses']):
                if 'error' in response:
                    print("API Error for %s: %s" % (
                            filename,
                            response['error']['message']
                            if 'message' in response['error']
                            else ''))
                    continue
                if 'labelAnnotations' in response:
                    label_response[filename] = response['labelAnnotations']
                else:
                    label_response[filename] = []
            return label_response
        except errors.HttpError as e:
            print("Http Error for %s: %s" % (filename, e))
        except KeyError as e2:
            print("Key error: %s" % e2)

    def highlight_texts(self, image_filename, text_detection_response, expected_text, output_filename=None):
        """Draws a polygon around the faces, then saves to output_filename.
        Args:
          image: a file containing the image with the faces.
          faces: a list of faces found in the file. This should be in the format
              returned by the Vision API.
          output_filename: the name of the image file to be created, where the
              faces have polygons drawn around them.
        """
        if text_detection_response == [] or text_detection_response == None:
            return
        im = Image.open(image_filename)
        draw = ImageDraw.Draw(im)

        # Zeroth response is just an aggregation of all the responses.
        # So not considering it
        num_text_regions = len(text_detection_response) - 1
        similarity_ratio = [0.0]*num_text_regions
        print('############## ', image_filename)
        for i, text_region in enumerate(text_detection_response[1:]):
            text = text_region['description']
            similarity_ratio[i] = SequenceMatcher(None, expected_text.upper(), text.upper()).ratio()
            print(i, text, expected_text, similarity_ratio)

        best_match_similarity = max(similarity_ratio)
        best_match_index = similarity_ratio.index(best_match_similarity)

        text_region = text_detection_response[best_match_index + 1]
        box = [(v.get('x', 0.0), v.get('y', 0.0)) for v in text_region['boundingPoly']['vertices']]
        print(box)
        if best_match_similarity == 1:
            color = '#00ff00'
        else:
            font_size = 80
            font = ImageFont.truetype("calibri.ttf", font_size)
            text_area = (box[0][0], box[0][1] - font_size - 10)
            draw.text(text_area, expected_text, (0,255,0), font=font)
            color = '#ff0000'
        draw.line(box + [box[0]], width=5, fill=color)
        im.save(output_filename)

    def crop_and_highlight_texts(self, image_filename, text_detection_response, expected_text, output_filename=None):
        """Draws a polygon around the faces, then saves to output_filename.
        Args:
          image: a file containing the image with the faces.
          faces: a list of faces found in the file. This should be in the format
              returned by the Vision API.
          output_filename: the name of the image file to be created, where the
              faces have polygons drawn around them.
        """
        if text_detection_response == [] or text_detection_response == None:
            return
        im = Image.open(image_filename)
        draw = ImageDraw.Draw(im)

        # Zeroth response is just an aggregation of all the responses.
        # So not considering it
        num_text_regions = len(text_detection_response) - 1
        similarity_ratio = [0.0]*num_text_regions
        print('############## ', image_filename)
        for i, text_region in enumerate(text_detection_response[1:]):
            text = text_region['description']
            similarity_ratio[i] = SequenceMatcher(None, expected_text.upper(), text.upper()).ratio()
            print(i, text, expected_text, similarity_ratio)

        best_match_similarity = max(similarity_ratio)
        best_match_index = similarity_ratio.index(best_match_similarity)

        text_region = text_detection_response[best_match_index + 1]
        box = [(v.get('x', 0.0), v.get('y', 0.0)) for v in text_region['boundingPoly']['vertices']]
        print(box)
        print(box + [box[0]])
        print(type(im), type(draw))
        if best_match_similarity == 1:
            color = '#00ff00'
            font_size = 0
        else:
            font_size = 80
            font = ImageFont.truetype("calibri.ttf", font_size)
            text_area = (box[0][0], box[0][1] - font_size - 10)
            draw.text(text_area, expected_text, (0,255,0), font=font)
            color = '#ff0000'
        draw.line(box + [box[0]], width=5, fill=color)
        
        box_array = np.asarray(box).reshape(4,2)
        minArgIndex = np.argmin(box_array, axis=0)
        maxArgIndex = np.argmax(box_array, axis=0)

        ### This logic is not clean: it assumes the paper is rotated in a specific way - this can be taken care of though by having checks
        ## Or it might make more sense to rotate it, crop it, then find contours - atleast now we know the paper has no rotation !
        top_left     = box_array[minArgIndex[0]]
        bottom_right = box_array[maxArgIndex[0]]

        top_right    = box_array[minArgIndex[1]]
        bottom_left  = box_array[maxArgIndex[1]]

        min_x = box_array[minArgIndex[0]][0]
        min_y = box_array[minArgIndex[1]][1]
        max_x = box_array[maxArgIndex[0]][0]
        max_y = box_array[maxArgIndex[1]][1]

        margin = 10
        cropped_im = im.crop((min_x - margin, min_y - 2*margin - font_size, 
                              max_x + margin, max_y + margin))

        # Get it into Aspect ratio - 4:3
#        crop_width, crop_height = cropped_im.size
#        crop_height = int((crop_width*3.0)/4.0)
        
#        resized_im = cropped_im.resize((crop_width, crop_height), resample=Image.LANCZOS)
#        resized_im.save(output_filename)
        cropped_im.save(output_filename)

    def detect_text_and_output_cropped_image(self, image_filename, expected_text, output_filename=None):
        responses = self.detect_text([image_filename])
        if not output_filename:
            output_filename = '.'.join(['_'.join([os.path.splitext(i)[0], 'processed']), 'jpg'])
        self.crop_and_highlight_texts(image_filename, responses[image_filename], expected_text, output_filename)

    def detect_label_and_output_string(self, image_filename):
        responses = self.detect_label([image_filename])
        # Expecting only a single response
        string_list = []
        threshold = 0.80
        for k, vs in responses.items():
            for v in vs:
                if v['score'] >= threshold:
                    string_list.append(v['description'])
            break
        return string_list

In [26]:
vision = VisionApi()
image = '../language_images/written_word_2_ball.jpg'
a = vision.detect_label_and_output_string(image)
print(a)

../language_images/written_word_2_ball.jpg
['text', 'handwriting', 'font']


In [18]:
vision = VisionApi()
image = '../language_images/written_word_2_ball.jpg'
#vision.detect_text_and_output_cropped_image(image, 'ball', 'correct_new3.jpg')
vision.detect_text_and_output_cropped_image(image, 'call', 'wrong_new.jpg')
#responses = vision.detect_text([image])
#crop_and_highlight_texts(image, responses[image], image_text[image][0], 'correct.jpg')

../language_images/written_word_2_ball.jpg
##############  ../language_images/written_word_2_ball.jpg
0 Ball call [0.75]
[(695, 354), (1119, 372), (1111, 547), (687, 529)]
[(695, 354), (1119, 372), (1111, 547), (687, 529), (695, 354)]
<class 'PIL.JpegImagePlugin.JpegImageFile'> <class 'PIL.ImageDraw.ImageDraw'>


In [16]:
images = ['../language_images/test/background.jpg', 
          '../language_images/test/white_background.jpg', 
          '../language_images/test/far_away_image.jpg', 
          '../language_images/test/reflection.jpg',
          '../language_images/test/too_much_text.jpg']

image_text = {}
image_text['../language_images/test/background.jpg'] = ['BALL']
image_text['../language_images/test/white_background.jpg'] = ['BALL']
image_text['../language_images/test/far_away_image.jpg'] = ['Akshay', 'Ball', 'Elephant']
image_text['../language_images/test/reflection.jpg'] = ['BALL']
image_text['../language_images/test/too_much_text.jpg'] = ['Introduction', 'desktop', 'described']

In [6]:
vision = VisionApi()
responses = vision.detect_text(images)

../language_images/test/background.jpg
../language_images/test/white_background.jpg
../language_images/test/far_away_image.jpg
../language_images/test/reflection.jpg
../language_images/test/too_much_text.jpg


In [11]:
for i in images:
    print('-------', i)
#    print(responses[i])
    output_filename = '_'.join([os.path.splitext(i)[0], 'output']) + '.jpg'
    vision.highlight_texts(i, responses[i], image_text[i][0], output_filename)

------- ../language_images/test/background.jpg
##############  ../language_images/test/background.jpg
0 BALL BALL [1.0]
[(857, 1018), (929, 876), (975, 900), (903, 1042)]
######## Going to save file  ../language_images/test/background_output.jpg
------- ../language_images/test/white_background.jpg
##############  ../language_images/test/white_background.jpg
0 BALL BALL [1.0]
[(793, 354), (1057, 324), (1066, 403), (802, 433)]
######## Going to save file  ../language_images/test/white_background_output.jpg
------- ../language_images/test/far_away_image.jpg
##############  ../language_images/test/far_away_image.jpg
0 Akshay Akshay [1.0, 0.0, 0.0, 0.0]
1 all Akshay [1.0, 0.2222222222222222, 0.0, 0.0]
2 03 Akshay [1.0, 0.2222222222222222, 0.0, 0.0]
3 Elephant Akshay [1.0, 0.2222222222222222, 0.0, 0.2857142857142857]
[(662, 1045), (816, 998), (826, 1029), (672, 1077)]
######## Going to save file  ../language_images/test/far_away_image_output.jpg
------- ../language_images/test/reflection.jpg

In [12]:
i = '../language_images/test/far_away_image.jpg'
output_filename = '_'.join([os.path.splitext(i)[0], 'output1']) + '.jpg'
vision.highlight_texts(i, responses[i], image_text[i][1], output_filename)

##############  ../language_images/test/far_away_image.jpg
0 Akshay Ball [0.2, 0.0, 0.0, 0.0]
1 all Ball [0.2, 0.8571428571428571, 0.0, 0.0]
2 03 Ball [0.2, 0.8571428571428571, 0.0, 0.0]
3 Elephant Ball [0.2, 0.8571428571428571, 0.0, 0.16666666666666666]
[(1344, 883), (1404, 864), (1413, 892), (1353, 911)]
######## Going to save file  ../language_images/test/far_away_image_output1.jpg


In [17]:
i = '../language_images/test/too_much_text.jpg'
output_filename = '_'.join([os.path.splitext(i)[0], 'output1']) + '.jpg'
vision.highlight_texts(i, responses[i], image_text[i][2], output_filename)

##############  ../language_images/test/too_much_text.jpg
0 Cambridge described [0.4444444444444444, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,

In [None]:
def preprocess_image_for_text_detection(image_filename):
    image = cv2.imread(image_filename, cv2.IMREAD_COLOR)
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image_hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
    image_hsv_gray = image_hsv[:, :, 2]
    threshold, image_hsv_1_threshold = cv2.threshold(image_hsv[:, :, 1], 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
    image_hsv_gray_2 = image_hsv_gray.copy()
    image_hsv_gray_2[image_hsv[:, :, 1] > 50] = 0
    image_hsv_gray_2[image_hsv[:, :, 2] < 150] = 0
    threshold, image_threshold = cv2.threshold(image_hsv_gray_2, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
    image_threshold_c = image_threshold.copy()
    im2, contours, hierarchy = cv2.findContours(image_threshold_c, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) #cv2.RETR_TREE

    # Find index of longest contour
    maxLen = 0
    maxIndex = -1
    for i, contour in enumerate(contours):
        if len(contour) > maxLen:
            maxLen = len(contour)
            maxIndex = i
    print(maxIndex, maxLen)
    
    
    
    