In [None]:
## Importing necessary libraries
import cv2
from google.cloud import storage
from google.cloud import vision
import os
import pandas as pd 
import re

### Initializing Google Bucket

In [None]:
# Cloud Storage bucket used by this notebook.
bucket_name = 'atu-ocr'
storage_client = storage.Client()

# Build a local handle first, then either fetch the existing bucket
# or create it when it is not there yet.
bucket = storage_client.bucket(bucket_name)
if bucket.exists():
    bucket = storage_client.get_bucket(bucket.name)
else:
    bucket = storage_client.create_bucket(bucket_name)

## Utility functions

### Extracting left and right labels from the given image.

In [None]:
def get_labels(img, left_box=(50, 330, 50, 1200), right_box=(50, 350, 3900, None)):
    """Crop the left and right label regions out of an image.

    Args:
        img: Image array indexed as ``img[row, column]`` (the caller passes
            the result of ``cv2.imread``).
        left_box: ``(top, bottom, left, right)`` pixel bounds of the left
            label. Resolutions may vary, so adjust this to the dataset.
        right_box: ``(top, bottom, left, right)`` pixel bounds of the right
            label. A bound of ``None`` means "up to the image edge"; the
            default right edge is the full image width.

    Returns:
        A ``(left_label, right_label)`` tuple of cropped image regions.

    Raises:
        TypeError: if ``img`` is ``None`` (e.g. the file could not be read).
    """
    if img is None:
        # Slicing None would raise a cryptic "'NoneType' object is not
        # subscriptable"; keep the exception type but say what went wrong.
        raise TypeError("img is None - the image could not be read")

    top, bottom, left, right = left_box
    left_label = img[top:bottom, left:right]

    top, bottom, left, right = right_box
    right_label = img[top:bottom, left:right]

    return left_label, right_label

### Extracting text from given image. Parameter "uri" is the Google Cloud Storage URI of the image.

In [None]:
def detect_text_uri(uri, client=None):
    """Detects text in the file located in Google Cloud Storage or on the Web.

    Adapted from the Google Cloud Vision API documentation.

    Args:
        uri: Location of the image (the notebook passes ``gs://...`` URIs).
        client: Optional ``vision.ImageAnnotatorClient`` to reuse between
            calls. A new client is created when omitted.

    Returns:
        The ``description`` of the first text annotation in the response
        (per the Vision documentation this is the complete detected text).

    Raises:
        Exception: if the API response carries an error message.
        IndexError: if the response contains no text annotations.
    """
    if client is None:
        client = vision.ImageAnnotatorClient()
    image = vision.Image()
    image.source.image_uri = uri

    response = client.text_detection(image=image)

    if response.error.message:
        raise Exception(
            '{}\nFor more info on error messages, check: '
            'https://cloud.google.com/apis/design/errors'.format(
                response.error.message))

    texts = response.text_annotations
    if len(texts) == 0:
        # Previously this surfaced as a bare "list index out of range".
        # The exception type is kept so existing handlers still match.
        raise IndexError('No text detected in image: {}'.format(uri))
    return texts[0].description

### Correcting OCR errors in right label text.

In [None]:
def validate_right_label(right_label_text):
    """Clean up the OCR text of the right label and fix confusable characters.

    Desired format: ``[2 or 3 letters]-[one digit followed by one letter]-[1 to 3 digits]``,
    e.g. ``IN-3B-166``. OCR may read it as ``IN-3B 166`` and may add leading
    noise or line breaks.

    Args:
        right_label_text: Raw text returned by OCR for the right label.

    Returns:
        The corrected label as ``first-second-third``, or the string
        ``"Error"`` when the text cannot be split into three non-empty parts
        (the reason is printed).
    """
    # Letter -> digit pairs that OCR mixes up, plus the inverse mapping.
    letter_to_digit = {
        'G': '0',
        'I': '1',
    }
    digit_to_letter = {digit: letter for letter, digit in letter_to_digit.items()}
    to_digits = str.maketrans(letter_to_digit)
    to_letters = str.maketrans(digit_to_letter)

    try:
        # Removes extra noise from right label text
        label = re.sub(r"^[^A-Z]*", "", right_label_text)
        # Drop surrounding whitespace (OCR output often ends with a newline)
        # and turn every whitespace run into a single hyphen before splitting.
        label = re.sub(r"\s+", "-", label.strip())

        parts = label.split('-')
        if len(parts) < 3:
            raise Exception("Right label is not in the correct format")
        first_part, second_part, third_part = parts[0], parts[1], parts[2]

        # First part: letters only, so digits are mapped back to letters.
        first_part = first_part.translate(to_letters)
        # Second part: position 0 must be a digit, the rest letters. Each
        # position is corrected on its own - a whole-string replace would
        # also rewrite the other positions (e.g. "II" must become "1I").
        second_part = second_part[:1].translate(to_digits) + second_part[1:].translate(to_letters)
        # Third part: digits only, so letters are mapped to digits.
        third_part = third_part.translate(to_digits)

        if len(first_part) < 2 or len(second_part) < 1 or len(third_part) < 1:
            raise Exception("Right label is not in the correct format")

        return first_part + '-' + second_part + '-' + third_part
    except Exception as e:
        # Best effort by design: report the problem and return a sentinel.
        print("Exception: " + str(e))
        return "Error"

### Correcting OCR errors in left label text.

In [None]:
def validate_left_label(left_label_text):
    """Normalise the OCR text of the left label.

    Line feeds, carriage returns and apostrophes are dropped, and every
    space is turned into a hyphen.

    Args:
        left_label_text: Raw text returned by OCR for the left label.

    Returns:
        The cleaned label string.
    """
    # Strip line breaks first so the label becomes a single line.
    single_line = re.sub(r'\n|\r', '', left_label_text)
    # Remove apostrophes, then hyphenate the remaining spaces.
    return single_line.replace("'", "").replace(' ', '-')

### Iterating over files in the "input" folder

In [None]:
# One record per successfully read image. The DataFrame is built once after
# the loop instead of being re-concatenated on every iteration.
records = []
temp_files = ('left_label.jpg', 'right_label.jpg')

# cv2.imwrite only returns False when the target folder is missing,
# so make sure it exists up front.
os.makedirs('output', exist_ok=True)

for original_file_name in os.listdir('input'):
    file_name = os.path.join('input', original_file_name)
    try:
        print("Processing: " + file_name)
        img = cv2.imread(file_name)
        # Cropping out right label area and left label area as separate pictures.
        left_label_img, right_label_img = get_labels(img)
        cv2.imwrite('left_label.jpg', left_label_img)
        cv2.imwrite('right_label.jpg', right_label_img)

        # Uploading temporary cropped out pictures to Google Cloud Storage for further processing.
        bucket.blob("left_label.jpg").upload_from_filename("left_label.jpg")
        bucket.blob("right_label.jpg").upload_from_filename("right_label.jpg")
        left_label_gs_uri = 'gs://' + bucket_name + '/left_label.jpg'
        right_label_gs_uri = 'gs://' + bucket_name + '/right_label.jpg'

        # Running Google Cloud Vision OCR API to extract text from two labels.
        left_label_text = validate_left_label(detect_text_uri(left_label_gs_uri))
        right_label_text = validate_right_label(detect_text_uri(right_label_gs_uri))
        print("Left label: " + left_label_text)
        print("Right label: " + right_label_text + '\n')

        records.append({
            'filename': original_file_name,
            'left_label': left_label_text,
            'right_label': right_label_text,
        })

        # Saving a copy named after the extracted labels to the output folder.
        # splitext keeps the real extension even when the name contains
        # several dots (split('.')[1] would pick the wrong piece).
        extension = os.path.splitext(original_file_name)[1]
        output_name = left_label_text + " && " + right_label_text + extension
        if not cv2.imwrite(os.path.join('output', output_name), img):
            raise IOError("Could not write output file: " + output_name)

    except Exception as e:
        # Best effort: report the failing input file and move on to the next one.
        print("Error processing: " + file_name)
        print(e)
    finally:
        # Deleting temporary files, also when the iteration failed half-way.
        for temp_file in temp_files:
            if os.path.exists(temp_file):
                os.remove(temp_file)

df = pd.DataFrame(records, columns=['filename', 'left_label', 'right_label'])

In [None]:
# Write the collected OCR results to CSV. The target folder is created first
# because DataFrame.to_csv does not create missing directories.
os.makedirs('./test', exist_ok=True)
df.to_csv('./test/google_ocr_results.csv', index=False)
print("File processing completed.")