## Importing required libraries

In [None]:
import cv2
import pytesseract
from PIL import Image
import numpy as np
import os
import pandas as pd 
import re

# Tell pytesseract where the Tesseract executable lives. This is a hard-coded
# Windows install path; change it to match the local installation.
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

## Utility functions

In [None]:
def get_labels(img):
    """Crop the left and right label regions out of a full image.

    The crop windows are fixed pixel coordinates, so they may need adjusting
    to match the resolution of the dataset being processed.

    Args:
        img: Image array indexed as ``img[rows, columns]`` that exposes
            ``shape``.

    Returns:
        A ``(left_label, right_label)`` tuple of cropped views of ``img``.
    """
    image_width = img.shape[1]
    # Left label: rows 50-329, columns 50-1199.
    left_rows, left_cols = slice(50, 330), slice(50, 1200)
    # Right label: rows 50-349, from column 3900 to the right edge.
    right_rows, right_cols = slice(50, 350), slice(3900, image_width)
    return img[left_rows, left_cols], img[right_rows, right_cols]

In [None]:
def validate_right_label(right_label_text):
    """Clean the OCR text of the right label and fix commonly confused characters.

    The desired format is ``[2-3 letters]-[one digit + one letter]-[1-3 digits]``,
    e.g. ``IN-3B-166``. OCR frequently confuses ``1`` with ``I`` and ``0`` with
    ``G``, so each character is corrected according to what its *position*
    should hold:

    * first part: letters only (``0`` -> ``G``, ``1`` -> ``I``);
    * second part: first character is a digit (``G`` -> ``0``, ``I`` -> ``1``),
      the remaining characters are letters;
    * third part: digits only (``G`` -> ``0``, ``I`` -> ``1``).

    Args:
        right_label_text: Raw text returned by the OCR engine.

    Returns:
        The corrected label joined with hyphens, or the string ``"Error"`` when
        the text does not split into three parts, or when the first part has
        fewer than two characters.
    """
    print("Original Right Label Text: " + right_label_text)
    # Letter that OCR reports -> digit that was actually printed, and the reverse.
    letter_to_digit = {'G': '0', 'I': '1'}
    digit_to_letter = {digit: letter for letter, digit in letter_to_digit.items()}
    try:
        # Drop leading noise (everything before the first capital letter) and
        # the trailing whitespace / form feed that OCR output usually ends with.
        label = re.sub(r"^[^A-Z]*", "", right_label_text).strip()
        # The label may be read as "IN-3B 166": treat any run of whitespace
        # and/or hyphens as a single separator so no empty parts are produced.
        parts = re.split(r"[\s-]+", label)
        first_part, second_part, third_part = parts[0], parts[1], parts[2]

        if len(first_part) < 2 or len(second_part) < 1 or len(third_part) < 1:
            raise ValueError("Right label is not in the correct format")

        # Corrections are applied per position. A blanket str.replace would
        # also rewrite characters that are already correct elsewhere in the
        # same part (e.g. turning "1I" into "II").
        first_part = ''.join(digit_to_letter.get(ch, ch) for ch in first_part)
        second_part = (
            letter_to_digit.get(second_part[0], second_part[0])
            + ''.join(digit_to_letter.get(ch, ch) for ch in second_part[1:])
        )
        third_part = ''.join(letter_to_digit.get(ch, ch) for ch in third_part)

        return first_part + '-' + second_part + '-' + third_part
    except (IndexError, ValueError) as e:
        print("Exception: " + str(e))
        return "Error"

In [None]:
def validate_left_label(left_label_text):
    """Normalise the OCR text of the left label.

    Newlines, carriage returns and apostrophes are removed, surrounding
    whitespace (including the form feed OCR output may end with) is stripped,
    and the remaining spaces are replaced with hyphens.

    Args:
        left_label_text: Raw text returned by the OCR engine.

    Returns:
        The cleaned label, or the string ``"Error"`` when nothing is left after
        cleaning or the input could not be processed.
    """
    try:
        print("Original left label text: " + left_label_text)
        # Removes all newlines and carriage returns
        cleaned = re.sub(r'\n|\r', '', left_label_text)
        cleaned = cleaned.replace("'", "")
        # Strip before the emptiness check so whitespace-only OCR output
        # (e.g. a lone form feed) is reported as empty rather than returned.
        cleaned = cleaned.strip()
        # Replaces whitespace with hyphen
        cleaned = cleaned.replace(' ', '-')
        if not cleaned:
            raise ValueError("Left label is empty")

        return cleaned
    except Exception as e:
        # Broad on purpose: any failure (including non-string input) maps to
        # the "Error" marker the caller expects.
        print("Exception: " + str(e))
        return "Error"

In [None]:
# Credit to https://github.com/yardstick17/image_text_reader
import cv2
import logging
import numpy as np
from PIL import Image
import tempfile

# Target minimum width (in pixels) that images are scaled up towards before OCR.
IMAGE_SIZE = 1800
# Fixed grey level used for the first binarisation pass.
BINARY_THREHOLD = 180

# Retained only so existing references to this module-level name keep working.
# It is no longer used as a cache: caching the first image's size made every
# later image get resized to the wrong dimensions.
size = None


def get_size_of_scaled_image(im):
    """Return the size *im* should be resized to before OCR.

    The image is scaled up by the largest whole-number factor that keeps its
    width at or below ``IMAGE_SIZE``; images already wider than that keep
    their original size. The aspect ratio is always preserved.

    Args:
        im: Object exposing ``size`` as a ``(width, height)`` pair.

    Returns:
        A ``(width, height)`` tuple computed from this image's own dimensions.
    """
    length_x, width_y = im.size
    factor = max(1, IMAGE_SIZE // length_x)
    return factor * length_x, factor * width_y


def process_image_for_ocr(file_path):
    """Rescale and denoise the image at *file_path* for OCR.

    The image is first written to a temporary file by ``set_image_dpi`` and
    that copy is then cleaned up by ``remove_noise_and_smooth``.

    Args:
        file_path: Path of the image to preprocess.

    Returns:
        The image returned by ``remove_noise_and_smooth``.
    """
    temp_filename = set_image_dpi(file_path)
    try:
        return remove_noise_and_smooth(temp_filename)
    finally:
        # The intermediate file is created with delete=False, so it must be
        # removed here or one file is leaked per processed image.
        try:
            os.remove(temp_filename)
        except OSError as e:
            # Best-effort cleanup: a leftover temp file must not fail the OCR run.
            logging.warning('Could not remove temporary file %s: %s', temp_filename, e)


def set_image_dpi(file_path):
    """Resize the image at *file_path* and save it as a 300 DPI temporary JPEG.

    Args:
        file_path: Path of the image to rescale.

    Returns:
        Path of the temporary ``.jpg`` file holding the resized image. The
        file is not deleted automatically; the caller is responsible for
        removing it.
    """
    # Close the source image as soon as the resized copy exists.
    with Image.open(file_path) as im:
        size = get_size_of_scaled_image(im)
        # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter.
        im_resized = im.resize(size, Image.LANCZOS)
    # Only the unique name is needed, so release the handle right away and let
    # Pillow reopen the path for writing.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.jpg') as temp_file:
        temp_filename = temp_file.name
    im_resized.save(temp_filename, dpi=(300, 300))  # best for OCR
    return temp_filename

def image_smoothening(img):
    """Binarise *img* in three thresholding passes with a blur before the last.

    The first pass uses the fixed ``BINARY_THREHOLD`` level; the second and
    third use Otsu thresholding, with a Gaussian blur applied in between.

    Args:
        img: Image array to binarise.

    Returns:
        The thresholded image produced by the final pass.
    """
    otsu_flags = cv2.THRESH_BINARY + cv2.THRESH_OTSU
    _, fixed_binary = cv2.threshold(img, BINARY_THREHOLD, 255, cv2.THRESH_BINARY)
    _, otsu_binary = cv2.threshold(fixed_binary, 0, 255, otsu_flags)
    blurred = cv2.GaussianBlur(otsu_binary, (1, 1), 0)
    return cv2.threshold(blurred, 0, 255, otsu_flags)[1]


def remove_noise_and_smooth(file_name):
    """Load an image and combine its morphologically cleaned and binarised forms.

    Args:
        file_name: Path of the image to load.

    Returns:
        The bitwise OR of the binarised image (from ``image_smoothening``) and
        the opened-then-closed image.
    """
    logging.info('Removing noise and smoothening image')
    # Flag 0 is OpenCV's grayscale read mode.
    grayscale = cv2.imread(file_name, 0)
    unit_kernel = np.ones((1, 1), np.uint8)
    opened = cv2.morphologyEx(grayscale, cv2.MORPH_OPEN, unit_kernel)
    closed = cv2.morphologyEx(opened, cv2.MORPH_CLOSE, unit_kernel)
    smoothed = image_smoothening(grayscale)
    return cv2.bitwise_or(smoothed, closed)

## Tesseract OCR

In [None]:
# Run OCR on every image in 'input': crop both label areas, preprocess them,
# read their text with Tesseract, record the result, and save a copy of the
# original image named after the extracted labels in 'output_pytesseract'.
result_columns = ['filename', 'left_label', 'right_label']
result_rows = []
# Create the output folder up front so the renamed copies have somewhere to go.
os.makedirs('output_pytesseract', exist_ok=True)
for original_file_name in os.listdir('input'):
    file_name = os.path.join('input', original_file_name)
    try:
        print("Processing: " + file_name)
        img = cv2.imread(file_name)
        if img is None:
            # Fail with a clear message for unreadable or non-image files.
            raise ValueError("Could not read image")
        # Cropping out right label area and left label area as separate pictures.
        left_label_img, right_label_img = get_labels(img)

        cv2.imwrite('left_label.jpg', left_label_img)
        cv2.imwrite('right_label.jpg', right_label_img)

        # Preprocess image for OCR
        left_label_img = process_image_for_ocr('left_label.jpg')
        right_label_img = process_image_for_ocr('right_label.jpg')

        cv2.imwrite('left_label.jpg', left_label_img)
        cv2.imwrite('right_label.jpg', right_label_img)

        # Running PyTesseract to extract text from two labels.
        with Image.open('left_label.jpg') as left_label_pil:
            left_label_text = pytesseract.image_to_string(left_label_pil)
        with Image.open('right_label.jpg') as right_label_pil:
            right_label_text = pytesseract.image_to_string(right_label_pil)
        left_label_text = validate_left_label(left_label_text)
        right_label_text = validate_right_label(right_label_text)
        print("Left label: " + left_label_text)
        print("Right label: " + right_label_text + '\n')

        # Rows are collected in a list and turned into a DataFrame once at the
        # end; concatenating inside the loop copies the whole frame each time.
        result_rows.append([original_file_name, left_label_text, right_label_text])

        # Renaming original files with extracted labels. splitext keeps the
        # real extension even when the name contains other dots.
        extension = os.path.splitext(original_file_name)[1]
        output_name = left_label_text + " && " + right_label_text + extension
        # Saving file to output folder
        cv2.imwrite(os.path.join('output_pytesseract', output_name), img)

    except Exception as e:
        # One bad file must not stop the batch: report it and move on.
        print("Error processing: " + file_name)
        print(e)
    finally:
        # Deleting temporary files, also when processing failed part-way.
        for temp_path in ('left_label.jpg', 'right_label.jpg'):
            if os.path.exists(temp_path):
                os.remove(temp_path)

df = pd.DataFrame(result_rows, columns=result_columns)

In [None]:
# Write the collected OCR results to CSV. The target folder is created first so
# a missing './test' directory does not discard the results of the whole run.
results_csv_path = './test/pytesseract_ocr_results.csv'
os.makedirs(os.path.dirname(results_csv_path), exist_ok=True)
df.to_csv(results_csv_path, index=False)
print("File processing completed.")