3. Exploring possible improvements to text recognition quality

In [1]:
from azure.cognitiveservices.vision.computervision import ComputerVisionClient
from azure.cognitiveservices.vision.computervision.models import OperationStatusCodes
from msrest.authentication import CognitiveServicesCredentials
import os
from PIL import Image, ImageFilter, ImageEnhance
import time

# --- Authenticate -----------------------------------------------------------
# Read the Computer Vision credentials from the environment and build the
# client used by the OCR request further down. A missing VISION_KEY or
# VISION_ENDPOINT variable raises KeyError on the corresponding lookup.
subscription_key = os.environ["VISION_KEY"]
endpoint = os.environ["VISION_ENDPOINT"]
computervision_client = ComputerVisionClient(
    endpoint,
    CognitiveServicesCredentials(subscription_key),
)
# --- END - Authenticate -----------------------------------------------------


def preprocess_image_to_grayscale(
    image_path,
    output_path="data/images/preprocessed_image.jpg",
    contrast_factor=2,
    sharpness_factor=2,
):
    """Convert an image to grayscale, denoise and enhance it, then save it.

    The pipeline is: grayscale conversion (mode "L"), a median filter with
    Pillow's default settings, a contrast boost, and a sharpness boost.

    Args:
        image_path: Path of the image to preprocess.
        output_path: Where the preprocessed image is written. Defaults to the
            location the notebook has always used.
        contrast_factor: Factor passed to ``ImageEnhance.Contrast.enhance``.
        sharpness_factor: Factor passed to ``ImageEnhance.Sharpness.enhance``.

    Returns:
        The path the preprocessed image was saved to (``output_path``).
    """
    # convert() produces a new image object, so the handle on the source file
    # can be released as soon as the conversion is done.
    with Image.open(image_path) as source:
        gray = source.convert("L")

    denoised = gray.filter(ImageFilter.MedianFilter())
    contrasted = ImageEnhance.Contrast(denoised).enhance(contrast_factor)
    sharpened = ImageEnhance.Sharpness(contrasted).enhance(sharpness_factor)

    sharpened.save(output_path)
    return output_path

def resize_image(image_path, max_size=(500, 500), output_path="resized_image.jpg"):
    """Shrink an image with ``Image.thumbnail`` and save the result.

    Args:
        image_path: Path of the image to resize.
        max_size: ``(width, height)`` bound passed to ``Image.thumbnail``.
        output_path: Where the resized image is written. Defaults to the
            file name the notebook has always used.

    Returns:
        The path the resized image was saved to (``output_path``), so callers
        do not have to hard-code it.
    """
    # thumbnail() modifies the image in place, so both the resize and the save
    # happen while the source file is still open; the handle is closed on exit.
    with Image.open(image_path) as source:
        source.thumbnail(max_size)
        source.save(output_path)
    return output_path


image_path = "data/images/test2.jpeg"

# Using a preprocessed image to grayscale
processed_image_path = preprocess_image_to_grayscale(image_path)

# Using a resized image
# ---------------------
#resize_image(image_path)
#processed_image_path = "resized_image.jpg"

# Submit the image to the asynchronous Read operation. The file is opened in a
# `with` block so the handle is closed once the request has been sent.
# NOTE(review): `mode="Printed"` does not appear to be a documented parameter
# of read_in_stream - confirm whether it has any effect.
with open(processed_image_path, "rb") as img:
    read_response = computervision_client.read_in_stream(
        image=img,
        mode="Printed",
        raw=True
    )

# The operation id is the last path segment of the Operation-Location header.
operation_id = read_response.headers['Operation-Location'].split('/')[-1]

# Poll once per second until the operation leaves the pending states.
while True:
    read_result = computervision_client.get_read_result(operation_id)
    print(read_result.status)
    if read_result.status not in ['notStarted', 'running']:
        break
    time.sleep(1)

# Collect the recognized lines in reading order; stays empty when the
# operation did not succeed.
result = []
if read_result.status == OperationStatusCodes.succeeded:
    result = [
        line.text
        for text_result in read_result.analyze_result.read_results
        for line in text_result.lines
    ]
print(result)



OperationStatusCodes.succeeded
['Succes in resolvarea', 'TEMELOR la', 'LABORA toarele de', 'Inteligentà Artificialà!']


In [3]:
import Levenshtein

# Ground-truth transcription that the OCR output is scored against.
true_text = ("Succes în rezolvarea "
             "tEMELOR la "
             "LABORAtoarele de "
             "Inteligență Artificială!")

# Alternative ground truth, kept for reference:
# true_text = ("Google Cloud "
#              "Platform")

# Join the recognized lines into a single string, separated by spaces.
recognized_text = " ".join(result)

true_words = true_text.split()
recognized_words = recognized_text.split()

# Character error rate: character-level edit distance over reference length.
char_edit_distance = Levenshtein.distance(true_text, recognized_text)
char_error_rate = char_edit_distance / len(true_text)

# Word error rate: the same distance computed over the word lists.
word_edit_distance = Levenshtein.distance(true_words, recognized_words)
word_error_rate = word_edit_distance / len(true_words)

print(f"CER: {char_error_rate:.2%}", f"WER: {word_error_rate:.2%}")


CER: 9.59% WER: 77.78%


Before optimization: CER: 10.96%, WER: 88.89% (Pb1.ipynb, 1.a)

After optimization (grayscale conversion, median filtering, contrast and sharpness enhancement): CER: 9.59%, WER: 77.78%