In [None]:
import layoutparser as lp
import json
import numpy as np
from tqdm import tqdm
import pytesseract
import cv2

import matplotlib.pyplot as plt
%matplotlib inline
import os
from collections import defaultdict
from matplotlib import pyplot as plt

# Location of the Tesseract executable. Set the TESSERACT_CMD environment
# variable to point at a different install; when it is unset the original
# Windows install path is used, so existing setups behave exactly as before.
pytesseract.pytesseract.tesseract_cmd = os.environ.get(
    'TESSERACT_CMD',
    r'C:\Program Files\Tesseract-OCR-5-0\tesseract.exe',
)
# Shared English-language OCR agent used by the OCR cells below.
ocr_agent = lp.TesseractAgent(languages = 'eng')

In [None]:
import sys
# Add '../../neuspell' to the module search path at index 1 (directly after
# the first sys.path entry) before importing neuspell. The relative path is
# resolved against the kernel's working directory.
# NOTE(review): presumably a local checkout of the neuspell repository — confirm.
sys.path.insert(1, '../../neuspell')
import neuspell
from neuspell import available_checkers, BertChecker

In [None]:
# Show which checkers neuspell reports as available in this environment.
print(f'available checkers: {neuspell.available_checkers()}')

In [None]:
# Create neuspell's BERT-based checker and initialise it via from_pretrained()
# (presumably loads the default pretrained model — confirm against neuspell docs).
# NOTE(review): `checker` is not referenced by any other cell visible here.
checker = BertChecker()
checker.from_pretrained()

In [None]:
def apply_brightness_contrast(input_img, brightness = 0, contrast = 0):
    """
    Return a brightness- and/or contrast-adjusted version of an image.

    Each adjustment is a linear map ``scale * pixel + offset`` evaluated with
    cv2.addWeighted (the second source is given weight 0, so only the scale
    and offset matter).

    input: input_img: image array accepted by cv2.addWeighted
            brightness: 0 leaves brightness untouched; a positive value b uses
                        scale (255 - b)/255 and offset b, a negative value b
                        uses scale (255 + b)/255 and offset 0
            contrast: 0 leaves contrast untouched; otherwise the scale is
                      f = 131*(contrast + 127) / (127*(131 - contrast)) and
                      the offset is 127*(1 - f). contrast == 131 divides by
                      zero.

    return: the adjusted image; when both settings are 0 this is a plain
            copy of input_img
    """
    if brightness == 0:
        adjusted = input_img.copy()
    else:
        # Positive brightness lifts the dark end, negative lowers the bright end.
        low = brightness if brightness > 0 else 0
        high = 255 if brightness > 0 else 255 + brightness
        scale_b = (high - low)/255
        adjusted = cv2.addWeighted(input_img, scale_b, input_img, 0, low)

    if contrast == 0:
        return adjusted

    # The offset 127*(1 - scale_c) keeps the value 127 fixed under the scaling.
    scale_c = 131*(contrast + 127)/(127*(131-contrast))
    offset_c = 127*(1-scale_c)
    return cv2.addWeighted(adjusted, scale_c, adjusted, 0, offset_c)

Load the COCO annotation data from JSON

In [None]:
# Load the annotation file (later cells read its 'images' and 'annotations'
# entries). The encoding is pinned to UTF-8, matching how the results file is
# written further down, so decoding does not depend on the platform's
# default locale encoding.
with open(r'../data/chronam_gt_coco.json', 'r', encoding = 'utf8') as infile:
    coco = json.load(infile)

In [None]:
# Quick look at the top-level keys of the loaded JSON.
print(coco.keys())

In [None]:
# Group the annotations per image: image_id -> {annotation id -> annotation}.
# A defaultdict is used so that looking up an image without any annotation
# yields an empty dict instead of raising KeyError.
image_boxes = defaultdict(dict)
for anno in coco['annotations']:
    image_boxes[anno['image_id']].update({anno['id']: anno})

Function that runs Tesseract on an image and collects the text found inside each annotated bounding box

In [None]:
def parse_newspaper_boxes(image_path, annotations, image_dir = '../data/images'):
    """
    Extract the text within each annotation section of a newspaper scan.

    The page is contrast-adjusted and OCR'd once at word level; the words are
    then selected per annotation with layoutparser's ``filter_by`` on the
    annotation's bounding rectangle.

    input: image_path: file name of the image, relative to image_dir
            annotations: iterable of annotation dicts to transcribe text
                         from; each needs an 'id' and a 'bbox' given as
                         [x, y, width, height]
            image_dir: directory containing the scans (defaults to the
                       '../data/images' folder used by this notebook)

    return: dict mapping each annotation id to the words selected for its
            box joined by single spaces ('' when no word was selected)

    raises: FileNotFoundError if the image cannot be read from disk
    """
    full_path = f'{image_dir}/{image_path}'
    image = cv2.imread(full_path)
    if image is None:
        # cv2.imread does not raise on a missing/unreadable file, it returns
        # None; fail here with a clear message instead of deeper in the pipeline.
        raise FileNotFoundError(f'Could not read image: {full_path}')
    image = apply_brightness_contrast(image, contrast = 30)

    # OCR the full page once and keep word-level results with their positions.
    res = ocr_agent.detect(image, return_response=True)
    layout = ocr_agent.gather_data(res, agg_level=lp.TesseractFeatureType.WORD)

    ano_texts = {}
    for ano in annotations:
        # bbox is (x, y, width, height); Rectangle wants the two corner points.
        x0, y0, w, h = ano['bbox']
        words = layout.filter_by(
            lp.Rectangle(x_1=x0, y_1=y0, x_2=x0 + w, y_2=y0 + h)
        ).get_texts()
        ano_texts[ano['id']] = ' '.join(words) if words else ''
    return ano_texts

In [None]:
# OCR every image listed in the dataset.
# image_ids records the ids in processing order; image_ocr_results maps
# image id -> {annotation id -> transcribed text}.
image_ids = []
image_ocr_results = {}
for image in tqdm(coco['images']):
    image_id = image['id']
    image_ids.append(image_id)
    image_ocr_results[image_id] = parse_newspaper_boxes(
        image['file_name'],
        image_boxes[image_id].values(),
    )

Save the transcribed results to file

In [None]:
# Write the nested {image id: {annotation id: text}} mapping as indented JSON.
# Note: json.dump serialises non-string dict keys as strings, so any ids that
# are ints in memory come back as str keys when this file is reloaded.
with open(r'../data/image_ocr_results.json', 'w', encoding = 'utf8') as outfile:
    json.dump(image_ocr_results, outfile, indent = 4)

In [None]:
# Spot-check one transcription: image id 0, annotation id 107.
# NOTE(review): both ids are hard-coded; this raises KeyError if either is
# absent from the loaded dataset.
print(image_ocr_results[0][107])