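Tesseract's documentation says it works best on pre-processed documents, although it also performs significant preprocessing of its own. Below, we try to use CRAFT to detect image rotation and then rotate the images; because the CRAFT weights could not be downloaded (see the errors further down), the rotation detection falls back to OpenCV's EAST text detector. Then we will re-OCR the rotated images.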

In [1]:
# https://pypi.org/project/craft-text-detector/
# https://ai.plainenglish.io/how-to-extract-texts-from-rotated-skewed-text-images-using-craft-opencv-and-pytesseract-9c8c3fb8ef9d
from craft_text_detector import Craft 
import cv2
import json 
import math 
import multiprocessing
import numpy as np 
import pandas as pd
from pathlib import Path
from pdf2image import convert_from_path
from PIL import Image
from pytesseract import image_to_data # https://github.com/UB-Mannheim/tesseract/wiki
from pytesseract import Output 
import shutil

import sys
sys.path.append(str(Path.cwd().parent.parent))
from utils import create_fh_logger

In [2]:
# Every location used below hangs off two roots: the shared `processing`
# directory (four levels above the working directory) and its
# `nro_declassified` sub-directory.
processing_root = Path.cwd().parent.parent.parent.parent / 'processing'
nro_root = processing_root / 'nro_declassified'

# Inputs: the folder holding analyze.json and the per-document image folders.
good_docs = nro_root / 'good_docs'
imgs = nro_root / 'imgs'

# Outputs: rotated images, OCR results and the log directory.
imgs_dest = nro_root / "imgs_rotated"
ocr_dest = nro_root / "ocr_rotated"
logs = processing_root / 'logs'
for out_dir in (imgs_dest, ocr_dest, logs):
    # Only the leaf directory is created; the parents are expected to exist.
    out_dir.mkdir(exist_ok=True)

# Frozen TensorFlow graph consumed by east_detect().
east_weights = processing_root / 'models' / 'frozen_east_text_detection.pb'

logger = create_fh_logger(logs / "ocr_img_rotated.log")

In [3]:
# First attempt at constructing the CRAFT detector. In this environment it
# fails with the ImportError shown in the output below (`model_urls` is missing
# from torchvision.models.vgg).
craft_detector = Craft( crop_type="poly" , cuda = True, text_threshold=0.8, link_threshold=0.4, low_text=0.25)

ImportError: cannot import name 'model_urls' from 'torchvision.models.vgg' (c:\Users\brasw\AppData\Local\Programs\Python\Python39\lib\site-packages\torchvision\models\vgg.py)

In [4]:
# Workaround for the ImportError above: the installed torchvision does not
# provide `torchvision.models.vgg.model_urls`, so the attribute is assigned by
# hand with the checkpoint URLs listed in the linked CRAFT issue.
import torchvision #Below monkey patches torchvision to use the model URLs found in: https://github.com/clovaai/CRAFT-pytorch/issues/191
torchvision.models.vgg.model_urls = {
    'vgg11': 'https://download.pytorch.org/models/vgg11-bbd30ac9.pth', 
    'vgg13': 'https://download.pytorch.org/models/vgg13-c768596a.pth', 
    'vgg16': 'https://download.pytorch.org/models/vgg16-397923af.pth', 
    'vgg19': 'https://download.pytorch.org/models/vgg19-dcbb9e9d.pth', 
    'vgg11_bn': 'https://download.pytorch.org/models/vgg11_bn-6002323d.pth', 
    'vgg13_bn': 'https://download.pytorch.org/models/vgg13_bn-abd245e5.pth', 
    'vgg16_bn': 'https://download.pytorch.org/models/vgg16_bn-6c64b313.pth', 
    'vgg19_bn': 'https://download.pytorch.org/models/vgg19_bn-c79401a0.pth'
    }

In [5]:
# Second attempt, after patching torchvision. Construction now gets as far as
# downloading the CRAFT weights, which fails with the Google Drive retrieval
# error shown in the output below.
craft_detector = Craft(crop_type="poly" , cuda = True, text_threshold=0.8, link_threshold=0.4, low_text=0.25)



Craft text detector weight will be downloaded to C:\Users\brasw\.craft_text_detector\weights\craft_mlt_25k.pth


FileURLRetrievalError: Failed to retrieve file url:

	Too many users have viewed or downloaded this file recently. Please
	try accessing the file again later. If the file you are trying to
	access is particularly large or is shared with many people, it may
	take up to 24 hours to be able to view or download the file. If you
	still can't access a file after 24 hours, contact your domain
	administrator.

You may still be able to access the file from the browser:

	https://drive.google.com/uc?id=1bupFXqT-VU6Jjeul13XP7yx2Sg5IHr4J

but Gdown can't. Please check connections and permissions.

The weights file can't be accessed in the browser either. The model weights are hosted on Google Drive, and the file's download quota has been exceeded for the past 24 hours... repeatedly.

In [3]:
# Load the list of documents we will continue to work on.
# The file is decoded explicitly as UTF-8 so the result does not depend on the
# platform's default text encoding, and json.load parses the handle directly.
with open(good_docs / 'analyze.json', 'r', encoding='utf-8') as f:
    docs = json.load(f)['documents']

In [9]:
# https://github.com/opencv/opencv/blob/7fb70e170154d064ef12d8fec61c0ae70812ce3d/samples/dnn/text_detection.py
def decode(scores, geometry, scoreThresh):
    """Convert raw text-detector output maps into rotated boxes and confidences.

    Args:
        scores: 4-D array indexed as ``scores[0][0][y][x]``; its last two
            dimensions define the grid that is scanned.
        geometry: 4-D array indexed as ``geometry[0][c][y][x]``. Channels 0 and
            2 are summed into the box height, channels 1 and 3 into the box
            width, and channel 4 is the box angle in radians.
        scoreThresh: cells whose score is below this value are skipped.

    Returns:
        A two-element list ``[detections, confidences]``. Each detection is
        ``(center, (w, h), angle_in_degrees)`` and each confidence is the
        matching score as a Python float.
    """
    rects = []
    rect_scores = []
    n_rows = scores.shape[2]
    n_cols = scores.shape[3]
    for row in range(n_rows):
        row_scores = scores[0][0][row]
        dist0 = geometry[0][0][row]
        dist1 = geometry[0][1][row]
        dist2 = geometry[0][2][row]
        dist3 = geometry[0][3][row]
        row_angles = geometry[0][4][row]
        for col in range(n_cols):
            confidence = row_scores[col]
            if confidence < scoreThresh:
                continue

            # Each output cell corresponds to a 4-pixel step in the network input.
            base_x = col * 4.0
            base_y = row * 4.0

            theta = row_angles[col]
            cos_t = math.cos(theta)
            sin_t = math.sin(theta)

            box_h = dist0[col] + dist2[col]
            box_w = dist1[col] + dist3[col]

            # Anchor corner of the box, from which two further points are derived.
            anchor_x = base_x + cos_t * dist1[col] + sin_t * dist2[col]
            anchor_y = base_y - sin_t * dist1[col] + cos_t * dist2[col]
            pt_a = (-sin_t * box_h + anchor_x, -cos_t * box_h + anchor_y)
            pt_b = (-cos_t * box_w + anchor_x, sin_t * box_w + anchor_y)

            # The box centre is the midpoint of those two points.
            midpoint = (0.5 * (pt_a[0] + pt_b[0]), 0.5 * (pt_a[1] + pt_b[1]))
            rects.append((midpoint, (box_w, box_h), -1 * theta * 180.0 / math.pi))
            rect_scores.append(float(confidence))
    return [rects, rect_scores]
import os 
def east_detect(img_folder):
    """Count text detections per candidate rotation for all PNGs in a folder.

    Every readable ``*png`` in ``img_folder`` is evaluated four ways: rotated by
    180 degrees, 90 degrees clockwise, 90 degrees counter-clockwise, and as-is
    (key ``-1``). Each variant is resized to a 640x640 blob and run through the
    network loaded from ``east_weights``; the number of boxes that survive
    rotated non-maximum suppression is added to that rotation's tally.

    Args:
        img_folder: directory (path-like) containing the page images.

    Returns:
        dict mapping each rotation code (the three ``cv2.ROTATE_*`` constants
        plus ``-1`` for "no rotation") to its accumulated detection count.
    """
    resize_hw = 640
    conf_threshold = .2
    nms_threshold = .2
    available_rotations = [cv2.ROTATE_180, cv2.ROTATE_90_CLOCKWISE, cv2.ROTATE_90_COUNTERCLOCKWISE, -1]
    # Accumulator: text matches per rotation, used by the caller to decide how
    # to rotate the document.
    rotations = {key: 0 for key in available_rotations}

    net = cv2.dnn.readNetFromTensorflow(str(east_weights))
    net.setPreferableBackend(cv2.dnn.DNN_BACKEND_CUDA)
    net.setPreferableTarget(cv2.dnn.DNN_TARGET_CUDA)
    outNames = ["feature_fusion/Conv_7/Sigmoid", "feature_fusion/concat_3"]

    for image_file in Path(img_folder).glob('*png'):
        image = cv2.imread(str(image_file))
        if image is None:
            # Unreadable image: force the "leave as-is" tally to 50 so the
            # document is biased towards not being rotated.
            # NOTE(review): this overwrites (not adds to) the running count and
            # can still be out-voted by a rotation with more than 50 matches.
            rotations[-1] = 50  # just make it not do anything
            continue

        for rotation in available_rotations:
            # cv2.rotate returns a new array, so the source image is never
            # modified and no defensive copy is needed. The cv2 rotate codes
            # are all >= 0; -1 is our own marker for "no rotation".
            frame = cv2.rotate(image, rotation) if rotation >= 0 else image
            blob = cv2.dnn.blobFromImage(frame, 1.0, (resize_hw, resize_hw), (123.68, 116.78, 103.94), True, False)
            net.setInput(blob)
            outs = net.forward(outNames)

            # Outputs come back in the order requested in outNames.
            scores = outs[0]
            geometry = outs[1]
            [boxes, confidences] = decode(scores, geometry, conf_threshold)
            indices = cv2.dnn.NMSBoxesRotated(boxes, confidences, conf_threshold, nms_threshold)
            rotations[rotation] += len(indices)
    return rotations

def ocr(imgs_dest, ocr_dest):
    """OCR every per-document image folder and write one JSON file per document.

    For each sub-directory of ``imgs_dest`` every ``*png`` page is passed to
    ``image_to_data`` and the results are saved to ``ocr_dest/<folder>.json``
    as ``{folder_name: {page_stem: tesseract_dict}}``.

    Args:
        imgs_dest: directory whose sub-directories each hold one document's pages.
        ocr_dest: directory the JSON results are written to.

    A failure while processing one document is logged and does not stop the
    remaining documents from being processed.
    """
    logger.info('Entering ocr loop.')
    for item in imgs_dest.iterdir(): # could be a file ig if something went wrong
        try: # there are some unallowed characters and conditions in here that haven't all been identified
            # is_dir must be *called*; the bare bound method is always truthy.
            if not item.is_dir():
                continue

            # Name the output after the document folder itself. Deriving it from
            # ocr_dest made every document overwrite the same JSON file.
            pdf_name = item.name
            file_name = ocr_dest / f'{pdf_name}.json'
            pgs = list(item.glob('*png'))
            logger.info(f"Starting Translation on {pdf_name.encode('utf-8')} with {len(pgs)} pages.")

            pg_translations = {}
            for pg in pgs:
                # Context manager closes the image file handle after each page.
                with Image.open(pg) as page_img:
                    pg_translations[pg.stem] = image_to_data(page_img, output_type = Output.DICT)

            logger.info(f"Saving Translation for {pdf_name.encode('utf-8')} as a JSON.")
            with open(file_name, 'w') as f:
                json.dump({pdf_name: pg_translations}, f)
        except Exception:
            # Best effort per document: record which item failed, with the
            # traceback, then carry on with the next one.
            logger.exception(f"OCR failed for {str(item).encode('utf-8')}")
    logger.info('Exiting ocr loop')

def process_img(file: Path):
    """Write an upright copy of one document's page images into ``imgs_dest``.

    The rotation with the most text detections (per ``east_detect``) is chosen.
    If it is a 90-degree turn, every ``*png`` page is rotated and saved under
    ``imgs_dest/<folder name>``; otherwise (no rotation, or 180 degrees, which
    is deliberately not applied) the folder is copied unchanged.

    Args:
        file: the document's image folder. It is joined onto ``imgs``; an
            absolute path is therefore used as-is.
    """
    img_folder = imgs / file
    # Dict keyed by rotation code: rotate 180, rotate 90 clockwise,
    # rotate 90 counterclockwise, leave the same [-1].
    rotations = east_detect(img_folder)
    rotation = max(rotations, key=rotations.get)
    save_loc = imgs_dest / file.name
    save_loc.mkdir(exist_ok=True, parents=True)
    if rotation != -1 and rotation != cv2.ROTATE_180: # lets not allow 180s
        logger.info(f"Rotating {str(img_folder).encode('utf-8')}")
        for image_file in Path(img_folder).glob('*png'):
            frame = cv2.imread(str(image_file))
            if frame is None:
                # imread signals an unreadable file by returning None.
                logger.warning(f"Could not read {str(image_file).encode('utf-8')}; skipping.")
                continue
            # image_file.name already carries the extension; appending the
            # suffix again produced names such as "1.png.png".
            s = str(save_loc / image_file.name)
            if not cv2.imwrite(s, cv2.rotate(frame, rotation)):
                # imwrite reports failure through its return value.
                logger.warning(f"Could not write {s.encode('utf-8')}")
    else:
        # we just need to save off the img folder
        saved = shutil.copytree(str(img_folder), str(save_loc), dirs_exist_ok=True)
        logger.info(f'Saved entire tree: {saved}, {str(save_loc)}')
    #ocr(imgs_dest, ocr_dest)

In [None]:
# There are some naming inconsistencies between the names in the JSON and the
# image folders (namely the "_"), so documents are matched to folders on the
# first 25 characters of the name. The folders are indexed once up front rather
# than re-listing the directory for every document; on a prefix collision the
# first folder listed wins.
folders_by_prefix = {}
for img_folder in imgs.iterdir():
    folders_by_prefix.setdefault(img_folder.name[:25], img_folder.name)

img_dests = []
for doc in docs:
    doc = doc[:10] + "_" + doc[10:]
    # Use the real on-disk folder name when the prefix matches; otherwise fall
    # back to the reconstructed name.
    doc = folders_by_prefix.get(doc[:25], doc)
    pdf_img = imgs / doc
    img_dests.append(pdf_img)
    process_img(pdf_img)