In [1]:
# for convert PDF to JPG
from wand.image import Image as WandImage
from PyPDF2 import PdfReader
# ML based OCR 
import pytesseract  
from PIL import Image
# for progress bar
from tqdm import tqdm
# other
import os

In [2]:
# Configuration: the PDF to process and the folders it is read from / written to
fn = '平成30年ー平成31年度船舶使用願.pdf'   # source PDF file name
input_path, output_path = './pdf/', './jpg/'   # PDF input folder, JPG output folder

In [3]:
def pdfToImages(pdf_path, output_folder, total_pages=3):
    """Render the leading pages of a PDF to JPG files.

    Each page is rasterised at 400 dpi, upscaled 2x, and written to
    ``output_folder`` as ``page_<n>.jpg`` (``n`` starts at 1).

    Args:
        pdf_path: Path of the PDF file to convert.
        output_folder: Folder that receives the JPG files; created if missing.
        total_pages: Maximum number of pages to convert, counted from the
            first page. Defaults to 3 (the original test setting). Pass
            ``None`` to convert every page. The value is clamped to the
            number of pages actually present in the PDF.

    Returns:
        None. The result is the set of image files written to disk.
    """
    # Create output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)

    # Read the real page count so we never ask Wand for a page that does not exist.
    with open(pdf_path, 'rb') as pdf_file:
        available_pages = len(PdfReader(pdf_file).pages)

    if total_pages is None:
        total_pages = available_pages
    else:
        total_pages = max(0, min(total_pages, available_pages))

    for page_number in tqdm(range(total_pages), desc="Processing Pages"):
        # '<file>[<index>]' selects a single zero-based page of the PDF.
        with WandImage(filename=f'{pdf_path}[{page_number}]', resolution=400) as img:
            # Double the pixel dimensions on top of the 400 dpi rendering
            # (300 dpi was also tried in this notebook).
            img.resize(width=2 * img.width, height=2 * img.height)
            # Save the image as JPG in the output folder (file names are 1-based).
            img.save(filename=os.path.join(output_folder, f'page_{page_number+1}.jpg'))
    print(f"Total page is {total_pages}, so exit the program.")
    return

def OCRImage(image_path, config=None, lang='jpn'):
    """Run Tesseract OCR on an image file and return the recognised text.

    Args:
        image_path: Path of the image to read.
        config: Optional extra Tesseract command-line options
            (e.g. ``'--psm 11 --oem 3'``). When ``None``, pytesseract's own
            default configuration is used.
        lang: Tesseract language code; defaults to ``'jpn'``.

    Returns:
        The text produced by ``pytesseract.image_to_string``.
    """
    # Forward `config` only when the caller supplied one, so the library default applies otherwise.
    extra_options = {} if config is None else {'config': config}
    # The context manager closes the underlying file handle once OCR is done.
    with Image.open(image_path) as image:
        text = pytesseract.image_to_string(image, lang=lang, **extra_options)
    return text

In [4]:
# read the PDF and convert it to JPG: testing the first 3 pages only
# os.path.join keeps the path valid even if input_path has no trailing slash
pdfToImages(pdf_path=os.path.join(input_path, fn), output_folder=output_path)

Processing Pages: 100%|██████████████████████████████████████████████████████████████████| 3/3 [00:28<00:00,  9.37s/it]

Total page is 3, so exit the program.





In [5]:
# read text from img: testing page 1, which is PC typed
# (path is built from output_path so it follows the configured JPG folder)
text = OCRImage(image_path=os.path.join(output_path, 'page_1.jpg'))
# note: make sure jpn.traineddata is downloaded and placed in the Tesseract-OCR/tessdata/ folder.
#    -  jpn.traineddata: pre-trained JPN data downloaded from GitHub (users can also train their own dataset).
# note: TESSDATA_PREFIX should point to the location of tessdata (just in case)
###############################################################################
#print(text)

In [6]:
# read text from img: testing page 2, which is hand written
# (path is built from output_path so it follows the configured JPG folder)
text = OCRImage(image_path=os.path.join(output_path, 'page_2.jpg'), config='--psm 11 --oem 3')
# note: changing the resolution (300 -> 400) can affect the results
# --psm N: N from 0 to 13, psm configures the assumed page layout of the image
# --oem N: N from 0 to 3, oem configures the OCR engine mode
###############################################################################
#print(text)

In [7]:
# read text from img: testing page 3, which is hand written (default Tesseract config)
# (path is built from output_path so it follows the configured JPG folder)
text = OCRImage(image_path=os.path.join(output_path, 'page_3.jpg'))
###############################################################################
#print(text)

In [8]:
# Next steps
# 1) use Google Cloud OCR or Azure OCR: AI-based recognition API
# GC-OCR: https://cloud.google.com/vision/docs/ocr
# 2) train a custom JPN model on my own dataset
# 3) fine-tuning

In [1]:
from google.cloud import vision
# TODO: google cloud CLI configuration

ModuleNotFoundError: No module named 'google'

In [None]:
# sample code 
def detect_document(path):
    """Detect document text in an image with Google Cloud Vision and print it.

    Reads the file at ``path``, sends its bytes to
    ``ImageAnnotatorClient.document_text_detection`` and prints the confidence
    of every block, paragraph, word and symbol in the returned annotation.

    Args:
        path: Path of the image file to analyse.

    Raises:
        Exception: If the API response carries an error message.
    """
    client = vision.ImageAnnotatorClient()

    with open(path, "rb") as image_file:
        content = image_file.read()

    image = vision.Image(content=content)
    response = client.document_text_detection(image=image)

    # Check for an API error before walking the annotation, so a failed
    # request is reported immediately instead of after the printing loop.
    if response.error.message:
        raise Exception(
            "{}\nFor more info on error messages, check: "
            "https://cloud.google.com/apis/design/errors".format(response.error.message)
        )

    for page in response.full_text_annotation.pages:
        for block in page.blocks:
            print(f"\nBlock confidence: {block.confidence}\n")

            for paragraph in block.paragraphs:
                print(f"Paragraph confidence: {paragraph.confidence}")

                for word in paragraph.words:
                    # A word is exposed as a sequence of symbols; join them to get its text.
                    word_text = "".join(symbol.text for symbol in word.symbols)
                    print(f"Word text: {word_text} (confidence: {word.confidence})")

                    for symbol in word.symbols:
                        print(f"\tSymbol: {symbol.text} (confidence: {symbol.confidence})")

