# Setup
1) Download and install the modules imported below.
2) Install ImageMagick following the instructions on the Wand website.
3) Install google.cloud.vision, then configure your credentials and private key for the connection.

# NOTE: similar code as "Hand-written_OCR" using Google Vision API

API reference: https://cloud.google.com/python/docs/reference/vision/latest

Steps:
1) convert PDF to JPG
2) call GV API to read data
3) output data: i) a folder with the same name as the PDF file, ii) pages stored separately in that folder

In [2]:
# for convert PDF to JPG
from wand.image import Image as WandImage
from PyPDF2 import PdfReader
# ML based OCR 
import pytesseract  
from PIL import Image
# for progress bar
from tqdm import tqdm
# other
import os

In [6]:
# config paths
# input_path is the folder scanned for files and prefixed to the PDF file name;
# output_path is the prefix of the folder the JPG pages are written to.
input_path = './pdf/STD/'
output_path = './jpg/STD/'
# placeholder file name; it is reassigned to a concrete file before use
fn = ""  # loop over all files under input_path

In [10]:
# first, output all files under "./pdf/STD/" folder
def list_files(directory):
    """Collect the bare names of all files found below ``directory``.

    The tree is traversed recursively with ``os.walk``. Only the file name is
    kept (no directory component), so the same name in two sub-folders shows
    up twice in the result.

    Args:
        directory: Root folder to scan.

    Returns:
        A list of file names, in the order ``os.walk`` yields them.
    """
    names = []
    # The root path and sub-directory list are not needed, only the file names.
    for _root, _subdirs, filenames in os.walk(directory):
        names.extend(filenames)
    return names

# run: collect the names of all files under input_path and display them
STD_files = list_files(directory=input_path)
print(f"Files of STD: {STD_files}")

Files of STD: ['H19_1月~3月.pdf', 'H19_4月.pdf', 'H19_5月.pdf', 'H19_6月~7月.pdf', 'H19_8月(2).pdf', 'H_198月.pdf']


In [9]:
# define functions to convert PDF to JPG
def pdfToImages(pdf_path, output_folder, total_pages=None):
    """Render the pages of a PDF as individual JPG files.

    Pages are written to ``output_folder`` as ``page_1.jpg``, ``page_2.jpg``,
    and so on (1-based file names).

    Args:
        pdf_path: Path of the PDF file to convert.
        output_folder: Folder that receives the JPG files; created if missing.
        total_pages: Number of leading pages to convert. When ``None`` the
            page count is read from the PDF itself.

    Returns:
        None.
    """
    os.makedirs(output_folder, exist_ok=True)

    with open(pdf_path, 'rb') as pdf_handle:
        reader = PdfReader(pdf_handle)
        page_count = len(reader.pages) if total_pages is None else total_pages

        for index in tqdm(range(page_count), desc="Processing Pages"):
            # "<file>[n]" asks Wand for page n only (zero-based index).
            page_spec = f'{pdf_path}[{index}]'
            target = os.path.join(output_folder, f'page_{index + 1}.jpg')
            with WandImage(filename=page_spec, resolution=400) as page_image:
                # Double both dimensions to keep the saved image high resolution.
                page_image.resize(width=2 * page_image.width, height=2 * page_image.height)
                page_image.save(filename=target)
        print(f"Total page is {page_count}, so exit the program.")

def OCRImage(image_path, config=None):
    """Run Tesseract OCR with the Japanese language pack on one image file.

    Args:
        image_path: Path of the image to read.
        config: Optional extra configuration string passed to
            ``pytesseract.image_to_string``. When ``None`` the argument is
            omitted and pytesseract's own default applies.

    Returns:
        The text returned by ``pytesseract.image_to_string``.
    """
    # Open the image in a context manager so its file handle is released as
    # soon as OCR finishes, instead of staying open until garbage collection.
    with Image.open(image_path) as image:
        if config is not None:
            return pytesseract.image_to_string(image, lang='jpn', config=config)
        # else default
        return pytesseract.image_to_string(image, lang='jpn')

TODO: process and output all files, instead of only the first one used for testing purposes

In [48]:
# reading pdf and converting to jpg: all pages
# test: 1st file
fn = STD_files[0]
# The output folder is named after the PDF without its ".pdf" extension, so the
# pages land in the ./jpg/STD/<name>/ folder that the OCR step reads from.
folder = fn.replace(".pdf","")
pdfToImages(pdf_path=input_path+fn, output_folder=output_path+folder, total_pages=None)

Processing Pages:   0%|                                                                         | 0/78 [00:07<?, ?it/s]


KeyboardInterrupt: 

# Apply Google Vision API

In [12]:
# import google libraries
from google.cloud import vision
import pandas as pd
import os
from tqdm import tqdm

In [122]:
# functions for reading images, returning information in the images 
def detect_document(path,lanHint='ja'):
    """Run Google Vision document text detection on a single image file.

    Args:
        path: Location of the image file; it is read as raw bytes.
        lanHint: Language hint sent with the request (default ``'ja'``).

    Returns:
        A ``(image, response)`` tuple: the ``vision.Image`` built from the file
        content and the response of ``document_text_detection``.

    Raises:
        Exception: If the response carries a non-empty error message.
    """
    annotator = vision.ImageAnnotatorClient()

    with open(path, "rb") as source:
        raw_bytes = source.read()
    image = vision.Image(content=raw_bytes)

    # Crop hints plus a language hint gave better-looking results; without the
    # language hint the output was strange.
    # note: the field is language_hints, NOT languageHints, because this is the
    # RPC API (https://github.com/googleapis/google-cloud-python/issues/6387)
    context = vision.ImageContext(
        crop_hints_params=vision.CropHintsParams(aspect_ratios=[0.8, 1.0, 1.2]),
        language_hints=[lanHint],
    )

    # document_text_detection is used here; text_detection was the alternative tried.
    response = annotator.document_text_detection(image=image, image_context=context)

    error_message = response.error.message
    if error_message:
        raise Exception(
            f"{error_message}\nFor more info on error messages, check: "
            "https://cloud.google.com/apis/design/errors"
        )
    return image, response


def print_response(response):
    """Print the OCR result block by block, with confidence values.

    For each page this prints the number of blocks, then for each block a
    separator, its 1-based index and confidence, and for each paragraph its
    confidence followed by the paragraph text (all symbols concatenated).

    Args:
        response: Object exposing ``full_text_annotation.pages``, each page
            having ``blocks`` -> ``paragraphs`` -> ``words`` -> ``symbols``.

    Returns:
        None.
    """
    for page in response.full_text_annotation.pages:
        print(f"Current page has {len(page.blocks)} blocks")
        for block_number, block in enumerate(page.blocks, start=1):
            print("####################################")
            print(f'Current block: {block_number}')
            print(f"\nBlock confidence: {block.confidence}\n")
            for paragraph in block.paragraphs:
                print(f"Paragraph confidence: {paragraph.confidence}")
                paragraph_text = ''.join(
                    symbol.text
                    for word in paragraph.words
                    for symbol in word.symbols
                )
                # The paragraph text is printed twice, keeping the existing output format.
                print(paragraph_text)
                print(paragraph_text)

def print_full_text(response):
    """Print the complete recognised text of a response, prefixed with "Full Text: ".

    Args:
        response: Object exposing ``full_text_annotation.text``.

    Returns:
        None.
    """
    full_text = response.full_text_annotation.text
    print(f"Full Text: {full_text}")

# check whether a result string fits the target encoding (Shift-JIS / cp932 by default)
def is_not_encoding(string,check='cp932'):
    """Report whether ``string`` cannot be encoded with the ``check`` codec.

    Args:
        string: Text to test.
        check: Name of the codec to try (default ``'cp932'``).

    Returns:
        ``True`` if encoding raises ``UnicodeEncodeError`` (at least one
        character is not representable), ``False`` if encoding succeeds.
    """
    try:
        # Only success or failure matters; the encoded bytes are discarded.
        string.encode(check)
    except UnicodeEncodeError:
        return True
    return False

def save_full_text(path, fn1, fn2, response):
    """Write the OCR text and the block confidences of a response to two files.

    ``path + fn1`` receives one line per paragraph (all symbols of the
    paragraph concatenated). Paragraphs rejected by ``is_not_encoding`` are
    not written; they are reported on stdout instead. ``path + fn2`` receives
    one line per block containing that block's confidence as a one-element
    list, e.g. ``[0.98]``.

    Args:
        path: Prefix prepended to both file names by plain string concatenation.
        fn1: File name for the text output.
        fn2: File name for the confidence output.
        response: Object exposing ``full_text_annotation.pages``, each page
            having ``blocks`` -> ``paragraphs`` -> ``words`` -> ``symbols``.

    Returns:
        None.
    """
    # NOTE(review): both files are opened with the platform default encoding,
    # while is_not_encoding filters against cp932 by default - confirm these
    # are meant to match on the machine running this.
    with open(path+fn1, "w") as text_file:
        for page in response.full_text_annotation.pages:
            for block in page.blocks:
                for paragraph in block.paragraphs:
                    line = "".join(
                        symbol.text
                        for word in paragraph.words
                        for symbol in word.symbols
                    )
                    if not is_not_encoding(string=line):
                        text_file.write(f"{line}\n")
                    else:
                        print(f"Wrong encoding: {line}")

    with open(path+fn2,"w") as confidence_file:
        for page in response.full_text_annotation.pages:
            for block in page.blocks:
                # Written as a one-element list to keep the existing "[value]" line format.
                confidence_file.write(f"{[block.confidence]}\n")
    
# read all images and save txt (page_x.txt && page_x_confidence.txt) to directory
def save_all_full_text(directory_path,save_path):
    """OCR every .jpg/.png image in a folder and save the results as text files.

    Each image is sent to ``detect_document``; the result is written by
    ``save_full_text`` as ``<name>.txt`` and ``<name>_confidence.txt``, where
    ``<name>`` is the image file name without its extension. An image whose
    detection fails is reported on stdout and skipped, so no output files are
    written for it.

    Args:
        directory_path: Folder containing the images.
        save_path: Prefix passed to ``save_full_text`` for the output files.

    Returns:
        None.
    """
    # Loop over all image files in the directory
    for filename in tqdm(os.listdir(directory_path),desc='Processing jpg: '):
        if not filename.endswith((".jpg", ".png")):
            continue
        image_path = os.path.join(directory_path, filename)
        try:
            _image, response = detect_document(image_path)
        except Exception as err:
            # Skip this image. Continuing without a fresh response would either
            # fail on an undefined name or save the previous image's text under
            # this image's file name.
            print(f"An execution error encountered at {filename}: {err}")
            continue
        # Strip the extension (.jpg or .png) to build the output file names.
        savename = os.path.splitext(filename)[0]
        save_full_text(path=save_path,fn1=f'{savename}.txt',fn2=f'{savename}_confidence.txt',response=response)

In [90]:
# once the key is set up and downloaded, config the environmental variables
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'C:/service_account_PubKey.json'
# set paths: save txt file to same folder as jpg
fn_0 = STD_files[0]
# Derive the folder from fn_0 itself rather than from `fn`, which is only set by
# an earlier cell and may hold a different file name.
folder_0 = fn_0.replace(".pdf","")
jpg_path = f'./jpg/STD/{folder_0}/'

In [120]:
# trial run on the testing folder (page_22, page_23); text files are saved next to the images
testing_path = './jpg/STD/testing/'
save_all_full_text(save_path=testing_path, directory_path=testing_path)
# also testing page_29-31, why bad image data?

Processing jpg: 100%|████████████████████████████████████████████████████████████████████| 5/5 [00:44<00:00,  8.98s/it]


In [123]:
# OCR all images under jpg_path (the folder for fn_0) and write the txt files
# into that same folder
save_all_full_text(directory_path=jpg_path,save_path=jpg_path)

Processing jpg:   7%|████▊                                                             | 9/122 [00:28<07:59,  4.24s/it]

Wrong encoding: LỌLỘLỌLOLOLO101010


Processing jpg:  10%|██████▍                                                          | 12/122 [00:39<07:26,  4.06s/it]

Wrong encoding: CÓ0070GIACỦNH


Processing jpg:  12%|███████▉                                                         | 15/122 [00:46<06:13,  3.49s/it]

Wrong encoding: LÒLỒLỘLàLỘLaLỒLỘLỘ


Processing jpg:  34%|██████████████████████▍                                          | 42/122 [01:54<03:48,  2.86s/it]

Wrong encoding: CÓCÓ10ĐICANH


Processing jpg:  42%|███████████████████████████▏                                     | 51/122 [02:10<02:32,  2.15s/it]

Wrong encoding: LỌLỘLỌLO-345


Processing jpg:  55%|███████████████████████████████████▋                             | 67/122 [03:05<03:02,  3.32s/it]

An execution error encountered at page_3.jpg


Processing jpg:  64%|█████████████████████████████████████████▌                       | 78/122 [04:20<04:51,  6.64s/it]

Wrong encoding: LỒLỒLỘLỒLỒLÀLỘLỒLỘ


Processing jpg:  66%|██████████████████████████████████████████▌                      | 80/122 [04:43<06:44,  9.62s/it]

Wrong encoding: HƯXẺH*I]:2007/02/1311:24:14測定地点:8M


Processing jpg:  69%|████████████████████████████████████████████▊                    | 84/122 [05:24<07:07, 11.26s/it]

Wrong encoding: •
Wrong encoding: HẠOVLỌCÓECÓ


Processing jpg:  73%|███████████████████████████████████████████████▍                 | 89/122 [06:23<06:34, 11.95s/it]

Wrong encoding: H]RẺH*I]:2007/03/2810:14:45測定地点:8M


Processing jpg:  74%|███████████████████████████████████████████████▉                 | 90/122 [06:27<05:12,  9.78s/it]

Wrong encoding: LỄ,L,LỘL,L,Là


Processing jpg:  75%|█████████████████████████████████████████████████                | 92/122 [06:41<04:02,  8.10s/it]

Wrong encoding: LLồLỒLỒLÀL,L,L


Processing jpg:  77%|██████████████████████████████████████████████████               | 94/122 [06:58<03:46,  8.08s/it]

Wrong encoding: LỘLỘLỘLỘLỘLỘLỘLÀ23456789


Processing jpg:  82%|████████████████████████████████████████████████████▍           | 100/122 [07:45<02:33,  6.96s/it]

Wrong encoding: LOLOLỌLỘLỌLỘLỌLOLO


Processing jpg:  84%|█████████████████████████████████████████████████████▌          | 102/122 [08:00<02:20,  7.01s/it]

Wrong encoding: 測定時刻:2007/01/2213:34:41NÍÞÁ:9M
Wrong encoding: LỘLỘLOLỒLLỒLàLàCO


Processing jpg:  88%|████████████████████████████████████████████████████████▏       | 107/122 [08:44<02:09,  8.65s/it]

Wrong encoding: 6760LÀ


Processing jpg:  89%|████████████████████████████████████████████████████████▋       | 108/122 [08:50<01:47,  7.68s/it]

Wrong encoding: LỒLỘLỘLLỒLÀLỒLỘLỘ


Processing jpg:  89%|█████████████████████████████████████████████████████████▏      | 109/122 [09:00<01:50,  8.51s/it]

Wrong encoding: 測定時刻:2007/01/16NËŁA:8M


Processing jpg:  91%|██████████████████████████████████████████████████████████▏     | 111/122 [09:15<01:31,  8.31s/it]

Wrong encoding: □N3+10201∞@OHNMLOCOMØ


Processing jpg:  95%|████████████████████████████████████████████████████████████▊   | 116/122 [10:02<00:57,  9.63s/it]

Wrong encoding: 測定時刻:2007/01/1110:31:14HURẺPHIẢ:8M
Wrong encoding: LỌLỘLỌLLỌLOLO


Processing jpg:  97%|█████████████████████████████████████████████████████████████▉  | 118/122 [10:18<00:34,  8.51s/it]

Wrong encoding: HƯRẺU*I]:2007/01/1010:36:14測定地点:8M


Processing jpg: 100%|████████████████████████████████████████████████████████████████| 122/122 [10:51<00:00,  5.34s/it]
