# Setup
1) need to download modules included below
2) install ImagerMagick following the instrution in website of wand.
3) install google.cloud.vision -> configurate your credentials and private key for connection

# NOTE: similar code as "Hand-written_OCR" using Google Vision API

API reference: https://cloud.google.com/python/docs/reference/vision/latest

Steps:
1) convert PDF to JPG
2) call GV API to read data
3) output data: i) folder same naming as PDF file, ii) pages stored saparately in the folder

In [1]:
# for convert PDF to JPG
from wand.image import Image as WandImage
from PyPDF2 import PdfReader
# ML based OCR 
import pytesseract  
from PIL import Image
# for progress bar
from tqdm import tqdm
# other
import os

In [2]:
# first, output all files under "./pdf/STD/" folder
def list_files(directory: str):
    """
    List up all files under a directory
    
    Input:
        directory (str): directory that stores all files to be listed up
    """
    file_list = []
    # Walk through all directories and subdirectories
    for root, dirs, files in os.walk(directory):
        # Iterate over each file in the current directory
        for file in files:
            # Get the file names
            #file_path = os.path.join(root, file)  #do not need root path
            file_path = file
            # Append the file path to the list
            file_list.append(file_path)
    return file_list

In [3]:
# define functions to convert PDF to JPG
def pdf_to_images(pdf_path: str, output_folder: str, total_pages: int = None):
    """
    Converting PDF to JPG using WandImage (need to pre-install)
    
    Input: 
        pdf_path (str): path that stores all PDF
        output_folder (str): folder to store outputs
        total_pages (int): number of pages to be proceeded. Default is None, means all page)
    """
    # Create output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)

    # Open the PDF file
    with open(pdf_path, 'rb') as pdf_file:
        pdf_reader = PdfReader(pdf_file)

        # Change to the total number of pages in your PDF
        #total_pages = 3
        if total_pages is None:
            total_pages = len(pdf_reader.pages)
        for page_number in tqdm(range(0, total_pages), desc="Processing Pages"):
            # Convert each page to an image
            with WandImage(filename=f'{pdf_path}[{page_number}]', resolution=400) as img:
                # resize the image to ensure high resolution: also resol=300
                img.resize(width=2 * img.width, height=2 * img.height)
                # Save the image as JPG in the output folder
                img.save(filename=os.path.join(output_folder, f'page_{page_number+1}.jpg'))
        print(f"Total page is {total_pages}, so exit the program.")
    return

def OCR_image(image_path: str, config: str = None):
    """
    Using pytesseract to OCR image (testing)
    
    Input: 
        image_path (str): path that stores image
        config (str): configuration for pytesseract
    """
    # OCR on the image
    image = Image.open(image_path)
    if config is not None:
        text = pytesseract.image_to_string(image, lang='jpn', config=config)
    else:
        # else default
        text = pytesseract.image_to_string(image, lang='jpn')
    #print(f"Text:\n{text}\n")
    return text

# Functions using Google Vision API

In [4]:
# import google libraries
from google.cloud import vision
import pandas as pd
import os
from tqdm import tqdm

In [5]:
# functions for reading images, returning information in the images 
def detect_document(path: str, lanHint: str ='ja'):
    """
    Detects document features in an image.
    
    Input:
        path (str): document path
    
    Output:
        image: input image read by vision API
        response: Google Vision API response for further processing
    """
    client = vision.ImageAnnotatorClient()
    with open(path, "rb") as image_file:
        content = image_file.read()
    image = vision.Image(content=content)
    # add crop hint & language hint (ja) -> looks better
    crop_hints_params = vision.CropHintsParams(aspect_ratios=[0.8, 1.0, 1.2])
    image_context = vision.ImageContext(crop_hints_params=crop_hints_params, language_hints=[lanHint])
    #image_context = vision.ImageContext(crop_hints_params=crop_hints_params) 
    # if not lan hint, results will be strange
    # note: language_hints NOT languageHints becoz it is RPC API (https://github.com/googleapis/google-cloud-python/issues/6387)
    response = client.document_text_detection(image=image,image_context=image_context)
    #response = client.text_detection(image=image,image_context=image_context)  # use text_detection
    if response.error.message:
        raise Exception(
            "{}\nFor more info on error messages, check: "
            "https://cloud.google.com/apis/design/errors".format(response.error.message)
        )
    return image,response


def print_response(response):
    """
    Print contents in vision API response to check
    
    Input:
        response: vision API response
    """
    for page in response.full_text_annotation.pages:
        print(f"Current page has {len(page.blocks)} blocks")
        nBlock = 1
        for block in page.blocks:
            block_text = []
            print("####################################")
            print(f'Current block: {nBlock}')
            print(f"\nBlock confidence: {block.confidence}\n")
            for paragraph in block.paragraphs:
                paragraph_text = []
                print("Paragraph confidence: {}".format(paragraph.confidence))
                for word in paragraph.words:
                    word_text = "".join([symbol.text for symbol in word.symbols])
                    paragraph_text.append(word_text)
                block_text = ''.join(paragraph_text)
                print(block_text)
                result_str = ''.join(block_text)
                print(result_str)
            nBlock += 1
    return

def print_full_text(response):
    """
    Print full content of the response
    
    Input:
        response: vision API response
    """
    print(f"Full Text: {response.full_text_annotation.text}")
    return

# check encoding of the result -> if not Shift-JIS, return False
def is_not_encoding(string: str, check: str = 'cp932'):
    """
    To check if the input string is not Shift_JIS
    
    Input:
        string (str): string to be checked
        check (str): encoding code page (cp932 is Japanese)
    
    Output:
        boolean
    """
    try:
        # Try to decode the string using 'cp932' encoding
        decoded_string = string.encode(check)
    except UnicodeEncodeError:
        # If encoding raises an error, the string is not 'cp932'
        return True
    else:
        # If encoding is successful, the string is 'cp932'
        return False

def save_full_text(path: str, fn1: str, fn2: str, response):
    """
    Save block by block in vision API response
    
    Input:
        path (str): path to save file
        fn1 (str): filename for storing recognized text
        fn2 (str): filename for storing confidence of recognized text
    """
    # Open the file in write mode
    with open(path+fn1, "w") as file:
        # Write each element of the list to a new line in the file
        for page in response.full_text_annotation.pages:
            for block in page.blocks:
                block_text = []
                for paragraph in block.paragraphs:
                    paragraph_text = []
                    for word in paragraph.words:
                        word_text = "".join([symbol.text for symbol in word.symbols])
                        paragraph_text.append(word_text)
                    block_text = ''.join(paragraph_text)
                    result_str = ''.join(block_text)
                    # testing
                    #print(result_str)
                    if is_not_encoding(string=result_str):
                        print(f"Wrong encoding: {result_str}")
                    else:
                        file.write(f"{result_str}\n")
                
    with open(path+fn2,"w") as file:
        for page in response.full_text_annotation.pages:
            for block in page.blocks:
                block_confidence = []
                block_confidence.append(block.confidence)
                for paragraph in block.paragraphs:
                    paragraph_confidence = []
                    paragraph_confidence.append(paragraph.confidence)
                file.write(f"{block_confidence}\n")
    #print(f"List contents saved to {path+fn1}")
    #print(f"List confidence saved to {path+fn2}")
    
# read all image and save txt (page_x.txt && page_x_confidence.txt) to directory
def save_all_full_text(directory_path: str, save_path: str):
    """
    Processing all JPG in directory, call vision API, and then save data.
    
    Input: 
        directory_path (str): path that lists all JPGs
        save_path (str): path that stores txt
    """
    # Loop over all image files in the directory
    for filename in tqdm(os.listdir(directory_path),desc='Processing jpg: '):
        if filename.endswith(".jpg") or filename.endswith(".png"):
            # set up image path
            image_path = os.path.join(directory_path, filename)
            # detect OCR using vision API
            try:
                image,response = detect_document(image_path)
            except:
                print(f"An execution error encountered at {filename}")
            # then save all OCR text to .txt file with confidence -> remove .jpg from filename
            savename = filename.replace('.jpg','')
            save_full_text(path=save_path,fn1=f'{savename}.txt',fn2=f'{savename}_confidence.txt',response=response)
            # testing
            #print_response(response=response)
    return

In [6]:
# once the key is set up and downloaded, config the environmental variables
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'C:/service_account_PubKey.json'

In [7]:
# config paths
input_path = './pdf/STD/'
output_path = './jpg/STD/'

In [8]:
# show all STD pdf files
STD_files = list_files(directory=input_path)
print(f"Files of STD: {STD_files}")
print(f"Length: {len(STD_files)}")

Files of STD: ['H19_1月~3月.pdf', 'H19_4月.pdf', 'H19_5月.pdf', 'H19_6月~7月.pdf', 'H19_8月(2).pdf', 'H_198月.pdf']
Length: 6


# Total of 6 folders to process: 1) to JPG, 2) recognise text

Proceessing PDF (1/6) (note: do not use loop for readibility)

In [9]:
# set paths: save txt file to same folder as jpg
fn_0 = STD_files[0]
folder_0 = fn_0.replace(".pdf","")
pdf_path = f'./pdf/STD/{fn_0}'
jpg_path = f'./jpg/STD/{folder_0}/'

In [10]:
# convert PDF to JPG
pdf_to_images(pdf_path=pdf_path, output_folder=jpg_path, total_pages=None)

Processing Pages: 100%|████████████████████████████████████████████████████████████████| 78/78 [13:16<00:00, 10.21s/it]

Total page is 78, so exit the program.





In [11]:
# OCR all images and return txt files: processing all fn_0
save_all_full_text(directory_path=jpg_path,save_path=jpg_path)

Processing jpg:   4%|██▊                                                              | 10/235 [00:10<04:13,  1.13s/it]

Wrong encoding: LỌLỘLỌLOLOLO101010


Processing jpg:   6%|███▌                                                             | 13/235 [00:12<03:47,  1.03s/it]

Wrong encoding: CÓ0070GIACỦNH


Processing jpg:   7%|████▍                                                            | 16/235 [00:15<03:42,  1.02s/it]

Wrong encoding: LÒLỒLỘLàLỘLaLỒLỘLỘ


Processing jpg:  18%|███████████▉                                                     | 43/235 [00:38<02:52,  1.11it/s]

Wrong encoding: CÓCÓ10ĐICANH


Processing jpg:  22%|██████████████▍                                                  | 52/235 [00:45<02:35,  1.18it/s]

Wrong encoding: LỌLỘLỌLO-345


Processing jpg:  29%|██████████████████▊                                              | 68/235 [00:56<01:47,  1.56it/s]

An execution error encountered at page_3.jpg


Processing jpg:  43%|███████████████████████████▌                                    | 101/235 [01:25<01:28,  1.51it/s]

Wrong encoding: LỒLỒLỘLỒLỒLÀLỘLỒLỘ


Processing jpg:  45%|████████████████████████████▊                                   | 106/235 [01:30<01:43,  1.25it/s]

Wrong encoding: HƯXẺH*I]:2007/02/1311:24:14測定地点:8M


Processing jpg:  50%|████████████████████████████████▏                               | 118/235 [01:39<01:36,  1.21it/s]

Wrong encoding: •


Processing jpg:  51%|████████████████████████████████▉                               | 121/235 [01:42<01:33,  1.22it/s]

Wrong encoding: HẠOVLỌCÓECÓ


Processing jpg:  57%|████████████████████████████████████▍                           | 134/235 [01:52<01:19,  1.27it/s]

Wrong encoding: H]RẺH*I]:2007/03/2810:14:45測定地点:8M


Processing jpg:  58%|█████████████████████████████████████                           | 136/235 [01:55<01:37,  1.01it/s]

Wrong encoding: LỄ,L,LỘL,L,Là


Processing jpg:  60%|██████████████████████████████████████▋                         | 142/235 [02:00<01:26,  1.07it/s]

Wrong encoding: LLồLỒLỒLÀL,L,L


Processing jpg:  63%|████████████████████████████████████████▎                       | 148/235 [02:05<01:13,  1.18it/s]

Wrong encoding: LỘLỘLỘLỘLỘLỘLỘLÀ23456789


Processing jpg:  71%|█████████████████████████████████████████████▍                  | 167/235 [02:21<00:52,  1.30it/s]

Wrong encoding: LOLOLỌLỘLỌLỘLỌLOLO


Processing jpg:  73%|██████████████████████████████████████████████▊                 | 172/235 [02:25<00:55,  1.13it/s]

Wrong encoding: 測定時刻:2007/01/2213:34:41NÍÞÁ:9M
Wrong encoding: LỘLỘLOLỒLLỒLàLàCO


Processing jpg:  80%|██████████████████████████████████████████████████▉             | 187/235 [02:37<00:39,  1.21it/s]

Wrong encoding: 6760LÀ


Processing jpg:  81%|███████████████████████████████████████████████████▋            | 190/235 [02:40<00:39,  1.15it/s]

Wrong encoding: LỒLỘLỘLLỒLÀLỒLỘLỘ


Processing jpg:  82%|████████████████████████████████████████████████████▌           | 193/235 [02:43<00:35,  1.17it/s]

Wrong encoding: 測定時刻:2007/01/16NËŁA:8M


Processing jpg:  86%|███████████████████████████████████████████████████████         | 202/235 [02:50<00:33,  1.02s/it]

Wrong encoding: □N3+10201∞@OHNMLOCOMØ


Processing jpg:  92%|███████████████████████████████████████████████████████████     | 217/235 [03:04<00:17,  1.03it/s]

Wrong encoding: 測定時刻:2007/01/1110:31:14HURẺPHIẢ:8M
Wrong encoding: LỌLỘLỌLLỌLOLO


Processing jpg:  95%|████████████████████████████████████████████████████████████▋   | 223/235 [03:08<00:09,  1.25it/s]

Wrong encoding: HƯRẺU*I]:2007/01/1010:36:14測定地点:8M


Processing jpg: 100%|████████████████████████████████████████████████████████████████| 235/235 [03:13<00:00,  1.21it/s]


Processing PDF (2/6)

In [12]:
# set paths: save txt file to same folder as jpg
fn_1 = STD_files[0]
folder_1 = fn_1.replace(".pdf","")
pdf_path = f'./pdf/STD/{fn_1}'
jpg_path = f'./jpg/STD/{folder_1}/'

In [13]:
# convert PDF to JPG
pdf_to_images(pdf_path=pdf_path, output_folder=jpg_path, total_pages=None)

Processing Pages: 100%|████████████████████████████████████████████████████████████████| 78/78 [12:34<00:00,  9.68s/it]

Total page is 78, so exit the program.





In [14]:
# OCR all images and return txt files: processing all fn_0
save_all_full_text(directory_path=jpg_path,save_path=jpg_path)

Processing jpg:   4%|██▊                                                              | 10/235 [00:09<03:55,  1.05s/it]

Wrong encoding: LỌLỘLỌLOLOLO101010


Processing jpg:   6%|███▌                                                             | 13/235 [00:11<03:36,  1.03it/s]

Wrong encoding: CÓ0070GIACỦNH


Processing jpg:   7%|████▍                                                            | 16/235 [00:14<03:21,  1.09it/s]

Wrong encoding: LÒLỒLỘLàLỘLaLỒLỘLỘ


Processing jpg:  18%|███████████▉                                                     | 43/235 [00:36<02:52,  1.11it/s]

Wrong encoding: CÓCÓ10ĐICANH


Processing jpg:  22%|██████████████▍                                                  | 52/235 [00:43<02:33,  1.19it/s]

Wrong encoding: LỌLỘLỌLO-345


Processing jpg:  29%|██████████████████▊                                              | 68/235 [00:53<01:44,  1.59it/s]

An execution error encountered at page_3.jpg


Processing jpg:  43%|███████████████████████████▌                                    | 101/235 [01:17<01:30,  1.48it/s]

Wrong encoding: LỒLỒLỘLỒLỒLÀLỘLỒLỘ


Processing jpg:  45%|████████████████████████████▊                                   | 106/235 [01:21<01:38,  1.32it/s]

Wrong encoding: HƯXẺH*I]:2007/02/1311:24:14測定地点:8M


Processing jpg:  50%|████████████████████████████████▏                               | 118/235 [01:29<01:23,  1.40it/s]

Wrong encoding: •


Processing jpg:  51%|████████████████████████████████▉                               | 121/235 [01:33<01:33,  1.22it/s]

Wrong encoding: HẠOVLỌCÓECÓ


Processing jpg:  57%|████████████████████████████████████▍                           | 134/235 [01:42<01:14,  1.35it/s]

Wrong encoding: H]RẺH*I]:2007/03/2810:14:45測定地点:8M


Processing jpg:  58%|█████████████████████████████████████                           | 136/235 [01:45<01:32,  1.07it/s]

Wrong encoding: LỄ,L,LỘL,L,Là


Processing jpg:  60%|██████████████████████████████████████▋                         | 142/235 [01:50<01:19,  1.18it/s]

Wrong encoding: LLồLỒLỒLÀL,L,L


Processing jpg:  63%|████████████████████████████████████████▎                       | 148/235 [01:55<01:12,  1.20it/s]

Wrong encoding: LỘLỘLỘLỘLỘLỘLỘLÀ23456789


Processing jpg:  71%|█████████████████████████████████████████████▍                  | 167/235 [02:09<00:48,  1.41it/s]

Wrong encoding: LOLOLỌLỘLỌLỘLỌLOLO


Processing jpg:  73%|██████████████████████████████████████████████▊                 | 172/235 [02:13<00:49,  1.27it/s]

Wrong encoding: 測定時刻:2007/01/2213:34:41NÍÞÁ:9M
Wrong encoding: LỘLỘLOLỒLLỒLàLàCO


Processing jpg:  80%|██████████████████████████████████████████████████▉             | 187/235 [02:24<00:36,  1.33it/s]

Wrong encoding: 6760LÀ


Processing jpg:  81%|███████████████████████████████████████████████████▋            | 190/235 [02:27<00:35,  1.27it/s]

Wrong encoding: LỒLỘLỘLLỒLÀLỒLỘLỘ


Processing jpg:  82%|████████████████████████████████████████████████████▌           | 193/235 [02:30<00:36,  1.15it/s]

Wrong encoding: 測定時刻:2007/01/16NËŁA:8M


Processing jpg:  86%|███████████████████████████████████████████████████████         | 202/235 [02:38<00:30,  1.07it/s]

Wrong encoding: □N3+10201∞@OHNMLOCOMØ


Processing jpg:  92%|███████████████████████████████████████████████████████████     | 217/235 [02:52<00:16,  1.10it/s]

Wrong encoding: 測定時刻:2007/01/1110:31:14HURẺPHIẢ:8M
Wrong encoding: LỌLỘLỌLLỌLOLO


Processing jpg:  95%|████████████████████████████████████████████████████████████▋   | 223/235 [02:56<00:09,  1.21it/s]

Wrong encoding: HƯRẺU*I]:2007/01/1010:36:14測定地点:8M


Processing jpg: 100%|████████████████████████████████████████████████████████████████| 235/235 [03:04<00:00,  1.28it/s]


Processing PDF (3/6)

In [15]:
# set paths: save txt file to same folder as jpg
fn_2 = STD_files[0]
folder_2 = fn_2.replace(".pdf","")
pdf_path = f'./pdf/STD/{fn_2}'
jpg_path = f'./jpg/STD/{folder_2}/'

In [16]:
# convert PDF to JPG
pdf_to_images(pdf_path=pdf_path, output_folder=jpg_path, total_pages=None)

Processing Pages: 100%|████████████████████████████████████████████████████████████████| 78/78 [16:14<00:00, 12.50s/it]

Total page is 78, so exit the program.





In [17]:
# OCR all images and return txt files: processing all fn_0
save_all_full_text(directory_path=jpg_path,save_path=jpg_path)

Processing jpg:   4%|██▊                                                              | 10/235 [00:08<03:35,  1.05it/s]

Wrong encoding: LỌLỘLỌLOLOLO101010


Processing jpg:   6%|███▌                                                             | 13/235 [00:11<03:17,  1.12it/s]

Wrong encoding: CÓ0070GIACỦNH


Processing jpg:   7%|████▍                                                            | 16/235 [00:13<03:01,  1.21it/s]

Wrong encoding: LÒLỒLỘLàLỘLaLỒLỘLỘ


Processing jpg:  18%|███████████▉                                                     | 43/235 [00:33<02:31,  1.27it/s]

Wrong encoding: CÓCÓ10ĐICANH


Processing jpg:  22%|██████████████▍                                                  | 52/235 [00:39<02:07,  1.44it/s]

Wrong encoding: LỌLỘLỌLO-345


Processing jpg:  29%|██████████████████▊                                              | 68/235 [00:48<01:30,  1.85it/s]

An execution error encountered at page_3.jpg


Processing jpg:  43%|███████████████████████████▌                                    | 101/235 [01:15<01:34,  1.42it/s]

Wrong encoding: LỒLỒLỘLỒLỒLÀLỘLỒLỘ


Processing jpg:  45%|████████████████████████████▊                                   | 106/235 [01:19<01:49,  1.18it/s]

Wrong encoding: HƯXẺH*I]:2007/02/1311:24:14測定地点:8M


Processing jpg:  50%|████████████████████████████████▏                               | 118/235 [01:30<01:46,  1.10it/s]

Wrong encoding: •


Processing jpg:  51%|████████████████████████████████▉                               | 121/235 [01:33<01:46,  1.07it/s]

Wrong encoding: HẠOVLỌCÓECÓ


Processing jpg:  57%|████████████████████████████████████▍                           | 134/235 [01:43<01:19,  1.28it/s]

Wrong encoding: H]RẺH*I]:2007/03/2810:14:45測定地点:8M


Processing jpg:  58%|█████████████████████████████████████                           | 136/235 [01:45<01:22,  1.19it/s]

Wrong encoding: LỄ,L,LỘL,L,Là


Processing jpg:  60%|██████████████████████████████████████▋                         | 142/235 [01:49<01:11,  1.31it/s]

Wrong encoding: LLồLỒLỒLÀL,L,L


Processing jpg:  63%|████████████████████████████████████████▎                       | 148/235 [01:53<01:02,  1.39it/s]

Wrong encoding: LỘLỘLỘLỘLỘLỘLỘLÀ23456789


Processing jpg:  71%|█████████████████████████████████████████████▍                  | 167/235 [02:06<00:44,  1.52it/s]

Wrong encoding: LOLOLỌLỘLỌLỘLỌLOLO


Processing jpg:  73%|██████████████████████████████████████████████▊                 | 172/235 [02:11<00:49,  1.28it/s]

Wrong encoding: 測定時刻:2007/01/2213:34:41NÍÞÁ:9M
Wrong encoding: LỘLỘLOLỒLLỒLàLàCO


Processing jpg:  80%|██████████████████████████████████████████████████▉             | 187/235 [02:23<00:38,  1.24it/s]

Wrong encoding: 6760LÀ


Processing jpg:  81%|███████████████████████████████████████████████████▋            | 190/235 [02:26<00:38,  1.18it/s]

Wrong encoding: LỒLỘLỘLLỒLÀLỒLỘLỘ


Processing jpg:  82%|████████████████████████████████████████████████████▌           | 193/235 [02:29<00:35,  1.17it/s]

Wrong encoding: 測定時刻:2007/01/16NËŁA:8M


Processing jpg:  86%|███████████████████████████████████████████████████████         | 202/235 [02:37<00:33,  1.01s/it]

Wrong encoding: □N3+10201∞@OHNMLOCOMØ


Processing jpg:  92%|███████████████████████████████████████████████████████████     | 217/235 [02:50<00:16,  1.08it/s]

Wrong encoding: 測定時刻:2007/01/1110:31:14HURẺPHIẢ:8M
Wrong encoding: LỌLỘLỌLLỌLOLO


Processing jpg:  95%|████████████████████████████████████████████████████████████▋   | 223/235 [02:54<00:09,  1.21it/s]

Wrong encoding: HƯRẺU*I]:2007/01/1010:36:14測定地点:8M


Processing jpg: 100%|████████████████████████████████████████████████████████████████| 235/235 [03:02<00:00,  1.29it/s]


Processing PDF (4/6)

In [18]:
# set paths: save txt file to same folder as jpg
fn_3 = STD_files[0]
folder_3 = fn_3.replace(".pdf","")
pdf_path = f'./pdf/STD/{fn_3}'
jpg_path = f'./jpg/STD/{folder_3}/'

In [19]:
# convert PDF to JPG
pdf_to_images(pdf_path=pdf_path, output_folder=jpg_path, total_pages=None)

Processing Pages: 100%|████████████████████████████████████████████████████████████████| 78/78 [18:46<00:00, 14.44s/it]

Total page is 78, so exit the program.





In [20]:
# OCR all images and return txt files: processing all fn_0
save_all_full_text(directory_path=jpg_path,save_path=jpg_path)

Processing jpg:   4%|██▊                                                              | 10/235 [00:09<03:40,  1.02it/s]

Wrong encoding: LỌLỘLỌLOLOLO101010


Processing jpg:   6%|███▌                                                             | 13/235 [00:11<03:30,  1.06it/s]

Wrong encoding: CÓ0070GIACỦNH


Processing jpg:   7%|████▍                                                            | 16/235 [00:14<03:14,  1.13it/s]

Wrong encoding: LÒLỒLỘLàLỘLaLỒLỘLỘ


Processing jpg:  18%|███████████▉                                                     | 43/235 [00:36<02:48,  1.14it/s]

Wrong encoding: CÓCÓ10ĐICANH


Processing jpg:  22%|██████████████▍                                                  | 52/235 [00:43<02:31,  1.20it/s]

Wrong encoding: LỌLỘLỌLO-345


Processing jpg:  29%|██████████████████▊                                              | 68/235 [00:53<01:45,  1.59it/s]

An execution error encountered at page_3.jpg


Processing jpg:  43%|███████████████████████████▌                                    | 101/235 [01:22<01:55,  1.16it/s]

Wrong encoding: LỒLỒLỘLỒLỒLÀLỘLỒLỘ


Processing jpg:  45%|████████████████████████████▊                                   | 106/235 [01:27<02:00,  1.07it/s]

Wrong encoding: HƯXẺH*I]:2007/02/1311:24:14測定地点:8M


Processing jpg:  50%|████████████████████████████████▏                               | 118/235 [01:36<01:38,  1.19it/s]

Wrong encoding: •


Processing jpg:  51%|████████████████████████████████▉                               | 121/235 [01:40<01:45,  1.08it/s]

Wrong encoding: HẠOVLỌCÓECÓ


Processing jpg:  57%|████████████████████████████████████▍                           | 134/235 [01:51<01:26,  1.16it/s]

Wrong encoding: H]RẺH*I]:2007/03/2810:14:45測定地点:8M


Processing jpg:  58%|█████████████████████████████████████                           | 136/235 [01:54<01:48,  1.09s/it]

Wrong encoding: LỄ,L,LỘL,L,Là


Processing jpg:  60%|██████████████████████████████████████▋                         | 142/235 [01:59<01:25,  1.09it/s]

Wrong encoding: LLồLỒLỒLÀL,L,L


Processing jpg:  63%|████████████████████████████████████████▎                       | 148/235 [02:03<01:11,  1.22it/s]

Wrong encoding: LỘLỘLỘLỘLỘLỘLỘLÀ23456789


Processing jpg:  71%|█████████████████████████████████████████████▍                  | 167/235 [02:17<00:47,  1.44it/s]

Wrong encoding: LOLOLỌLỘLỌLỘLỌLOLO


Processing jpg:  73%|██████████████████████████████████████████████▊                 | 172/235 [02:23<00:56,  1.12it/s]

Wrong encoding: 測定時刻:2007/01/2213:34:41NÍÞÁ:9M
Wrong encoding: LỘLỘLOLỒLLỒLàLàCO


Processing jpg:  80%|██████████████████████████████████████████████████▉             | 187/235 [02:37<00:42,  1.14it/s]

Wrong encoding: 6760LÀ


Processing jpg:  81%|███████████████████████████████████████████████████▋            | 190/235 [02:39<00:38,  1.17it/s]

Wrong encoding: LỒLỘLỘLLỒLÀLỒLỘLỘ


Processing jpg:  82%|████████████████████████████████████████████████████▌           | 193/235 [02:41<00:33,  1.24it/s]

Wrong encoding: 測定時刻:2007/01/16NËŁA:8M


Processing jpg:  86%|███████████████████████████████████████████████████████         | 202/235 [02:50<00:32,  1.03it/s]

Wrong encoding: □N3+10201∞@OHNMLOCOMØ


Processing jpg:  92%|███████████████████████████████████████████████████████████     | 217/235 [03:03<00:16,  1.11it/s]

Wrong encoding: 測定時刻:2007/01/1110:31:14HURẺPHIẢ:8M
Wrong encoding: LỌLỘLỌLLỌLOLO


Processing jpg:  95%|████████████████████████████████████████████████████████████▋   | 223/235 [03:08<00:10,  1.12it/s]

Wrong encoding: HƯRẺU*I]:2007/01/1010:36:14測定地点:8M


Processing jpg: 100%|████████████████████████████████████████████████████████████████| 235/235 [03:15<00:00,  1.20it/s]


Processing PDF (5/6)

In [21]:
# set paths: save txt file to same folder as jpg
fn_4 = STD_files[0]
folder_4 = fn_4.replace(".pdf","")
pdf_path = f'./pdf/STD/{fn_4}'
jpg_path = f'./jpg/STD/{folder_4}/'

In [22]:
# convert PDF to JPG
pdf_to_images(pdf_path=pdf_path, output_folder=jpg_path, total_pages=None)

Processing Pages: 100%|████████████████████████████████████████████████████████████████| 78/78 [17:17<00:00, 13.30s/it]

Total page is 78, so exit the program.





In [23]:
# OCR all images and return txt files: processing all fn_0
save_all_full_text(directory_path=jpg_path,save_path=jpg_path)

Processing jpg:   4%|██▊                                                              | 10/235 [00:09<03:21,  1.11it/s]

Wrong encoding: LỌLỘLỌLOLOLO101010


Processing jpg:   6%|███▌                                                             | 13/235 [00:11<03:21,  1.10it/s]

Wrong encoding: CÓ0070GIACỦNH


Processing jpg:   7%|████▍                                                            | 16/235 [00:14<03:11,  1.15it/s]

Wrong encoding: LÒLỒLỘLàLỘLaLỒLỘLỘ


Processing jpg:  18%|███████████▉                                                     | 43/235 [00:37<02:41,  1.19it/s]

Wrong encoding: CÓCÓ10ĐICANH


Processing jpg:  22%|██████████████▍                                                  | 52/235 [00:42<01:59,  1.53it/s]

Wrong encoding: LỌLỘLỌLO-345


Processing jpg:  29%|██████████████████▊                                              | 68/235 [00:53<01:45,  1.59it/s]

An execution error encountered at page_3.jpg


Processing jpg:  43%|███████████████████████████▌                                    | 101/235 [01:18<01:34,  1.42it/s]

Wrong encoding: LỒLỒLỘLỒLỒLÀLỘLỒLỘ


Processing jpg:  45%|████████████████████████████▊                                   | 106/235 [01:24<02:01,  1.06it/s]

Wrong encoding: HƯXẺH*I]:2007/02/1311:24:14測定地点:8M


Processing jpg:  50%|████████████████████████████████▏                               | 118/235 [01:31<01:25,  1.37it/s]

Wrong encoding: •


Processing jpg:  51%|████████████████████████████████▉                               | 121/235 [01:34<01:26,  1.31it/s]

Wrong encoding: HẠOVLỌCÓECÓ


Processing jpg:  57%|████████████████████████████████████▍                           | 134/235 [01:43<01:10,  1.44it/s]

Wrong encoding: H]RẺH*I]:2007/03/2810:14:45測定地点:8M


Processing jpg:  58%|█████████████████████████████████████                           | 136/235 [01:45<01:16,  1.30it/s]

Wrong encoding: LỄ,L,LỘL,L,Là


Processing jpg:  60%|██████████████████████████████████████▋                         | 142/235 [01:50<01:12,  1.28it/s]

Wrong encoding: LLồLỒLỒLÀL,L,L


Processing jpg:  63%|████████████████████████████████████████▎                       | 148/235 [01:54<01:03,  1.36it/s]

Wrong encoding: LỘLỘLỘLỘLỘLỘLỘLÀ23456789


Processing jpg:  71%|█████████████████████████████████████████████▍                  | 167/235 [02:09<00:50,  1.34it/s]

Wrong encoding: LOLOLỌLỘLỌLỘLỌLOLO


Processing jpg:  73%|██████████████████████████████████████████████▊                 | 172/235 [02:14<00:56,  1.11it/s]

Wrong encoding: 測定時刻:2007/01/2213:34:41NÍÞÁ:9M
Wrong encoding: LỘLỘLOLỒLLỒLàLàCO


Processing jpg:  80%|██████████████████████████████████████████████████▉             | 187/235 [02:24<00:36,  1.33it/s]

Wrong encoding: 6760LÀ


Processing jpg:  81%|███████████████████████████████████████████████████▋            | 190/235 [02:26<00:31,  1.45it/s]

Wrong encoding: LỒLỘLỘLLỒLÀLỒLỘLỘ


Processing jpg:  82%|████████████████████████████████████████████████████▌           | 193/235 [02:29<00:31,  1.34it/s]

Wrong encoding: 測定時刻:2007/01/16NËŁA:8M


Processing jpg:  86%|███████████████████████████████████████████████████████         | 202/235 [02:35<00:26,  1.25it/s]

Wrong encoding: □N3+10201∞@OHNMLOCOMØ


Processing jpg:  92%|███████████████████████████████████████████████████████████     | 217/235 [02:48<00:14,  1.22it/s]

Wrong encoding: 測定時刻:2007/01/1110:31:14HURẺPHIẢ:8M
Wrong encoding: LỌLỘLỌLLỌLOLO


Processing jpg:  95%|████████████████████████████████████████████████████████████▋   | 223/235 [02:52<00:09,  1.25it/s]

Wrong encoding: HƯRẺU*I]:2007/01/1010:36:14測定地点:8M


Processing jpg: 100%|████████████████████████████████████████████████████████████████| 235/235 [03:00<00:00,  1.30it/s]


Processing PDF (6/6)

In [24]:
# set paths: save txt file to same folder as jpg
fn_5 = STD_files[0]
folder_5 = fn_5.replace(".pdf","")
pdf_path = f'./pdf/STD/{fn_5}'
jpg_path = f'./jpg/STD/{folder_5}/'

In [25]:
# convert PDF to JPG
pdf_to_images(pdf_path=pdf_path, output_folder=jpg_path, total_pages=None)

Processing Pages: 100%|████████████████████████████████████████████████████████████████| 78/78 [16:36<00:00, 12.78s/it]

Total page is 78, so exit the program.





In [26]:
# OCR all images and return txt files: processing all fn_0
save_all_full_text(directory_path=jpg_path,save_path=jpg_path)

Processing jpg:   4%|██▊                                                              | 10/235 [00:08<03:17,  1.14it/s]

Wrong encoding: LỌLỘLỌLOLOLO101010


Processing jpg:   6%|███▌                                                             | 13/235 [00:11<03:18,  1.12it/s]

Wrong encoding: CÓ0070GIACỦNH


Processing jpg:   7%|████▍                                                            | 16/235 [00:13<02:57,  1.23it/s]

Wrong encoding: LÒLỒLỘLàLỘLaLỒLỘLỘ


Processing jpg:  18%|███████████▉                                                     | 43/235 [00:35<02:39,  1.21it/s]

Wrong encoding: CÓCÓ10ĐICANH


Processing jpg:  22%|██████████████▍                                                  | 52/235 [00:42<02:16,  1.34it/s]

Wrong encoding: LỌLỘLỌLO-345


Processing jpg:  29%|██████████████████▊                                              | 68/235 [00:51<01:35,  1.75it/s]

An execution error encountered at page_3.jpg


Processing jpg:  43%|███████████████████████████▌                                    | 101/235 [01:18<01:34,  1.42it/s]

Wrong encoding: LỒLỒLỘLỒLỒLÀLỘLỒLỘ


Processing jpg:  45%|████████████████████████████▊                                   | 106/235 [01:24<02:02,  1.05it/s]

Wrong encoding: HƯXẺH*I]:2007/02/1311:24:14測定地点:8M


Processing jpg:  50%|████████████████████████████████▏                               | 118/235 [01:34<01:42,  1.14it/s]

Wrong encoding: •


Processing jpg:  51%|████████████████████████████████▉                               | 121/235 [01:37<01:45,  1.08it/s]

Wrong encoding: HẠOVLỌCÓECÓ


Processing jpg:  57%|████████████████████████████████████▍                           | 134/235 [01:47<01:22,  1.23it/s]

Wrong encoding: H]RẺH*I]:2007/03/2810:14:45測定地点:8M


Processing jpg:  58%|█████████████████████████████████████                           | 136/235 [01:49<01:27,  1.13it/s]

Wrong encoding: LỄ,L,LỘL,L,Là


Processing jpg:  60%|██████████████████████████████████████▋                         | 142/235 [01:55<01:22,  1.13it/s]

Wrong encoding: LLồLỒLỒLÀL,L,L


Processing jpg:  63%|████████████████████████████████████████▎                       | 148/235 [01:59<01:13,  1.19it/s]

Wrong encoding: LỘLỘLỘLỘLỘLỘLỘLÀ23456789


Processing jpg:  71%|█████████████████████████████████████████████▍                  | 167/235 [02:12<00:45,  1.51it/s]

Wrong encoding: LOLOLỌLỘLỌLỘLỌLOLO


Processing jpg:  73%|██████████████████████████████████████████████▊                 | 172/235 [02:17<00:52,  1.21it/s]

Wrong encoding: 測定時刻:2007/01/2213:34:41NÍÞÁ:9M
Wrong encoding: LỘLỘLOLỒLLỒLàLàCO


Processing jpg:  80%|██████████████████████████████████████████████████▉             | 187/235 [02:31<00:41,  1.15it/s]

Wrong encoding: 6760LÀ


Processing jpg:  81%|███████████████████████████████████████████████████▋            | 190/235 [02:33<00:37,  1.20it/s]

Wrong encoding: LỒLỘLỘLLỒLÀLỒLỘLỘ


Processing jpg:  82%|████████████████████████████████████████████████████▌           | 193/235 [02:36<00:34,  1.22it/s]

Wrong encoding: 測定時刻:2007/01/16NËŁA:8M


Processing jpg:  86%|███████████████████████████████████████████████████████         | 202/235 [02:43<00:29,  1.11it/s]

Wrong encoding: □N3+10201∞@OHNMLOCOMØ


Processing jpg:  92%|███████████████████████████████████████████████████████████     | 217/235 [02:57<00:16,  1.09it/s]

Wrong encoding: 測定時刻:2007/01/1110:31:14HURẺPHIẢ:8M
Wrong encoding: LỌLỘLỌLLỌLOLO


Processing jpg:  95%|████████████████████████████████████████████████████████████▋   | 223/235 [03:03<00:11,  1.05it/s]

Wrong encoding: HƯRẺU*I]:2007/01/1010:36:14測定地点:8M


Processing jpg: 100%|████████████████████████████████████████████████████████████████| 235/235 [03:09<00:00,  1.24it/s]


In [27]:
# testing page_22, page_23
#testing_path = f'./jpg/STD/testing/'
#save_all_full_text(directory_path=testing_path,save_path=testing_path)

End of code