# Setup
1) need to download modules included below
2) install ImagerMagick following the instrution in website of wand.
3) install google.cloud.vision -> configurate your credentials and private key for connection

In [27]:
# for convert PDF to JPG
from wand.image import Image as WandImage
from PyPDF2 import PdfReader
# ML based OCR 
import pytesseract  
from PIL import Image
# for progress bar
from tqdm import tqdm
# other
import os

In [4]:
# config paths
input_path = './pdf/'
output_path = './jpg/'
fn = "平成30年ー平成31年度船舶使用願.pdf"

In [25]:
def pdfToImages(pdf_path, output_folder, total_pages=None):
    # Create output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)

    # Open the PDF file
    with open(pdf_path, 'rb') as pdf_file:
        pdf_reader = PdfReader(pdf_file)

        # Change to the total number of pages in your PDF
        #total_pages = 3
        if total_pages is None:
            total_pages = len(pdf_reader.pages)
        for page_number in tqdm(range(0, total_pages), desc="Processing Pages"):
            # Convert each page to an image
            with WandImage(filename=f'{pdf_path}[{page_number}]', resolution=400) as img:
                # resize the image to ensure high resolution: also resol=300
                img.resize(width=2 * img.width, height=2 * img.height)
                # Save the image as JPG in the output folder
                img.save(filename=os.path.join(output_folder, f'page_{page_number+1}.jpg'))
        print(f"Total page is {total_pages}, so exit the program.")
    return

def OCRImage(image_path, config=None):
    # OCR on the image
    image = Image.open(image_path)
    if config is not None:
        text = pytesseract.image_to_string(image, lang='jpn', config=config)
    else:
        # else default
        text = pytesseract.image_to_string(image, lang='jpn')
    #print(f"Text:\n{text}\n")
    return text

In [6]:
# reading pdf and converting to jpg: testing 3 cases first
pdfToImages(pdf_path=input_path+fn, output_folder=output_path, total_pages=3)

Processing Pages: 100%|██████████| 3/3 [00:20<00:00,  6.67s/it]

Total page is 3, so exit the program.





In [7]:
# reading pdf and converting to jpg: try all pages -> takes >30 mins
pdfToImages(pdf_path=input_path+fn, output_folder=output_path, total_pages=None)

Processing Pages: 100%|██████████| 228/228 [32:17<00:00,  8.50s/it] 

Total page is 228, so exit the program.





In [23]:
# read text from img: testing page 1 which is PC typed
#text = OCRImage(image_path='./jpg/page_1.jpg')
# note: make sure jpn.traineddata is downloaded and placed to Tesseract-OCR/tessdata/ folder.
#    -  jpn.traineddata: it is downloaded from GitHub as a pre-trained JPN data (users can train their own dataset).
# note: TESSDATA_PREFIX should direct to location of tessdata (just in case)
###############################################################################
#print(text)

In [6]:
# read text from img: testing page 2 & 3 which is hand written
#text = OCRImage(image_path='./jpg/page_2.jpg', config='--psm 11 --oem 3')
# note: changing resolution (300 -> 400) can affect the results
# --psm N: N from 0 to 13, psm configurates structure of img
# --oem N: N from 0 to 3, oem congifurates OCR engine mode
###############################################################################
#print(text)

In [7]:
# read text from img: testing page 2 & 3 which is hand written
#text = OCRImage(image_path='./jpg/page_3.jpg')
###############################################################################
#print(text)

# Use google cloud vision API below

In [12]:
# import google libraries
from google.cloud import vision
import pandas as pd
import os
from tqdm import tqdm

In [18]:
# sample code 
def detect_document(path):
    """Detects document features in an image."""
    client = vision.ImageAnnotatorClient()
    with open(path, "rb") as image_file:
        content = image_file.read()
    image = vision.Image(content=content)
    response = client.document_text_detection(image=image)
    if response.error.message:
        raise Exception(
            "{}\nFor more info on error messages, check: "
            "https://cloud.google.com/apis/design/errors".format(response.error.message)
        )
    return image,response


def print_response(response):
    for page in response.full_text_annotation.pages:
        print(f"Current page has {len(page.blocks)} blocks")
        nBlock = 1
        for block in page.blocks:
            block_text = []
            print("####################################")
            print(f'Current block: {nBlock}')
            print(f"\nBlock confidence: {block.confidence}\n")
            for paragraph in block.paragraphs:
                paragraph_text = []
                print("Paragraph confidence: {}".format(paragraph.confidence))
                for word in paragraph.words:
                    word_text = "".join([symbol.text for symbol in word.symbols])
                    paragraph_text.append(word_text)
                block_text = ''.join(paragraph_text)
                print(block_text)
                result_str = ''.join(block_text)
                print(result_str)
            nBlock += 1
    return

def print_full_text(response):
    print(f"Full Text: {response.full_text_annotation.text}")
    return

def save_full_text(path, fn1, fn2, response):
    # Open the file in write mode
    with open(path+fn1, "w") as file:
        # Write each element of the list to a new line in the file
        for page in response.full_text_annotation.pages:
            for block in page.blocks:
                block_text = []
                for paragraph in block.paragraphs:
                    paragraph_text = []
                    for word in paragraph.words:
                        word_text = "".join([symbol.text for symbol in word.symbols])
                        paragraph_text.append(word_text)
                    block_text = ''.join(paragraph_text)
                    result_str = ''.join(block_text)
                file.write(f"{result_str}\n")
                
    with open(path+fn2,"w") as file:
        for page in response.full_text_annotation.pages:
            for block in page.blocks:
                block_confidence = []
                block_confidence.append(block.confidence)
                for paragraph in block.paragraphs:
                    paragraph_confidence = []
                    paragraph_confidence.append(paragraph.confidence)
                file.write(f"{block_confidence}\n")
    #print(f"List contents saved to {path+fn1}")
    #print(f"List confidence saved to {path+fn2}")
    
# read all image and save txt (page_x.txt && page_x_confidence.txt) to directory
def save_all_full_text(directory_path,save_path):
    # Loop over all image files in the directory
    for filename in tqdm(os.listdir(directory_path),desc='Processing jpg: '):
        if filename.endswith(".jpg") or filename.endswith(".png"):
            # set up image path
            image_path = os.path.join(directory_path, filename)
            # detect OCR using vision API
            image,response = detect_document(image_path)
            # then save all OCR text to .txt file with confidence -> remove .jpg from filename
            savename = filename.replace('.jpg','')
            save_full_text(path=save_path,fn1=f'{savename}.txt',fn2=f'{savename}_confidence.txt',response=response)
    return

In [19]:
# once the key is set up and downloaded, config the environmental variables
#os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'C:/service_account_PubKey.json'
# set paths: testing with 5 cases
jpg_path = './jpg/'
testing_path = './jpg/testing/'
H30_H31_path = './jpg/平成30年ー平成31年度船舶使用願/'

In [20]:
# testing one: page 2 handwritten
image, response = detect_document(path=testing_path+'page_2.jpg')
#print_response(response=response)
#print_full_text(response=response)
save_full_text(path=testing_path,fn1='page_2.txt',fn2='page_2_confidence.txt',response=response)

In [21]:
# OCR all images and return txt files: processing testing files
save_all_full_text(directory_path=testing_path,save_path=testing_path)

Processing jpg: 100%|██████████| 15/15 [00:09<00:00,  1.56it/s]


In [22]:
# OCR all images and return txt files: processing all for H30-H31 files
save_all_full_text(directory_path=H30_H31_path,save_path=H30_H31_path)

Processing jpg: 100%|██████████| 362/362 [07:26<00:00,  1.23s/it]


In [28]:
############ Convert 令和元年 #################
R1_pdf_path = './pdf/令和元年船舶使用願.pdf'
R1_jpg_path = './jpg/令和元年船舶使用願/'
pdfToImages(pdf_path=R1_pdf_path,output_folder=R1_jpg_path)

Processing Pages:   0%|          | 0/171 [00:00<?, ?it/s]Processing Pages: 100%|██████████| 171/171 [22:06<00:00,  7.76s/it]

Total page is 171, so exit the program.





In [29]:
# OCR all images and return txt files: processing all for R1 files
save_all_full_text(directory_path=R1_jpg_path,save_path=R1_jpg_path)

Processing jpg: 100%|██████████| 171/171 [05:45<00:00,  2.02s/it]


In [30]:
############ Convert 令和2年 #################
R2_pdf_path = './pdf/令和2年度船舶使用願.pdf'
R2_jpg_path = './jpg/令和2年度船舶使用願/'
pdfToImages(pdf_path=R2_pdf_path,output_folder=R2_jpg_path)

Processing Pages: 100%|██████████| 166/166 [17:32<00:00,  6.34s/it]

Total page is 166, so exit the program.





In [31]:
# OCR all images and return txt files: processing all for R2 files
save_all_full_text(directory_path=R2_jpg_path,save_path=R2_jpg_path)

Processing jpg: 100%|██████████| 166/166 [05:29<00:00,  1.98s/it]


In [32]:
############ Convert 令和3年 #################
R3_pdf_path = './pdf/令和3年度船舶使用願.pdf'
R3_jpg_path = './jpg/令和3年度船舶使用願/'
pdfToImages(pdf_path=R3_pdf_path,output_folder=R3_jpg_path)

Processing Pages: 100%|██████████| 168/168 [16:47<00:00,  6.00s/it]

Total page is 168, so exit the program.





In [33]:
# OCR all images and return txt files: processing all for R3 files
save_all_full_text(directory_path=R3_jpg_path,save_path=R3_jpg_path)

Processing jpg: 100%|██████████| 168/168 [05:22<00:00,  1.92s/it]


In [34]:
############ Convert 令和4年 #################
R4_pdf_path = './pdf/令和4年度船舶使用願.pdf'
R4_jpg_path = './jpg/令和4年度船舶使用願/'
pdfToImages(pdf_path=R4_pdf_path,output_folder=R4_jpg_path)

Processing Pages: 100%|██████████| 143/143 [11:27<00:00,  4.81s/it]

Total page is 143, so exit the program.





In [35]:
# OCR all images and return txt files: processing all for R4 files
save_all_full_text(directory_path=R4_jpg_path,save_path=R4_jpg_path)

Processing jpg: 100%|██████████| 143/143 [04:28<00:00,  1.88s/it]


# TODO: change to txt I/O and reorganize in below

In [36]:
def process_images_in_directory(directory_path, output_text_excel_path, output_confidence_excel_path):
    # Create empty DataFrames to store text and confidence results
    text_df = pd.DataFrame()
    confidence_df = pd.DataFrame()

    # Loop over all image files in the directory
    for filename in os.listdir(directory_path):
        if filename.endswith(".jpg") or filename.endswith(".png"):
            image_path = os.path.join(directory_path, filename)
            print(f"Processing image: {image_path}")

            # Detect document features in the image
            try:
                image,image_results = detect_document(image_path)
            except Exception as e:
                print(f"Error processing image {image_path}: {str(e)}")
                continue

            # Extract information from the results and add to the DataFrames
            text_list = []
            confidence_list = []
            row_final = []

            for page in image_results.full_text_annotation.pages:
                for block in page.blocks:
                    for paragraph in block.paragraphs:
                        paragraph_text = []
                        #print("Paragraph confidence: {}".format(paragraph.confidence))
                        for word in paragraph.words:
                            word_text = "".join([symbol.text for symbol in word.symbols])
                            paragraph_text.append(word_text)
                        confidence_list.append(paragraph.confidence)
                        result_str = ''.join(paragraph_text)
                        
                        if "所属" in result_str:
                            print(result_str)
                        check_append_content(content=result_str, check="平成",output=row_final)
                        check_append_content(content=result_str, check="グランメーユ",output=row_final)
                        check_append_content(content=result_str, check="所属",output=row_final)
                        #check_append_content(content=result_str, check="所属",output=row_final)
                        #check_append_content(content=result_str, check="所属",output=row_final)
                        #check_append_content(content=result_str, check="所属",output=row_final)
                        text_list.append(result_str)

            #text_df[filename] = text_list
            #confidence_df[filename] = confidence_list
            #print(row_final)

    # Save the DataFrames to Excel files
    #text_df.to_excel(output_text_excel_path, index=False)
    #confidence_df.to_excel(output_confidence_excel_path, index=False)

    print(f"Text results saved to {output_text_excel_path}")
    print(f"Confidence results saved to {output_confidence_excel_path}")
    return

def check_append_content(content, check, output, check2=None):
    if check in content:
        return output.append(content)
    else:
        return None

In [None]:
process_images_in_directory(directory_path=jpg_path, output_text_excel_path="./", output_confidence_excel_path="./")