In [1]:
import cv2
from pdf2image import convert_from_path
from pathlib import Path
import boto3
import numpy as np

In [2]:
# Minimum width and height a detected contour must have to be treated as a visual.
# Both are fractions of the page size (0.05 == 5 % of the page width / page height);
# they are multiplied by the page's pixel dimensions before being compared.
# Very small values (e.g. the width of one character) make every character count as a visual.
# The best threshold depends on your documents, so experiment with these values.
MINIMUM_WIDTH = 0.05        
# NOTE: the name is spelled "HEIGTH" (sic); it is referenced under this spelling elsewhere in the notebook.
MINIMUM_HEIGTH = 0.05

# Padding (in pixels) added around each detected visual before it is cropped / redacted.
# Use it to pull in text that belongs to the visual. This is particularly useful for
# charts without borders, where axis values, titles or captions lie outside the detected contour.
LEFT_PADDING = 15
RIGHT_PADDING = 5
TOP_PADDING = 5
BOTTOM_PADDING = 65

In [3]:
# Input and output files/directories.
# The directory strings end with "/" because they are joined to file names by plain
# string concatenation.
input_file_location = "./doc_input/"
# Cropped visuals are written here when image saving is enabled.
output_file_location = "./doc_output/"
# PDF to process, relative to input_file_location.
input_file_name = "test_long.pdf"

In [4]:
# Render the input PDF; `doc` is indexed per page by the processing loop.
input_pdf_path = Path(input_file_location) / input_file_name
doc = convert_from_path(str(input_pdf_path))

In [5]:
# 0-based index of the first page to process; pages before it are skipped
# (the document is sliced as doc[page_start:]).
page_start = 2

# Create a Textract client. No region or credentials are passed here, so boto3
# resolves them from its default configuration chain.
textract = boto3.client('textract')  

In [7]:
# Set to True to also write every detected visual to output_file_location as a PNG.
save_images = False
if save_images:
    # cv2.imwrite cannot write into a directory that does not exist yet.
    Path(output_file_location).mkdir(parents=True, exist_ok=True)

# One raw Textract response per processed page, in page order.
results_per_document = []

# Start counting at page_start so page_number is the page's 0-based index in the PDF
# (page_number + 1 is therefore the human-readable page number used in file names).
for page_number, page in enumerate(doc[page_start:], start=page_start):
    original_img = cv2.cvtColor(np.asarray(page), code=cv2.COLOR_RGB2BGR)
    gray_scale_image = cv2.cvtColor(original_img, cv2.COLOR_BGR2GRAY)
    page_height, page_width = original_img.shape[:2]

    # original_img is redacted in place below. Crops must come from an untouched copy,
    # otherwise a visual whose padded box overlaps an earlier one is saved with white patches.
    crop_source = original_img.copy() if save_images else None

    # Detect edges on the gray-scale page with a Canny edge detector.
    canny_img = cv2.Canny(gray_scale_image, 0, 255, apertureSize=3, L2gradient=True)

    # Outer contours of the detected edges. findContours returns (contours, hierarchy) in
    # OpenCV 4.x and (image, contours, hierarchy) in 3.x; [-2] is the contour list in both.
    contours = cv2.findContours(canny_img, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2]

    # Minimum size, in pixels, a contour must have on this page to count as a visual.
    minimum_width = int(page_width * MINIMUM_WIDTH)
    minimum_height = int(page_height * MINIMUM_HEIGTH)

    image_index = 0
    for c in contours:
        # Top-left corner (x, y), width and height of the contour's bounding box.
        x, y, w, h = cv2.boundingRect(c)

        # Skip contours that are too small to be a visual.
        if w < minimum_width or h < minimum_height:
            continue

        # Padded bounding box, clamped to the page. Without clamping, a visual close to the
        # top/left edge yields a negative slice start, which NumPy interprets as counting
        # from the end of the axis and therefore produces an empty or wrong crop.
        left = max(x - LEFT_PADDING, 0)
        top = max(y - TOP_PADDING, 0)
        right = min(x + w + RIGHT_PADDING, page_width)
        bottom = min(y + h + BOTTOM_PADDING, page_height)

        if save_images:
            image_index += 1
            # Crop the visual from the pristine copy and save it to a file.
            cropped_image = crop_source[top:bottom, left:right]
            cv2.imwrite(fr"{output_file_location}{Path(input_file_name).stem}_page_{page_number + 1}_image_{image_index}.png", cropped_image)

        # Redact the visual on the page with a filled white rectangle so that OCR
        # only sees the remaining text.
        cv2.rectangle(original_img, (left, top), (right, bottom), (255, 255, 255), -1)

    # Encode the redacted page as PNG bytes for Textract.
    encoded_ok, img_encode = cv2.imencode('.png', original_img)
    if not encoded_ok:
        raise RuntimeError(f"PNG encoding failed for page {page_number + 1}")
    img_bytes = img_encode.tobytes()

    # Analyze the redacted page using Textract (OCR).
    results_per_document.append(textract.detect_document_text(Document={'Bytes': img_bytes}))

In [10]:
# Show the raw Textract responses of (up to) the first ten processed pages.
# Each entry is the complete response dict (DocumentMetadata + Blocks), so this output is long.
results_per_document[:10]

[{'DocumentMetadata': {'Pages': 1},
  'Blocks': [{'BlockType': 'PAGE',
    'Geometry': {'BoundingBox': {'Width': 1.0,
      'Height': 0.9997862577438354,
      'Left': 0.0,
      'Top': 0.0},
     'Polygon': [{'X': 1.7318312555086277e-16, 'Y': 0.0},
      {'X': 1.0, 'Y': 0.0},
      {'X': 1.0, 'Y': 0.9997862577438354},
      {'X': 0.0, 'Y': 0.9997862577438354}]},
    'Id': '45a88f3e-21bb-4819-ad43-6d181854db3c',
    'Relationships': [{'Type': 'CHILD',
      'Ids': ['4670f8fd-8c15-4b41-a1d7-ad2bd4d4a7f2',
       '53141b20-7e61-432f-81b7-b08e2f6b9874',
       '3beef222-be40-43f3-943a-a8e50274fd82',
       'ffca9dbb-178a-4d84-8148-92bf4d371250',
       '3400ad61-627f-4b4c-a727-2a762f079fca',
       '68bda455-6fe5-4eeb-980a-9d26dffcb7ac',
       'fbe54eb5-004e-4d36-9f91-0a5d163c27f0',
       '8deca5e6-d359-4e9a-9b34-ea9870712c1e',
       '01cae4a8-ac16-4acd-91a0-f5be457817fb',
       '51cbab0f-5f53-43c0-a208-500a46dc81ac',
       '6e589afa-938f-455d-bee7-affcd5620884',
       'b1c0ad6f-eea