# Convert an Image-Only PDF into a Searchable PDF using Textract and MuPDF

Install the PyMuPDF library (the Python bindings for MuPDF)

In [None]:
! pip install PyMuPDF --upgrade

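Install the Amazon Textract caller convenience library (`amazon-textract-caller`), which is used together with the Textract response parser (TRP)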

In [None]:
!pip install amazon-textract-caller

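Replace \<YOUR_BUCKET_NAME\> with the name of your S3 bucket (the PDF is uploaded under its `temp/` prefix). <BR>
Upload your PDF next to this notebook and put its filename in \<YOUR INPUT PDF FILENAME\>. <BR>
Set the filename of the searchable PDF to be written in \<YOUR OUTPUT PDF FILENAME\>.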

In [None]:
# S3 location (bucket name plus the "temp" prefix) the input PDF is uploaded to
bucket_uri = "<YOUR_BUCKET_NAME>/temp"
# Local path of the image-only PDF to process; also used as the S3 object name
filename = "<YOUR INPUT PDF FILENAME>"
# Local path the searchable PDF is written to
output_name = "<YOUR OUTPUT PDF FILENAME>"

# Full S3 URI of the uploaded PDF, passed to Textract as the input document
s3_uri = f"s3://{bucket_uri}/{filename}"

Copy the local PDF to S3 so that Textract can read it

In [None]:
! aws s3 cp $filename $s3_uri

In [None]:
import json
import trp
from trp import Document
from textractcaller import call_textract

# Send the uploaded PDF to Amazon Textract through the caller helper and keep
# the raw JSON response around for inspection.
textract_json = call_textract(input_document=s3_uri)
# Wrap the response in the TRP document model, which exposes the pages and
# their lines used further down.
textract_doc = Document(textract_json)

In [None]:
import fitz
import re, string

# Open the original PDF in MuPDF; the recognised text is layered onto its pages
doc = fitz.open(filename)

# Debug aid: set draw_box to True to outline every Textract bounding box in red
draw_box = False
redbox_color = (1, 0, 0)

# Range of integer font sizes (in points) tried when fitting a line into its box
MAX_FONT_SIZE = 300
MIN_FONT_SIZE = 1


def _line_rect(page, bounding_box):
    """Scale a Textract bounding box to a rectangle in the page's coordinates.

    The box's left/top/width/height are multiplied by the MuPDF page width and
    height. The results are kept as floats: truncating them to integers shifts
    and shrinks the box and can collapse a small one to an empty rectangle.
    """
    page_width = page.rect.width
    page_height = page.rect.height
    return fitz.Rect(
        page_width * bounding_box.left,
        page_height * bounding_box.top,
        page_width * (bounding_box.left + bounding_box.width),
        page_height * (bounding_box.top + bounding_box.height),
    )


def _insert_invisible_text(page, rect, text):
    """Write `text` invisibly into `rect` with the largest font size that fits.

    insert_textbox returns a negative value, without writing anything, when the
    text does not fit, so integer sizes are tried from MAX_FONT_SIZE downwards
    until one is accepted. If even MIN_FONT_SIZE does not fit, the text is
    anchored at the bottom-left corner of the box at that size so the line
    still ends up in the searchable layer.

    Returns the font size that was used.
    """
    for font_size in range(MAX_FONT_SIZE, MIN_FONT_SIZE - 1, -1):
        # render_mode=3 writes the text without painting it, i.e. invisibly
        if page.insert_textbox(rect, text, fontsize=font_size, render_mode=3) >= 0:
            return font_size
    page.insert_text(rect.bl, text, fontsize=MIN_FONT_SIZE, render_mode=3)
    return MIN_FONT_SIZE


# Every PDF page needs a matching Textract page; fail early with a clear
# message instead of an IndexError part-way through the document.
textract_pages = textract_doc.pages
if len(textract_pages) < len(doc):
    raise ValueError(
        f"Textract returned {len(textract_pages)} page(s) but {filename} has {len(doc)}"
    )

# NOTE(review): page rotation is not taken into account here; confirm the
# placement on PDFs whose pages carry a non-zero rotation.
for page, textract_page in zip(doc, textract_pages):
    # Wrap the existing page contents once per page so that a coordinate
    # transformation left behind by the original content cannot displace the
    # inserted text.
    if not page.is_wrapped:
        page.wrap_contents()

    for line in textract_page.lines:
        text_rect = _line_rect(page, line.geometry.boundingBox)
        # A box without any area cannot hold text; skip it rather than hand an
        # empty rectangle to insert_textbox.
        if text_rect.is_empty:
            continue
        # Optionally outline the bounding box for visual debugging
        if draw_box:
            page.draw_rect(text_rect, color=redbox_color)
        _insert_invisible_text(page, text_rect, line.text)

# Save modified PDF
doc.save(output_name)