In [1]:
from PyPDF2 import PdfWriter, PdfReader
from pathlib import Path
from decimal import Decimal
import json
import os
import math
import fitz as pymupdf

# The path to the PDF file
pdf_path = Path("./documents/document_6.pdf")
# The path of the JSON file generated by AWS
json_path = Path("./aws/aws_output_6.json")

ModuleNotFoundError: No module named 'fitz'

In [2]:
class Box():
    def __init__(self,width,height,left,top,page_width,page_height):
        self.width = float(width)
        self.height = float(height)
        self.left = float(left)
        self.top = float(top)
        self.page_width = float(page_width)
        self.page_height = float(page_height)
        
        # In AWS textract, width,height,left,top are in %. They represent the % of the dimensions of the whole page.
        self.left = self.page_width * self.left
        self.top = self.page_height * self.top
        self.width = self.page_width * self.width
        self.height = self.page_height * self.height
        
        # Coordinates when the origin is the BOTTOM left
        self.lowerLeft = (int(self.left), int(self.page_height-self.top-self.height))
        self.upperRight = (int(self.left + self.width), int(self.page_height-self.top))
        
        # Coordinates when the origin is the TOP left
        self.x_tl = int(self.left) # X Top left
        self.y_tl = int(self.top) # Y Top left
        self.x_br = int(self.left + self.width) # X Bottom Right
        self.y_br = int(self.top + self.height) # Y Bottom Right
        self.x_bl = int(self.left) # X Bottom Right
        self.y_bl = int(self.top + self.height) # Y Bottom Right
        
        # self.lowerLeft = (int(math.ceil(self.left)), int(math.ceil(self.page_height-self.top-self.height)))
        # self.upperRight = (int(math.ceil(self.left + self.width)), int(math.ceil(self.page_height-self.top)))
        
    def top_midpoint(self):
        return (self.left+self.width/2,self.page_height-self.top)
    
    def bottom_midpoint(self):
        return (self.left+self.width/2,self.page_height-self.top-self.height)
        
    def __str__(self):
        return f"Box(width: {self.width}, Height: {self.height}, Left: {self.left}, Top: {self.top}, Page width: {self.page_width}, Page height: {self.page_height}, Lower left: {self.lowerLeft}, Upper right: {self.upperRight})"

In [3]:
def find_bounding_box(box_id,aws_data):
    """
    This function returns the bounding box with a given ID.
    We recall that every box is uniquely determined by its ID.
    bb = bounding box.
    """
    bb = dict() # Dictionnary to contain the Bounding Box.
    for i in range(len(aws_data["Blocks"])):
        if aws_data["Blocks"][i]['Id'] == box_id:
            #print("ID FOUND!")
            bb = aws_data["Blocks"][i]['Geometry']['BoundingBox']
            break
    else:
        print("The ID given doesn't correspond to any box.")
        
    return bb

In [4]:
#############################################
""" Generate a PDF with boxes around text """ 
#############################################

f = pymupdf.open(pdf_path)

for i in range(f.page_count):
    
    page = f[0]
    #page = f.new_page(-1)
    page.wrap_contents()
    page_width = page.rect.width
    page_height = page.rect.height
    
    try:
        with open(json_path, 'r', encoding = "utf-8") as aws_output:
            # Convert JSON file to dictionary in Python
            aws_data = json.load(aws_output)
        
            box_text_id = [] 
        
            for i in range(len(aws_data["Blocks"])):
                if "Text" in aws_data["Blocks"][i] and aws_data["Blocks"][i]["BlockType"] == "WORD":
                    box_text_id.append(aws_data["Blocks"][i]['Id'])
                
        
    except:
        print("A problem reading the JSON file has been detected.")

    for id in box_text_id:
        bounding_box = find_bounding_box(id,aws_data)
        box = Box(bounding_box['Width'],bounding_box['Height'],bounding_box['Left'],bounding_box['Top'],page_width,page_height)
    
        rect = pymupdf.Rect(box.x_tl, box.y_tl, box.x_br, box.y_br)
        page.draw_rect( rect, color=[1,0,0], width=1)



##############
""" OUTPUT """
##############

f.save("./output/text-from-pdf-with-boxes.pdf")
f.close()

NameError: name 'pymupdf' is not defined

In [49]:
################################
""" Generate a PDF with text """ 
################################

doc = pymupdf.open()
page = doc.new_page()
text_writer = pymupdf.TextWriter(page.rect)
f = pymupdf.open(doc)

for i in range(len(aws_data["Blocks"])):
    if aws_data["Blocks"][i]['Id'] in box_text_id:
        bounding_box = find_bounding_box(aws_data["Blocks"][i]['Id'],aws_data)
        box = Box(bounding_box['Width'],bounding_box['Height'],bounding_box['Left'],bounding_box['Top'],page_width,page_height)
        coord = pymupdf.Point(box.x_bl,box.y_bl)
        # insert_text(point,...), the point is in the bottom-left position of the first character of text in pixels.
        page.insert_text(coord, aws_data["Blocks"][i]['Text'], fontsize=6)
            
        rect = pymupdf.Rect(box.x_tl, box.y_tl, box.x_br, box.y_br)
        #page.draw_rect( rect, color=[1,0,0], width=1, fill_opacity=0.5, fill=[1,0,0])
        page.draw_rect( rect, stroke_opacity=0)

##############
""" OUTPUT """
##############        
doc.save("./output/text-from-pdf.pdf")
doc.close()


In [50]:
######################################
""" Generate a PDF with accuracies """ 
######################################

doc = pymupdf.open()
page = doc.new_page()
text_writer = pymupdf.TextWriter(page.rect)
f = pymupdf.open(doc)

for i in range(len(aws_data["Blocks"])):
    if aws_data["Blocks"][i]['Id'] in box_text_id:
        bounding_box = find_bounding_box(aws_data["Blocks"][i]['Id'],aws_data)
        box = Box(bounding_box['Width'],bounding_box['Height'],bounding_box['Left'],bounding_box['Top'],page_width,page_height)
        coord = pymupdf.Point(box.x_bl,box.y_bl)
        # insert_text(point,...), the point is in the bottom-left position of the first character of text in pixels.
        page.insert_text(coord, aws_data["Blocks"][i]['Text'], fontsize=6)
        if aws_data["Blocks"][i]["Confidence"] > 90:
            color = [0,1,0]
            fill_opacity = 0.5
        elif aws_data["Blocks"][i]["Confidence"] <= 90 and aws_data["Blocks"][i]["Confidence"] > 80:
            color = [0,1,0]
            fill_opacity = 0.2
        elif aws_data["Blocks"][i]["Confidence"] <= 80 and aws_data["Blocks"][i]["Confidence"] > 70:
            color = [1,1,0]
            fill_opacity=0.5
        elif aws_data["Blocks"][i]["Confidence"] <= 70 and aws_data["Blocks"][i]["Confidence"] > 60:
            color = [1,1,0]
            fill_opacity=0.2
        elif aws_data["Blocks"][i]["Confidence"] <= 60 and aws_data["Blocks"][i]["Confidence"] > 50:
            color = [1,0.64,0]
            fill_opacity=0.2
        elif aws_data["Blocks"][i]["Confidence"] <= 50 and aws_data["Blocks"][i]["Confidence"] > 40:
            color = [1,0.64,0]
            fill_opacity=0.5
        elif aws_data["Blocks"][i]["Confidence"] <= 40 and aws_data["Blocks"][i]["Confidence"] > 30:
            color = [1,0,0]
            fill_opacity=0.2
        elif aws_data["Blocks"][i]["Confidence"] <= 30 and aws_data["Blocks"][i]["Confidence"] > 20:
            color = [1,0,0]
            fill_opacity=0.5
        elif aws_data["Blocks"][i]["Confidence"] <= 20 and aws_data["Blocks"][i]["Confidence"] > 10:
            color = [1,0,0]
            fill_opacity=0.7
        else:
            color = [1,1,0]
            fill_opacity=0.9
        
        rect = pymupdf.Rect(box.x_tl, box.y_tl, box.x_br, box.y_br)
        #page.draw_rect( rect, color=[1,0,0], width=1, fill_opacity=0.5, fill=[1,0,0])
        page.draw_rect( rect, fill_opacity=fill_opacity, stroke_opacity=0 , fill=color)

# Draw the boxes representing the accuracy
rect_90 = pymupdf.Rect(page_width -100, 20, page_width -60, 32)
page.draw_rect( rect_90, fill_opacity=0.5, stroke_opacity=0 , fill=[0,1,0])
coord_90 = pymupdf.Point(page_width -86,28)
page.insert_text(coord_90, ">90%", fontsize=6)

rect_80_90 = pymupdf.Rect(page_width -140, 20, page_width -100, 32)
page.draw_rect( rect_80_90, fill_opacity=0.2, stroke_opacity=0 , fill=[0,1,0])
coord_80_90 = pymupdf.Point(page_width -135,28)
page.insert_text(coord_80_90, "80% - 90%", fontsize=6)

rect_70_80 = pymupdf.Rect(page_width -180, 20, page_width -140, 32)
page.draw_rect( rect_70_80, fill_opacity=0.5, stroke_opacity=0 , fill=[1,1,0])
coord_70_80 = pymupdf.Point(page_width -174,28)
page.insert_text(coord_70_80, "70% - 80%", fontsize=6)

rect_60_70 = pymupdf.Rect(page_width -220, 20, page_width -180, 32)
page.draw_rect( rect_60_70, fill_opacity=0.2, stroke_opacity=0 , fill=[1,1,0])
coord_60_70 = pymupdf.Point(page_width -213,28)
page.insert_text(coord_60_70, "60% - 70%", fontsize=6)

rect_50_60 = pymupdf.Rect(page_width -260, 20, page_width -220, 32)
page.draw_rect( rect_50_60, fill_opacity=0.2, stroke_opacity=0 , fill=[1,0.64,0])
coord_50_60 = pymupdf.Point(page_width -254,28)
page.insert_text(coord_50_60, "50% - 60%", fontsize=6)

rect_40_50 = pymupdf.Rect(page_width -300, 20, page_width -260, 32)
page.draw_rect( rect_40_50, fill_opacity=0.5, stroke_opacity=0 , fill=[1,0.64,0])
coord_40_50 = pymupdf.Point(page_width -294,28)
page.insert_text(coord_40_50, "40% - 50%", fontsize=6)

rect_30_40 = pymupdf.Rect(page_width -340, 20, page_width -300, 32)
page.draw_rect( rect_30_40, fill_opacity=0.2, stroke_opacity=0 , fill=[1,0,0])
coord_30_40 = pymupdf.Point(page_width -334,28)
page.insert_text(coord_30_40, "30% - 40%", fontsize=6)

rect_20_30 = pymupdf.Rect(page_width -380, 20, page_width -340, 32)
page.draw_rect( rect_20_30, fill_opacity=0.4, stroke_opacity=0 , fill=[1,0,0])
coord_20_30 = pymupdf.Point(page_width -374,28)
page.insert_text(coord_20_30, "20% - 30%", fontsize=6)

rect_10_20 = pymupdf.Rect(page_width -420, 20, page_width -380, 32)
page.draw_rect( rect_10_20, fill_opacity=0.6, stroke_opacity=0 , fill=[1,0,0])
coord_10_20 = pymupdf.Point(page_width -414,28)
page.insert_text(coord_10_20, "10% - 20%", fontsize=6)

rect_10 = pymupdf.Rect(page_width -460, 20, page_width -420, 32)
page.draw_rect( rect_10, fill_opacity=0.9, stroke_opacity=0 , fill=[1,0,0])
coord_10 = pymupdf.Point(page_width -447,28)
page.insert_text(coord_10, "<10%", fontsize=6)

coord_acc = pymupdf.Point(page_width -495,28)
page.insert_text(coord_acc, "Accuracy :", fontsize=6)

##############
""" OUTPUT """
##############
doc.save("./output/text-from-pdf-with-accuracy.pdf")
doc.close()
