In [31]:
pip install reportlab -q

In [None]:
import json
from reportlab.lib.pagesizes import A4
from reportlab.pdfgen import canvas
from reportlab.lib.units import mm

# File locations: the OCR JSON that will be read and the PDF that will be written
input_file = '/content/document.json'
output_file = '/content/document.pdf'

def _page_scale(page, page_width, page_height):
    """Return ``(scale_x, scale_y)`` mapping OCR page units to PDF points.

    Either factor is ``None`` when the page has no usable (non-zero)
    ``dimension`` entry, so callers can skip pixel-based geometry instead of
    dividing by zero.
    """
    dimension = page.get('dimension') or {}
    source_width = dimension.get('width')
    source_height = dimension.get('height')
    scale_x = page_width / source_width if source_width else None
    scale_y = page_height / source_height if source_height else None
    return scale_x, scale_y


def _block_origin(layout, page_width, page_height, scale_x, scale_y):
    """Return ``(x, y, width)`` in PDF points for a block, or ``None``.

    ``x``/``y`` is the block's first vertex (or box left/top) with the y-axis
    flipped, because PDF coordinates grow upwards from the bottom of the page.
    ``None`` means no usable geometry was found.
    """
    have_scale = scale_x is not None and scale_y is not None

    if 'boundingBox' in layout:
        if not have_scale:
            return None
        bbox = layout['boundingBox']
        return (
            bbox.get('left', 0) * scale_x,
            page_height - (bbox.get('top', 0) * scale_y),
            bbox.get('width', 0) * scale_x,
        )

    poly = layout.get('boundingPoly') or {}

    # Absolute vertices take precedence, as long as they can be scaled.
    vertices = poly.get('vertices') or []
    if len(vertices) >= 2 and have_scale:
        return (
            vertices[0].get('x', 0) * scale_x,
            page_height - (vertices[0].get('y', 0) * scale_y),
            (vertices[1].get('x', 0) - vertices[0].get('x', 0)) * scale_x,
        )

    # Normalized vertices are fractions of the page, so no scale is needed.
    normalized = poly.get('normalizedVertices') or []
    if len(normalized) >= 2:
        return (
            normalized[0].get('x', 0) * page_width,
            page_height - (normalized[0].get('y', 0) * page_height),
            (normalized[1].get('x', 0) - normalized[0].get('x', 0)) * page_width,
        )

    return None


def _anchor_text(layout, document_text):
    """Return the text referenced by the block's text anchor.

    All text segments are concatenated in order. Returns ``None`` when the
    block has no text segments at all (nothing to draw).
    """
    segments = (layout.get('textAnchor') or {}).get('textSegments') or []
    if not segments:
        return None
    return ''.join(
        document_text[int(seg.get('startIndex', 0)):int(seg.get('endIndex', 0))]
        for seg in segments
    )


def create_pdf_from_ocr(json_path, output_pdf):
    """Render the text blocks of an OCR JSON document into an A4 PDF.

    The JSON is expected to hold the full document text under ``'text'`` and a
    list of ``'pages'``, each with ``'blocks'`` whose ``'layout'`` carries a
    bounding box/polygon and a text anchor (start/end indices into the full
    text). Every block is drawn at its scaled position in 10 pt Helvetica,
    word-wrapped to the block width.

    Args:
        json_path: Path of the OCR JSON file to read (UTF-8).
        output_pdf: Path of the PDF file to write.

    Raises:
        KeyError: If the JSON has no ``'pages'`` entry.
        OSError: If ``json_path`` cannot be opened.
    """
    with open(json_path, 'r', encoding='utf-8') as f:
        ocr_data = json.load(f)

    page_width, page_height = A4
    c = canvas.Canvas(output_pdf, pagesize=A4)

    document_text = ocr_data.get('text', '')

    # Default font size (adjust as needed)
    font_size = 10
    line_height = font_size * 1.2

    for page in ocr_data['pages']:
        # showPage() discards the canvas state, so the font has to be applied
        # again at the start of every output page, not just once up front.
        c.setFont("Helvetica", font_size)

        # Scale per page: OCR pages are not guaranteed to share one size.
        scale_x, scale_y = _page_scale(page, page_width, page_height)

        for block in page.get('blocks', []):
            layout = block.get('layout') or {}

            origin = _block_origin(layout, page_width, page_height, scale_x, scale_y)
            if origin is None:
                print(f"Warning: Unable to determine bounding box for block: {block}")
                continue
            start_x, start_y, block_width = origin

            text_content = _anchor_text(layout, document_text)
            if text_content is None:
                continue

            # NOTE(review): this replaces the two characters '/n' (forward
            # slash + n), not an escaped newline; json.load has already turned
            # "\n" escapes into real newlines. Kept as-is, but it will also
            # split legitimate text such as URLs containing "/n" -- confirm
            # that the input really contains literal '/n' markers.
            text_content = text_content.replace('/n', '\n')

            for line in text_content.split('\n'):
                # Word-wrap the line to fit within the block width
                for wrapped_line in wrap_text(c, line, block_width, font_size):
                    c.drawString(start_x, start_y, wrapped_line)
                    start_y -= line_height

                    # Out of vertical space: continue at the top of a new page
                    if start_y < font_size:
                        c.showPage()
                        c.setFont("Helvetica", font_size)
                        start_y = page_height - font_size

        # Finish the current page
        c.showPage()

    c.save()

def wrap_text(canvas, text, max_width, font_size, font_name="Helvetica"):
    """Split ``text`` into lines that fit within ``max_width``.

    Words are separated on single spaces and packed greedily: a word joins the
    current line while the measured width of the line stays strictly below
    ``max_width``. A single word wider than ``max_width`` is not broken; it is
    placed on a line of its own. Leading/trailing spaces of each line are
    stripped.

    Args:
        canvas: Object providing ``stringWidth(text, font_name, font_size)``.
        text: The text to wrap (expected to contain no newlines).
        max_width: Maximum line width, in the units ``stringWidth`` returns.
        font_size: Font size used for measuring.
        font_name: Font used for measuring; defaults to ``"Helvetica"``.

    Returns:
        A list of lines. Empty ``text`` yields ``[""]`` so that blank input
        lines still occupy one output line.
    """
    lines = []
    current_line = ""

    for word in text.split(' '):
        candidate = current_line + word
        if canvas.stringWidth(candidate, font_name, font_size) < max_width:
            current_line = candidate + " "
            continue

        # The word does not fit. Flush what has been collected so far, but
        # only if there is something to flush: when the very first word is
        # too wide there is no previous line, and emitting "" here would
        # insert a spurious blank line before the text.
        flushed = current_line.strip()
        if flushed:
            lines.append(flushed)
        current_line = word + " "

    # Append the last line
    if current_line:
        lines.append(current_line.strip())

    return lines

# Run the conversion on input_file/output_file, then report where the PDF was written
create_pdf_from_ocr(input_file, output_file)

print(f"PDF created at {output_file}")

PDF created at /content/document.pdf
