In [None]:
# This example draws bounding boxes and labels document elements, and it saves each page as an image file for demo purposes. 

import sycamore
from sycamore.data import BoundingBox, Document, Element, TableElement
from sycamore.functions.document import split_and_convert_to_image, DrawBoxes
from sycamore.transforms.partition import SycamorePartitioner
from pathlib import Path

# Naming each output image file.
def image_page_filename(doc: Document):
    """Build the output PNG filename for a single page-image document.

    Args:
        doc: Document whose ``properties`` holds ``"path"`` (location of the
            source file) and ``"page_number"``.

    Returns:
        ``"<base>_page_<page_number>.png"``, where ``<base>`` is the source
        file's name with its final extension removed.
    """
    # ``Path.stem`` drops only the last suffix ("a.tar.gz" -> "a.tar") and, unlike
    # splitting the name on ".", keeps the whole name when there is no extension
    # instead of collapsing it to an empty string.
    base_name = Path(doc.properties["path"]).stem
    page_num = doc.properties["page_number"]
    return f"{base_name}_page_{page_num}.png"

# This font is used for the labels in the visual representation.
font_path = "/usr/share/fonts/truetype/liberation/LiberationSans-Regular.ttf"


# Build a DocSet from the PDFs at the input path and run the Sycamore Partitioner on it.
# The threshold (default is 0.4) can be changed, and OCR can be enabled.
# Example document: s3://aryn-public/sycamore-partitioner-examples/document-example-1.pdf
context = sycamore.init()
ds = (
    context.read.binary(binary_format="pdf", paths=["s3://my-bucket/my-input-folder/"])
    .partition(
        partitioner=SycamorePartitioner(
            threshold=0.4,
            use_ocr=False,
            extract_table_structure=True,
        )
    )
    # Demo-only steps: draw the bounding boxes and create the label text.
    .flat_map(split_and_convert_to_image)
    .map_batch(
        DrawBoxes,
        f_constructor_args=[font_path],
        f_constructor_kwargs={"draw_table_cells": True},
    )
    # One output file per page, named by image_page_filename.
    .write.files("s3://my-bucket/my-output-folder/", filename_fn=image_page_filename)
)

# You can read from an S3 or local location. You can choose to read multiple PDFs from a folder, or specify just one PDF.

In [None]:
# This example draws bounding boxes and labels document elements, and it then displays the image as output. 

import sycamore
from sycamore.data import BoundingBox, Document, Element, TableElement
from sycamore.functions.document import split_and_convert_to_image, DrawBoxes
from sycamore.transforms.partition import SycamorePartitioner
from pathlib import Path
from PIL import Image
from io import BytesIO
from IPython.display import Image 

# This creates a DocSet and runs the Sycamore Partitioner. You can change the threshold (default is 0.4) or enable OCR.
# You can use this example document: s3://aryn-public/sycamore-partitioner-examples/document-example-1.pdf
# By this point in the cell, ``Image`` is bound to ``IPython.display.Image`` (it is
# imported after ``PIL.Image`` and replaces it), and that class has no ``open``
# method. PIL is therefore imported under its own alias for decoding.
from PIL import Image as PILImage

docs = (
    context.read.binary(paths=["s3://my-bucket/my-input-folder/"], binary_format="pdf")
    .partition(partitioner=SycamorePartitioner(extract_table_structure=True, use_ocr=False, threshold=0.4))
    # Demo-only steps: draw the bounding boxes and create the label text.
    .flat_map(split_and_convert_to_image)
    .map_batch(DrawBoxes, f_constructor_args=[font_path], f_constructor_kwargs={"draw_table_cells": True})
    # Collect the resulting documents so they can be iterated over below.
    .take_all()
)

# Decode each document's image bytes with PIL, then display the images one by one.
images = [PILImage.open(BytesIO(d.binary_representation)) for d in docs]

for image in images:
    display(image)

# You can read from an S3 or local location. You can choose to read multiple PDFs from a folder, or specify just one PDF.

In [None]:
# This example converts the DocSet to HTML and writes the tables to an HTML file.
import sycamore
from sycamore.data import BoundingBox, Document, Element, TableElement
from sycamore.functions.document import split_and_convert_to_image, DrawBoxes
from sycamore.transforms.partition import SycamorePartitioner
from pathlib import Path

# Function to convert DocSet elements to HTML.
def convert_to_html(doc: Document) -> Document:
    """Replace the text of every table element in ``doc`` with its HTML rendering.

    Elements that are not ``TableElement`` instances are left untouched. The
    document is modified in place.

    Args:
        doc: Document whose ``elements`` are scanned for tables.

    Returns:
        The same ``doc`` object, after its table elements have been updated.
    """
    for elem in doc.elements:
        if not isinstance(elem, TableElement):
            continue
        table = elem.table
        # Assumes ``table`` can be None when no table structure was extracted
        # (TODO confirm); such elements keep their existing text instead of
        # raising AttributeError on ``None.to_html()``.
        if table is None:
            continue
        elem.text_representation = table.to_html()
    return doc

# This creates a DocSet and runs the Sycamore Partitioner. You can change the threshold (default is 0.4) or enable OCR.
# You can use this example document: s3://aryn-public/sycamore-partitioner-examples/document-example-1.pdf
docs = (
    context.read.binary(binary_format="pdf", paths=["s3://my-bucket/my-input-folder/"])
    .partition(
        partitioner=SycamorePartitioner(
            threshold=0.4,
            use_ocr=False,
            extract_table_structure=True,
        )
    )
    # Convert the table elements to HTML.
    .map(convert_to_html)
    .explode()
    # Keep only the table elements. Other filters can be added here as well; for instance,
    # len(d["table"].cells) >= [number of cells] would also filter out smaller tables.
    .filter(lambda d: d.type == "table")
    # NOTE(review): every document is given the same filename, "my_tables.html" —
    # confirm how the writer handles several documents mapped to one name.
    .write.files("s3://my-bucket/my-output-folder/", filename_fn=lambda d: "my_tables.html")
)

# You can read from an S3 or local location. You can choose to read multiple PDFs from a folder, or specify just one PDF. This writes to
# a file called my_tables.html.