In [None]:
# This example partitions PDFs, draws labeled bounding boxes around the detected document elements, displays the annotated pages inline, and writes the annotated page images to S3.

import sycamore
from sycamore.data import Document
from sycamore.functions.document import split_and_convert_to_image, DrawBoxes
from sycamore.transforms.partition import SycamorePartitioner
from sycamore.utils.image_utils import image_page_filename_fn
from sycamore.utils.pdf_utils import show_pages
from pathlib import Path

# Start a Sycamore context for this example.
context = sycamore.init()

# Build a DocSet from the input PDFs and run the Sycamore Partitioner over it.
# Tune `threshold` (default is 0.4) or set `use_ocr=True` to enable OCR.
# Example document you can try:
#   s3://aryn-public/sycamore-partitioner-examples/document-example-1.pdf
partitioner = SycamorePartitioner(extract_table_structure=True, use_ocr=False, threshold=0.4)
ds = (
    context.read.binary(paths=["s3://my-bucket/my-input-folder"], binary_format="pdf")
    .partition(partitioner=partitioner)
)

# Render the partitioned pages inline in the notebook.
show_pages(ds)

# Save the visualized partitions for every page: convert the documents to
# page images, draw the element boxes (table cells included), and write the
# resulting files to the output location.
(
    ds.flat_map(split_and_convert_to_image)
    .map_batch(DrawBoxes, f_constructor_kwargs={"draw_table_cells": True})
    .write.files("s3://my-bucket/my-output-folder", filename_fn=image_page_filename_fn)
)

# Note: the `paths` argument of `context.read.binary` can point to an S3 or a local location. It can reference a folder containing multiple PDFs or a single PDF file.

In [None]:
# This example partitions the document, extracts images, and summarizes them using gpt-4-turbo. 
import sycamore
from sycamore.data import BoundingBox, Document, Element, TableElement
from sycamore.functions.document import split_and_convert_to_image, DrawBoxes
from sycamore.transforms.partition import SycamorePartitioner
from sycamore.transforms.summarize_images import SummarizeImages
from pathlib import Path

# Start a Sycamore context for this example.
context = sycamore.init()

# Build the pipeline and keep a handle on the resulting DocSet in `doc`:
# partition the example PDF with image extraction enabled, summarize the
# extracted images, explode the elements, and keep only the image entries.
doc = (
    context.read.binary(
        paths="s3://aryn-public/sycamore-partitioner-examples/document-example-1.pdf",
        binary_format="pdf",
    )
    .partition(partitioner=SycamorePartitioner(extract_images=True))
    # Summarize each image element.
    .transform(SummarizeImages)
    .explode()
    # Filter image elements to make it easier to see the summarized results.
    .filter(lambda d: d.type == "Image")
)

# Display the results as a separate step. Chaining `.show()` into the
# assignment above would bind `doc` to the return value of `show()` (None per
# the DocSet API - confirm against your Sycamore version) instead of the DocSet.
doc.show()