In [None]:
# Load IPython's autoreload extension and select mode 2, which re-imports
# modules automatically before each cell is executed.
%load_ext autoreload
%autoreload 2

In [None]:
from sycamore.utils.aryn_config import ArynConfig

# Sample PDF used by the cells below; the path is relative, so it is resolved
# against the notebook's working directory.
doc_path = "../lib/sycamore/sycamore/tests/resources/data/pdfs/Transformer.pdf"

# Debug aid: uncomment to check which API key is picked up. Be careful: printing it
# will put your key in the Jupyter notebook.
#print(ArynConfig.get_aryn_api_key())

In [None]:
# Direct use of the PDF partitioner: partition one file and print whatever comes back.
import logging
import os

from sycamore.utils.aryn_config import ArynConfig
from sycamore.transforms.detr_partitioner import ArynPDFPartitioner

# Let INFO-level records through the root logger.
root_logger = logging.getLogger()
root_logger.setLevel(logging.INFO)

pdf_partitioner = ArynPDFPartitioner(model_name_or_path=None)
with open(doc_path, mode="rb") as pdf_stream:
    # The API key is looked up through ArynConfig at call time.
    partition_result = pdf_partitioner.partition_pdf(
        pdf_stream,
        aryn_api_key=ArynConfig.get_aryn_api_key(),
    )
print(partition_result)

In [None]:
# This example draws bounding boxes and labels document elements, displays the annotated
# pages inline, and writes the annotated page images to a local directory.

import ray
import sycamore
from sycamore.data import Document
from sycamore.functions.document import split_and_convert_to_image, DrawBoxes
from sycamore.transforms.partition import ArynPartitioner
from sycamore.utils.image_utils import image_page_filename_fn
from sycamore.utils.pdf_utils import show_pages
from sycamore.utils.aryn_config import ArynConfig
from pathlib import Path
import os
import logging

# Local directory that receives the annotated page images.
output_dir = "/tmp/example"

sycamore.shutdown() # auto-reload can make this necessary
context = sycamore.init()

# This creates a DocSet and runs the partitioner. You can change the threshold (default is 0.4) or enable OCR.
# You can read from an S3 or local location. You can choose to read multiple PDFs from a folder, or specify just one PDF.
# You can use this example document: s3://aryn-public/sycamore-partitioner-examples/document-example-1.pdf
ds = (
    context.read.binary(paths=[doc_path], binary_format="pdf")
    .partition(
        partitioner=ArynPartitioner(
            model_name_or_path=None,
            extract_table_structure=True,
            use_ocr=False,
            threshold=0.4,
        )
    )
)

# This visualizes partitions inline in the notebook.
show_pages(ds)

# Make sure the output directory exists before the pipeline writes into it.
os.makedirs(output_dir, exist_ok=True)
# To save the visualized partitions for every page, you can use the following transforms.
(
    ds.flat_map(split_and_convert_to_image)
    .map_batch(DrawBoxes, f_constructor_kwargs={"draw_table_cells": True})
    .write.files(output_dir, filename_fn=image_page_filename_fn)
)

In [None]:
# This example partitions the document, extracts images, and summarizes them using gpt-4-turbo.
import ray
import sycamore
from sycamore.data import BoundingBox, Document, Element, TableElement
from sycamore.functions.document import split_and_convert_to_image, DrawBoxes
from sycamore.transforms.partition import SycamorePartitioner
from sycamore.transforms.summarize_images import SummarizeImages
from pathlib import Path

context = sycamore.init()

# Keep the DocSet itself in `doc` (rather than the return value of .show()) so it can be
# inspected or extended after it has been displayed.
doc = (
    context.read.binary(
        paths="s3://aryn-public/sycamore-partitioner-examples/document-example-1.pdf",
        binary_format="pdf",
    )
    .partition(partitioner=SycamorePartitioner(extract_images=True))
    # Summarize each image element.
    .transform(SummarizeImages)
    .explode()
    # Filter image elements to make it easier to see the summarized results.
    .filter(lambda d: d.type == "Image")
)

# Display the filtered documents.
doc.show()