In [None]:
# Load IPython's autoreload extension; mode 2 reloads imported modules before each cell
# runs, so edits to library code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

##### In this example, we will write the output of Sycamore job from a pdf in s3 bucket to a target location and extract images, and summarize them using gpt-4-turbo. 

##### The Aryn Partitioner in this job is configured to use the Aryn Partitioning Service to provide fast, GPU-powered performance. Go to [aryn.ai/sign-up ](aryn.ai/sign-up) to get a free API key for the service. This is the recommended configuration.

##### You can also run the Aryn Partitioner locally by setting `use_partitioning_service` to `False`. Though you can use CPU to run the Aryn Partitioner, it is recommended to use an NVIDIA GPU for good performance.

In [None]:
from sycamore.utils.aryn_config import ArynConfig, _DEFAULT_PATH
# Fail fast when no Aryn API key is configured: the check treats an empty string from
# get_aryn_api_key() as "not found" and reports _DEFAULT_PATH as the location searched.
assert ArynConfig.get_aryn_api_key() != "", f"Unable to find aryn API key.  Looked in {_DEFAULT_PATH}"

if the above assertion fails, you can either set the environment variable ARYN_API_KEY and restart jupyter
or make a yaml file at the specified path in the assertion error that looks like:

```
aryn_token: "YOUR-ARYN-API-KEY"
```

It is unsafe, but if neither of those options work, you can put it in this notebook with
```
import os
os.environ["ARYN_API_KEY"] = "UNSAFE-ARYN-API-KEY-LOCATION" 
```

but beware that it is easy to accidentally commit the notebook file and have it include your key.

In [None]:
# Local PDF that the next cell opens and sends to the partitioner; the relative path is
# resolved against the notebook's working directory.
doc_path = "../lib/sycamore/sycamore/tests/resources/data/pdfs/Transformer.pdf"

# Debug aid: uncomment to check which key was loaded. Be careful: the printed key is then
# stored in the notebook's output, so clear that output before committing or sharing.
#print(ArynConfig.get_aryn_api_key())

In [None]:
# This example calls the parititoner directly and prints the output
import logging
import os

from sycamore.utils.aryn_config import ArynConfig
from sycamore.transforms.detr_partitioner import ArynPDFPartitioner

# Raise the root logger to INFO so log messages emitted during partitioning are shown.
logging.getLogger().setLevel(logging.INFO)

# Construct the partitioner with no model path, then pass it the PDF as a binary stream
# together with the configured Aryn API key.
pdf_partitioner = ArynPDFPartitioner(model_name_or_path=None)
with open(doc_path, "rb") as pdf_stream:
    partitioned = pdf_partitioner.partition_pdf(pdf_stream, aryn_api_key=ArynConfig.get_aryn_api_key())

# The file is no longer needed once partitioning has returned; print the result.
print(partitioned)

In [None]:
# This example draws bounding boxes and labels document elements, and it then displays the image as output and writes them to S3. 

import ray
import sycamore
from sycamore.data import Document
from sycamore.functions.document import split_and_convert_to_image, DrawBoxes
from sycamore.transforms.partition import ArynPartitioner
from sycamore.utils.image_utils import image_page_filename_fn
from sycamore.utils.pdf_utils import show_pages
from sycamore.utils.aryn_config import ArynConfig
from pathlib import Path
import os
import logging

# Shut down any existing Sycamore state before initializing (auto-reload can make this necessary).
sycamore.shutdown()
context = sycamore.init()

# Create a DocSet from the example document and run the Sycamore Partitioner on it.
# You can change the threshold (default is 0.4) or enable OCR.
# You can read from an S3 or local location, and either point at a folder of PDFs or
# specify just one PDF.
example_pdf_paths = ["s3://aryn-public/sycamore-partitioner-examples/document-example-1.pdf"]
table_partitioner = ArynPartitioner(extract_table_structure=True)
ds = context.read.binary(paths=example_pdf_paths, binary_format="pdf").partition(partitioner=table_partitioner)

# Visualize the partitions inline in the notebook.
show_pages(ds)

# To save the visualized partitions for every page, apply the transforms below and write
# the resulting images into the output directory.
output_dir = "/tmp/example"
os.makedirs(output_dir, exist_ok=True)
page_images = ds.flat_map(split_and_convert_to_image)
annotated_pages = page_images.map_batch(DrawBoxes, f_constructor_kwargs={"draw_table_cells": True})
annotated_pages.write.files(output_dir, filename_fn=image_page_filename_fn)

In [None]:
# This example partitions the document, extracts images, and summarizes them using gpt-4-turbo. 
import ray
import sycamore
from sycamore.data import BoundingBox, Document, Element, TableElement
from sycamore.functions.document import split_and_convert_to_image, DrawBoxes
from sycamore.transforms.partition import ArynPartitioner
from sycamore.transforms.summarize_images import SummarizeImages
from pathlib import Path

context = sycamore.init()

# Build the pipeline and keep the resulting DocSet in `doc`, so it stays available for
# further transforms. `.show()` is called separately below because it is used only for
# its printed output; assigning its result to `doc` would not keep the DocSet.
# NOTE(review): SummarizeImages presumably calls an OpenAI model (gpt-4-turbo per the cell
# header) and so needs OpenAI credentials in the environment — confirm.
doc = (
    context.read.binary(
        paths="s3://aryn-public/sycamore-partitioner-examples/document-example-1.pdf",
        binary_format="pdf",
    )
    .partition(partitioner=ArynPartitioner(extract_images=True))
    # Summarize each image element.
    .transform(SummarizeImages)
    .explode()
    # Filter image elements to make it easier to see the summarized results.
    .filter(lambda d: d.type == "Image")
)

# Execute the pipeline and print the summarized image documents.
doc.show()