In [8]:
import os
from pathlib import Path

import sycamore
from sycamore.data import BoundingBox, Document, Element, TableElement
from sycamore.functions.document import split_and_convert_to_image, DrawBoxes
from sycamore.transforms.partition import ArynPartitioner
from sycamore.transforms.summarize_images import SummarizeImages

# Sycamore context; the pipelines in the cells below start from context.read.
context = sycamore.init()

### Time taken to extract and summarize images with the ArynPartitioner running remotely

##### The Aryn Partitioner is configured to use the Aryn Partitioning Service, which provides fast, GPU-powered performance. Go to [aryn.ai/sign-up](https://aryn.ai/sign-up) to get a free API key for the service. You can also run the Aryn Partitioner locally by changing `use_partitioning_service` to `False`. Although the Aryn Partitioner can run on a CPU, an NVIDIA GPU is recommended for good performance.


Set `aryn_api_key` below to your own API key.

In [9]:
# API key passed to ArynPartitioner in the remote example.
# Read it from the ARYN_API_KEY environment variable so a real key never has
# to be typed into (and saved with) the notebook; the original placeholder
# string is kept as the fallback when the variable is not set.
aryn_api_key = os.environ.get("ARYN_API_KEY", "aryn_api_key")

In [6]:
%%time 


doc = (context.read.binary(paths="s3://aryn-public/sycamore-partitioner-examples/document-example-1.pdf", binary_format="pdf")
                .partition(partitioner=ArynPartitioner(extract_images=True,aryn_api_key=aryn_api_key ))
# Summarize each image element.
                .transform(SummarizeImages)
                .explode()
# Filter image elements to make it easier to see the summarized results.
                .filter(lambda d: d.type == "Image")
                .show())

2024-07-11 20:10:18,600	INFO set_read_parallelism.py:115 -- Using autodetected parallelism=8 for stage ReadBinary to satisfy parallelism at least twice the available number of CPUs (4).
2024-07-11 20:10:18,601	INFO set_read_parallelism.py:122 -- To satisfy the requested parallelism of 8, each read task output is split into 8 smaller blocks.
2024-07-11 20:10:18,602	INFO streaming_executor.py:112 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[ReadBinary] -> TaskPoolMapOperator[Map(BinaryScan._to_document)->MapBatches(_wrap)->MapBatches(summarize_all_images)->MapBatches(explode)->MapBatches(<lambda>)]
2024-07-11 20:10:18,603	INFO streaming_executor.py:113 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), exclude_resources=ExecutionResources(cpu=0, gpu=0, object_store_memory=0), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
2024-07-11 20:10:18,604	I

Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

{'type': 'Image',
 'bbox': [0.08557756311753217,
          0.06654054121537642,
          0.30315767176011027,
          0.10732177040793679],
 'properties': {'score': 0.7591715455055237,
                'image_size': [390, 110],
                'image_mode': 'RGB',
                'image_format': None,
                'page_number': 1,
                'summary': {'is_graph': False,
                            'summary': 'The image is a logo for the '
                                       "'Artificial Intelligence Index Report "
                                       "2023.' It features the acronym 'AI' in "
                                       'a stylized design, accompanied by the '
                                       'full title of the report. The logo is '
                                       'likely used to brand the report and '
                                       'indicate its focus on artificial '
                                       'intelligence.'}},
 'binary_rep

### Time taken to extract and summarize images with the ArynPartitioner running locally



##### The Aryn Partitioner is configured to run locally. Although it can run on a CPU, an NVIDIA GPU is recommended for good performance. You can also choose to use the Aryn Partitioning Service, which provides fast, GPU-powered performance. Go to [aryn.ai/sign-up](https://aryn.ai/sign-up) to get a free API key for the service, set `use_partitioning_service` to `True`, and provide `aryn_api_key=your-api-key` as an additional option.
 

In [7]:

%%time 


doc = (context.read.binary(paths="s3://aryn-public/sycamore-partitioner-examples/document-example-1.pdf", binary_format="pdf")
                .partition(partitioner=ArynPartitioner(extract_images=True, local=True ))
# Summarize each image element.
                .transform(SummarizeImages)
                .explode()
# Filter image elements to make it easier to see the summarized results.
                .filter(lambda d: d.type == "Image")
                .show())

ERROR:root:Unable to load aryn config /home/ec2-user/.aryn/config.yaml: [Errno 2] No such file or directory: '/home/ec2-user/.aryn/config.yaml'
2024-07-11 20:10:57,824	INFO set_read_parallelism.py:115 -- Using autodetected parallelism=8 for stage ReadBinary to satisfy parallelism at least twice the available number of CPUs (4).
2024-07-11 20:10:57,825	INFO set_read_parallelism.py:122 -- To satisfy the requested parallelism of 8, each read task output is split into 8 smaller blocks.
2024-07-11 20:10:57,826	INFO streaming_executor.py:112 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[ReadBinary] -> TaskPoolMapOperator[Map(BinaryScan._to_document)] -> ActorPoolMapOperator[MapBatches(BaseMapTransformCallable___wrap)] -> TaskPoolMapOperator[MapBatches(summarize_all_images)->MapBatches(explode)->MapBatches(<lambda>)]
2024-07-11 20:10:57,826	INFO streaming_executor.py:113 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_

Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

{'type': 'Image',
 'bbox': (0.08557390100815716,
          0.06654155384410511,
          0.3031622673483456,
          0.10732093117453835),
 'properties': {'score': 0.758800745010376,
                'image_size': (390, 110),
                'image_mode': 'RGB',
                'image_format': None,
                'page_number': 1,
                'summary': {'is_graph': False,
                            'summary': 'The image is a logo for the '
                                       "'Artificial Intelligence Index Report "
                                       "2023'. It features the acronym 'AI' in "
                                       'stylized text, with the full title of '
                                       'the report written next to it. The '
                                       'logo is likely used to brand the '
                                       'report and does not contain any '
                                       'graphical data or axes.'}},
 'text_repr