##### The Aryn Partitioner is configured to use the Aryn Partitioning Service to provide fast, GPU-powered performance. Go to [aryn.ai/sign-up](https://aryn.ai/sign-up) to get a free API key for the service. You can also run the Aryn Partitioner locally by changing `use_partitioning_service` to `False`. Although the Aryn Partitioner can run on a CPU, an NVIDIA GPU is recommended for good performance.


##### In this example, we will use Sycamore to process PDF documents read from S3 and write the output to target vector and keyword indexes in OpenSearch.


In [1]:
from sycamore.data import Document
from sycamore.functions import HuggingFaceTokenizer
from sycamore.llms import OpenAI, OpenAIModels
from sycamore.transforms.extract_schema import OpenAISchemaExtractor, OpenAIPropertyExtractor
from sycamore.transforms.extract_entity import OpenAIEntityExtractor
from sycamore.transforms.merge_elements import GreedyTextElementMerger
from sycamore.transforms.partition import ArynPartitioner
import sycamore

Set `aryn_api_key` to your own Aryn API key.

In [2]:
# Read the Aryn API key from the environment instead of hard-coding it: a key
# committed in a notebook is exposed to everyone who can read the file.
# Export ARYN_API_KEY before starting Jupyter, or substitute the placeholder
# below for a one-off local run (and do not commit the result).
# NOTE(review): a literal key was previously stored on this line; it should be
# treated as leaked and revoked.
import os

aryn_api_key = os.environ.get("ARYN_API_KEY", "<YOUR_ARYN_API_KEY>")

In [3]:
from dateutil import parser

def convert_timestamp(doc: Document) -> Document:
    """Add normalized date fields derived from the extracted accident timestamp.

    Reads ``dateAndTime`` (falling back to ``dateTime``) from
    ``doc.properties['entity']``, strips the word "Local", parses the rest
    with dateutil's fuzzy parser, and writes two keys back into the same dict:

    * ``day`` -- the calendar date in ISO format.
    * ``isoDateTime`` -- the full timestamp in ISO format. When the parsed
      value carries no UTC offset, a trailing "Z" is appended.

    Args:
        doc: Document whose ``properties['entity']`` holds the extracted fields.

    Returns:
        The same ``doc`` object. It is returned unmodified when there is no
        entity dict or neither date field holds a non-blank string.

    Raises:
        Exceptions raised by ``dateutil.parser.parse`` for text it cannot
        interpret as a date are propagated to the caller.
    """
    entity = doc.properties.get('entity')
    if not entity:
        return doc

    raw_date = entity.get('dateAndTime') or entity.get('dateTime')
    # The extracted value may be present but null or a non-string; there is
    # nothing to parse in that case, so leave the document untouched.
    if not isinstance(raw_date, str) or not raw_date.strip():
        return doc

    parsed_date = parser.parse(raw_date.replace("Local", ""), fuzzy=True)
    entity['day'] = parsed_date.date().isoformat()

    # Compare against None rather than relying on truthiness: a UTC offset is
    # timedelta(0), which is falsy, and isoformat() already renders it as
    # "+00:00" -- appending "Z" as well would produce an invalid timestamp.
    if parsed_date.utcoffset() is not None:
        entity['isoDateTime'] = parsed_date.isoformat()
    else:
        entity['isoDateTime'] = parsed_date.isoformat() + "Z"

    return doc

### Data loading

In [4]:
# S3 prefix holding the input files; they are read as PDFs in the next cell.
s3_path = "s3://aryn-public/ntsb/"
# LLM client passed to the schema and property extractors further down.
llm = OpenAI(OpenAIModels.GPT_3_5_TURBO.value)
# Tokenizer handed to GreedyTextElementMerger when merging elements.
tokenizer = HuggingFaceTokenizer("thenlper/gte-small")

# Sycamore context; `ctx.read` is the starting point for building the DocSet.
ctx = sycamore.init()

(MapWorker(ReadBinary->Map(BinaryScan._to_document)->MapBatches(_wrap)->MapBatches(BaseMapTransformCustom__Extract)) pid=62972) The `max_size` parameter is deprecated and will be removed in v4.26. Please specify in `size['longest_edge'] instead`.


In [5]:
# Step 1: read every file under `s3_path` as a binary PDF.
docset = ctx.read.binary(s3_path, parallelism=4, binary_format="pdf")

# Step 2: partition each PDF through the hosted Aryn endpoint, authenticating
# with `aryn_api_key`.
docset = docset.partition(
    partitioner=ArynPartitioner(
        aryn_api_key=aryn_api_key,
        aryn_partitioner_address="https://api.aryn.cloud/v1/document/partition",
    )
)

### Data extraction

In [6]:
# Step 1: have the LLM propose a schema for the "FlightAccidentReport" class.
# NOTE(review): num_of_elements=35 presumably limits how many elements of
# each document are sent to the LLM -- confirm against the extractor docs.
docset = docset.extract_batch_schema(
    schema_extractor=OpenAISchemaExtractor("FlightAccidentReport", llm=llm, num_of_elements=35)
)

# Step 2: extract property values with the same LLM and element limit.
docset = docset.extract_properties(
    property_extractor=OpenAIPropertyExtractor(llm=llm, num_of_elements=35)
)

# Step 3: merge elements using the tokenizer.
# NOTE(review): 300 is presumably the token budget per merged element -- confirm.
docset = docset.merge(GreedyTextElementMerger(tokenizer, 300))

In [7]:
# Run convert_timestamp over every document to add the normalized `day` and
# `isoDateTime` entries to its entity properties.
docset = docset.map(convert_timestamp)

In [8]:
# Pull sample documents back into the notebook for inspection. This is the
# call that starts executing the pipeline (see the execution log below);
# take() is called without a count, so the library default applies.
sample_docs = docset.take()

2024-07-09 09:57:17,915	INFO streaming_executor.py:112 -- Starting execution of Dataset. Full logs are in /tmp/ray/session_2024-07-09_09-57-07_819296_62943/logs/ray-data
2024-07-09 09:57:17,916	INFO streaming_executor.py:113 -- Execution plan of Dataset: InputDataBuffer[Input] -> ActorPoolMapOperator[ReadBinary->Map(BinaryScan._to_document)->MapBatches(_wrap)->MapBatches(BaseMapTransformCustom__Extract)] -> TaskPoolMapOperator[MapBatches(extract_properties)->MapBatches(merge_elements)->MapBatches(convert_timestamp)]


- ReadBinary->Map(BinaryScan._to_document)->MapBatches(_wrap)->MapBatches(BaseMapTransformCustom__Extract) 1: …

- MapBatches(extract_properties)->MapBatches(merge_elements)->MapBatches(convert_timestamp) 2:   0%|          …

Running 0:   0%|                                                                                              …

2024-07-09 10:08:07,045	ERROR streaming_executor_state.py:449 -- An exception was raised from a task of operator "ReadBinary->Map(BinaryScan._to_document)->MapBatches(_wrap)->MapBatches(BaseMapTransformCustom__Extract)". Dataset execution will now abort. To ignore this exception and continue, set DataContext.max_errored_blocks.
2024-07-09 10:08:07,062	ERROR exceptions.py:63 -- Exception occurred in user code, with the abbreviated stack trace below. By default, the Ray Data internal stack trace is omitted from stdout, and only written to the Ray Data log files at /tmp/ray/session_2024-07-09_09-57-07_819296_62943/logs/ray-data. To output the full stack trace to stdout, set `DataContext.log_internal_stack_trace_to_stdout` to True.


RayTaskError(UserCodeException): [36mray::ReadBinary->Map(BinaryScan._to_document)->MapBatches(_wrap)->MapBatches(BaseMapTransformCustom__Extract)()[39m (pid=62972, ip=127.0.0.1, actor_id=448723e021ce19d2d819c51801000000, repr=MapWorker(ReadBinary->Map(BinaryScan._to_document)->MapBatches(_wrap)->MapBatches(BaseMapTransformCustom__Extract)))
  File "/Users/sheb/Desktop/GITHUB/notebookchanges/sycamore/lib/sycamore/sycamore/transforms/detr_partitioner.py", line 209, in _partition_remote
    raise ArynPDFPartitionerException(
sycamore.transforms.detr_partitioner.ArynPDFPartitionerException: Error: status_code: 500, reason: Internal Server Error

The above exception was the direct cause of the following exception:

[36mray::ReadBinary->Map(BinaryScan._to_document)->MapBatches(_wrap)->MapBatches(BaseMapTransformCustom__Extract)()[39m (pid=62972, ip=127.0.0.1, actor_id=448723e021ce19d2d819c51801000000, repr=MapWorker(ReadBinary->Map(BinaryScan._to_document)->MapBatches(_wrap)->MapBatches(BaseMapTransformCustom__Extract)))
  File "/Users/sheb/Desktop/GITHUB/notebookchanges/sycamore/lib/sycamore/sycamore/transforms/partition.py", line 483, in partition
    elements = partitioner.partition_pdf(
  File "/Users/sheb/Desktop/GITHUB/notebookchanges/sycamore/lib/sycamore/sycamore/transforms/detr_partitioner.py", line 131, in partition_pdf
    return self._partition_remote(
  File "/Users/sheb/Library/Caches/pypoetry/virtualenvs/sycamore-monorepo--rUP0lTI-py3.9/lib/python3.9/site-packages/tenacity/__init__.py", line 332, in wrapped_f
    return self(f, *args, **kw)
  File "/Users/sheb/Library/Caches/pypoetry/virtualenvs/sycamore-monorepo--rUP0lTI-py3.9/lib/python3.9/site-packages/tenacity/__init__.py", line 469, in __call__
    do = self.iter(retry_state=retry_state)
  File "/Users/sheb/Library/Caches/pypoetry/virtualenvs/sycamore-monorepo--rUP0lTI-py3.9/lib/python3.9/site-packages/tenacity/__init__.py", line 370, in iter
    result = action(retry_state)
  File "/Users/sheb/Library/Caches/pypoetry/virtualenvs/sycamore-monorepo--rUP0lTI-py3.9/lib/python3.9/site-packages/tenacity/__init__.py", line 413, in exc_check
    raise retry_exc from fut.exception()
tenacity.RetryError: RetryError[<Future at 0x16c1409a0 state=finished raised ArynPDFPartitionerException>]

The above exception was the direct cause of the following exception:

[36mray::ReadBinary->Map(BinaryScan._to_document)->MapBatches(_wrap)->MapBatches(BaseMapTransformCustom__Extract)()[39m (pid=62972, ip=127.0.0.1, actor_id=448723e021ce19d2d819c51801000000, repr=MapWorker(ReadBinary->Map(BinaryScan._to_document)->MapBatches(_wrap)->MapBatches(BaseMapTransformCustom__Extract)))
  File "/Users/sheb/Library/Caches/pypoetry/virtualenvs/sycamore-monorepo--rUP0lTI-py3.9/lib/python3.9/site-packages/ray/data/_internal/planner/plan_udf_map_op.py", line 125, in fn
    return op_fn(item, *fn_args, **fn_kwargs)
  File "/Users/sheb/Desktop/GITHUB/notebookchanges/sycamore/lib/sycamore/sycamore/transforms/base.py", line 178, in ray_callable
    return BaseMapTransform._process_ray(ray_input, name, lambda d: f(d, *args, **kwargs), enable_auto_metadata)
  File "/Users/sheb/Desktop/GITHUB/notebookchanges/sycamore/lib/sycamore/sycamore/transforms/base.py", line 230, in _process_ray
    outputs = f(docs)
  File "/Users/sheb/Desktop/GITHUB/notebookchanges/sycamore/lib/sycamore/sycamore/transforms/base.py", line 178, in <lambda>
    return BaseMapTransform._process_ray(ray_input, name, lambda d: f(d, *args, **kwargs), enable_auto_metadata)
  File "/Users/sheb/Desktop/GITHUB/notebookchanges/sycamore/lib/sycamore/sycamore/transforms/map.py", line 49, in _wrap
    return [f(d, *args, **kwargs) for d in docs]
  File "/Users/sheb/Desktop/GITHUB/notebookchanges/sycamore/lib/sycamore/sycamore/transforms/map.py", line 49, in <listcomp>
    return [f(d, *args, **kwargs) for d in docs]
  File "/Users/sheb/Desktop/GITHUB/notebookchanges/sycamore/lib/sycamore/sycamore/utils/time_trace.py", line 136, in wrapper
    return f(*args, **kwargs)
  File "/Users/sheb/Desktop/GITHUB/notebookchanges/sycamore/lib/sycamore/sycamore/transforms/partition.py", line 501, in partition
    raise RuntimeError(f"SycamorePartitioner Error processing {path}") from e
RuntimeError: SycamorePartitioner Error processing s3://aryn-public/ntsb/0.pdf

The above exception was the direct cause of the following exception:

[36mray::ReadBinary->Map(BinaryScan._to_document)->MapBatches(_wrap)->MapBatches(BaseMapTransformCustom__Extract)()[39m (pid=62972, ip=127.0.0.1, actor_id=448723e021ce19d2d819c51801000000, repr=MapWorker(ReadBinary->Map(BinaryScan._to_document)->MapBatches(_wrap)->MapBatches(BaseMapTransformCustom__Extract)))
  File "/Users/sheb/Library/Caches/pypoetry/virtualenvs/sycamore-monorepo--rUP0lTI-py3.9/lib/python3.9/site-packages/ray/data/_internal/execution/operators/actor_pool_map_operator.py", line 403, in submit
    yield from _map_task(
  File "/Users/sheb/Library/Caches/pypoetry/virtualenvs/sycamore-monorepo--rUP0lTI-py3.9/lib/python3.9/site-packages/ray/data/_internal/execution/operators/map_operator.py", line 419, in _map_task
    for b_out in map_transformer.apply_transform(iter(blocks), ctx):
  File "/Users/sheb/Library/Caches/pypoetry/virtualenvs/sycamore-monorepo--rUP0lTI-py3.9/lib/python3.9/site-packages/ray/data/_internal/execution/operators/map_transformer.py", line 155, in apply_transform
    iter = transform_fn(iter, ctx)
  File "/Users/sheb/Library/Caches/pypoetry/virtualenvs/sycamore-monorepo--rUP0lTI-py3.9/lib/python3.9/site-packages/ray/data/_internal/execution/operators/map_transformer.py", line 309, in __call__
    first = next(block_iter, None)
  File "/Users/sheb/Library/Caches/pypoetry/virtualenvs/sycamore-monorepo--rUP0lTI-py3.9/lib/python3.9/site-packages/ray/data/_internal/execution/operators/map_transformer.py", line 392, in __call__
    for data in iter:
  File "/Users/sheb/Library/Caches/pypoetry/virtualenvs/sycamore-monorepo--rUP0lTI-py3.9/lib/python3.9/site-packages/ray/data/_internal/execution/operators/map_transformer.py", line 134, in _udf_timed_iter
    output = next(input)
  File "/Users/sheb/Library/Caches/pypoetry/virtualenvs/sycamore-monorepo--rUP0lTI-py3.9/lib/python3.9/site-packages/ray/data/_internal/execution/operators/map_transformer.py", line 236, in __call__
    yield from self._batch_fn(input, ctx)
  File "/Users/sheb/Library/Caches/pypoetry/virtualenvs/sycamore-monorepo--rUP0lTI-py3.9/lib/python3.9/site-packages/ray/data/_internal/planner/plan_udf_map_op.py", line 208, in transform_fn
    res = fn(batch)
  File "/Users/sheb/Library/Caches/pypoetry/virtualenvs/sycamore-monorepo--rUP0lTI-py3.9/lib/python3.9/site-packages/ray/data/_internal/planner/plan_udf_map_op.py", line 127, in fn
    _handle_debugger_exception(e)
  File "/Users/sheb/Library/Caches/pypoetry/virtualenvs/sycamore-monorepo--rUP0lTI-py3.9/lib/python3.9/site-packages/ray/data/_internal/planner/plan_udf_map_op.py", line 143, in _handle_debugger_exception
    raise UserCodeException() from e
ray.exceptions.UserCodeException

In [None]:
# Display the '_schema' property of the first sample document.
sample_docs[0].properties['_schema']

In [None]:
# Display the 'entity' property of the first sample document.
sample_docs[0].properties['entity']

In [None]:
# Print a short summary per sample document: its path, then a fixed set of
# entity fields in a left-aligned 25-character column, with the literal text
# 'None' shown for fields that are absent. A blank line separates documents.
for d in sample_docs:
    print(f"Path: {d.properties['path']}")
    for k in ("dateAndTime", "day", "aircraft", "location"):
        print(f"{k: <25} {d.properties['entity'].get(k, 'None')}")
    print()

## Indexing

In [None]:
from sycamore.transforms.embed import SentenceTransformerEmbedder
import os

# Choose the OpenSearch hostname from the runtime environment: the presence of
# /.dockerenv is taken to mean we are inside a container, where the host is
# "opensearch"; otherwise fall back to "localhost".
opensearch_host = "opensearch" if os.path.exists("/.dockerenv") else "localhost"
print(
    "Assuming we are in a Sycamore Jupyter container, using opensearch for OpenSearch host"
    if opensearch_host == "opensearch"
    else "Assuming we are running outside of a container, using localhost for OpenSearch host"
)

# Target index name. OpenSearch requires index names to be all lowercase and
# rejects index creation otherwise, so the name must not contain capitals.
index = "ntsb_aryn"
# Connection settings passed to the OpenSearch writer.
# NOTE(review): the admin/admin credentials and disabled certificate checks
# look like local-demo defaults -- do not reuse them against a real cluster.
os_client_args = dict(
    hosts=[dict(host=opensearch_host, port=9200)],
    http_compress=True,
    http_auth=("admin", "admin"),
    use_ssl=True,
    verify_certs=False,
    ssl_assert_hostname=False,
    ssl_show_warn=False,
    timeout=120,
)

# Index definition handed to the OpenSearch writer: k-NN is enabled on the
# index and `embedding` is mapped as a knn_vector field.
index_settings = {
    "body": {
        "settings": {
            "index.knn": True,
            "number_of_shards": 5,
            "number_of_replicas": 1,
        },
        "mappings": {
            "properties": {
                "embedding": {
                    "type": "knn_vector",
                    # NOTE(review): must equal the embedding model's output
                    # size -- confirm it matches the embedder used below.
                    "dimension": 384,
                    "method": {
                        "name": "hnsw",
                        "engine": "faiss",
                        "space_type": "l2",
                        "parameters": {},
                    },
                },
            },
        },
    },
}

In [None]:
# Step 1: spread the "entity" and "path" properties.
# NOTE(review): presumably this copies them from each document onto its
# elements so they survive explode() -- confirm against the Sycamore docs.
docset = docset.spread_properties(["entity", "path"])

# Step 2: explode the DocSet, then apply sketch().
docset = docset.explode()
docset = docset.sketch()

# Step 3: compute embeddings with a sentence-transformers model.
docset = docset.embed(
    embedder=SentenceTransformerEmbedder(
        batch_size=100,
        model_name="sentence-transformers/all-MiniLM-L6-v2",
    )
)

In [None]:
# Write the DocSet to OpenSearch: connect with `os_client_args` and target
# the index named by `index`, supplying `index_settings` as its definition.
docset.write.opensearch(
    os_client_args=os_client_args,
    index_name=index,
    index_settings=index_settings,
)