In [7]:
import sycamore

# URL of the accident report PDF (FTW95FA129, hosted on faa.gov) that is read into the DocSet below.
# NOTE(review): a local file path was previously used here as an alternative to the URL.
paths = "https://www.faa.gov/sites/faa.gov/files/2022-12/FTW95FA129.pdf"

# Initializing sycamore, which also initializes Ray underneath
context = sycamore.init()

# Creating a DocSet by reading the PDF in binary form
docset = context.read.binary(paths, parallelism=1, binary_format="pdf")

In [8]:
from sycamore.transforms.partition import UnstructuredPdfPartitioner

# Partition the documents with UnstructuredPdfPartitioner.
# Sycamore supports pluggable partitioners for different formats.
pdf_partitioner = UnstructuredPdfPartitioner()
docset = docset.partition(partitioner=pdf_partitioner)

In [9]:
from sycamore.transforms.schema import OpenAISchema, OpenAIPropertyExtractor
from sycamore.llms import OpenAIModels, OpenAI
import os

# LLM client; the API key is taken from the OPENAI_API_KEY environment variable
# (os.environ.get yields None when the variable is unset).
openai = OpenAI(
    OpenAIModels.GPT_3_5_TURBO.value,
    api_key=os.environ.get("OPENAI_API_KEY"),
)

# Extract a schema for the "AircraftAccident" class from the partitioned documents.
aircraft_schema_extractor = OpenAISchema("AircraftAccident", llm=openai, num_of_elements=35)
docset = docset.extract_schema(schema_extractor=aircraft_schema_extractor)

In [10]:
# Materialize a single document and keep the schema stored under its "_schema" property.
first_doc = docset.take(1)[0]
schema = first_doc.properties["_schema"]

2023-11-14 13:49:38,184	INFO streaming_executor.py:93 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[ReadBinary->Map(BinaryScan._to_document)->Map(partition)->Map(extract_schema)] -> LimitOperator[limit=1]
2023-11-14 13:49:38,185	INFO streaming_executor.py:94 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
2023-11-14 13:49:38,185	INFO streaming_executor.py:96 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

In [11]:
# Directory holding the NTSB report PDFs. Set the NTSB_PDF_DIR environment variable
# to point somewhere else; the default is ~/Downloads/NTSB under the current user's
# home directory, so the notebook is not tied to one person's machine.
ntsb_path = os.environ.get("NTSB_PDF_DIR", os.path.expanduser("~/Downloads/NTSB"))

# Read the PDFs in binary form, then partition them with the same partitioner class
# that was used for the schema document.
ntsb_docs = context.read.binary(ntsb_path, parallelism=1, binary_format="pdf")
ntsb_docs = ntsb_docs.partition(partitioner=UnstructuredPdfPartitioner())

def update_schema(doc):
    """Attach the previously captured schema to ``doc`` and return it.

    Writes two entries into ``doc.properties``: ``_schema`` (the notebook-level
    ``schema`` value) and ``_schema_class`` (the string "AircraftAccident").
    The same ``doc`` object that was passed in is returned.
    """
    props = doc.properties
    props["_schema"] = schema
    props["_schema_class"] = "AircraftAccident"
    return doc

# Keep at most five documents, attach the shared schema to each, then run the
# OpenAI-backed property extractor over them.
# NOTE(review): the logged DAG runs the partition step before limit=5; consider
# limiting before partitioning if the directory is large — confirm.
property_extractor = OpenAIPropertyExtractor(llm=openai, num_of_elements=20)
ntsb_docs = (
    ntsb_docs
    .limit(5)
    .map(update_schema)
    .extract_properties(property_extractor=property_extractor)
)
ntsb_docs.show()

2023-11-14 13:49:43,061	INFO streaming_executor.py:93 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[ReadBinary->Map(BinaryScan._to_document)->Map(partition)] -> LimitOperator[limit=5] -> TaskPoolMapOperator[Map(update_schema)->Map(extract_properties)] -> LimitOperator[limit=20]
2023-11-14 13:49:43,061	INFO streaming_executor.py:94 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
2023-11-14 13:49:43,062	INFO streaming_executor.py:96 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

{'doc_id': 'b8223248-8337-11ee-ae37-9ec0ee1f5eae',
 'type': 'pdf',
 'binary_representation': b'<98716 bytes>',
 'properties': {'path': '/Users/moe/Downloads/NTSB/FTW95FA129.pdf',
                '_schema': {'type': 'object',
                            'properties': {'accidentNumber': {'type': 'string'},
                                           'location': {'type': 'string'},
                                           'dateAndTime': {'type': 'string',
                                                           'format': 'date-time'},
                                           'aircraft': {'type': 'string'},
                                           'definingEvent': {'type': 'string'},
                                           'flightConductedUnder': {'type': 'string'},
                                           'probableCause': {'type': 'string'},
                                           'registration': {'type': 'string'},
                                           'aircraftDamage