In [14]:
import sycamore

# S3 path to the Sort Benchmark home page (a single HTML file) used as input.
paths = "s3://aryn-datasets-us-east-1/sort_benchmark/index.html"

# Initialize Sycamore, which also initializes Ray underneath.
context = sycamore.init()

# Create a DocSet by reading the file as binary content tagged as HTML.
# NOTE(review): parallelism=1 presumably keeps the read to a single task — confirm against the Sycamore docs.
docset = context.read.binary(paths, parallelism=1, binary_format="html")

In [15]:
from sycamore.transforms.partition import HtmlPartitioner

# Partition the documents with HtmlPartitioner, matching the HTML input read above.
# Sycamore supports pluggable partitioners for different formats.
# NOTE: this statement rebinds `docset`, so re-running the cell on its own would
# partition the already-partitioned DocSet; re-run from the read cell instead.
docset = docset.partition(partitioner=HtmlPartitioner())

In [16]:
from sycamore.transforms.schema import OpenAISchema, OpenAIPropertyExtractor
from sycamore.llms import OpenAIModels, OpenAI
import os


# Schema and property extraction are both driven by OpenAI's gpt-3.5-turbo model
# through OpenAISchema and OpenAIPropertyExtractor.
# Sycamore's extractors are pluggable, so you can supply your own instead.

# The API key is read from the OPENAI_API_KEY environment variable; export it
# before starting the kernel rather than pasting the key into the notebook.
openai = OpenAI(OpenAIModels.GPT_3_5_TURBO.value, api_key=os.environ.get("OPENAI_API_KEY"))

# Pipeline stages, in order:
#   1. extract_schema     - derive a schema for the "BenchmarkRun" class with the LLM
#                           (NOTE(review): num_of_elements=50 presumably caps how many
#                           elements are sent to the LLM - confirm in the Sycamore docs)
#   2. explode            - flat-map each document into multiple documents
#   3. limit              - keep only the first 10 documents
#   4. extract_properties - have the LLM populate properties on each remaining document
docset = (
    docset.extract_schema(
        schema_extractor=OpenAISchema("BenchmarkRun", llm=openai, num_of_elements=50)
    )
    .explode()
    .limit(10)
    .extract_properties(property_extractor=OpenAIPropertyExtractor(llm=openai))
)

In [17]:
# Run the pipeline built in the previous cells and print the resulting documents.
docset.show()

2023-11-09 12:22:56,268	INFO streaming_executor.py:93 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[ReadBinary->Map(BinaryScan._to_document)->Map(partition)->Map(extract_schema)->FlatMap(explode)] -> LimitOperator[limit=10] -> TaskPoolMapOperator[Map(extract_properties)] -> LimitOperator[limit=20]
2023-11-09 12:22:56,269	INFO streaming_executor.py:94 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
2023-11-09 12:22:56,269	INFO streaming_executor.py:96 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

{'doc_id': 'c51682c2-7f3d-11ee-98d5-9ec0ee1f5eae',
 'type': 'html',
 'binary_representation': b'<62442 bytes>',
 'properties': {'path': 's3://aryn-datasets-us-east-1/sort_benchmark/index.html',
                'title': 'Sort Benchmark Home Page',
                '_schema': {'BenchmarkRun': {'name': 'string',
                                             'year': 'integer',
                                             'recordsSorted': 'integer',
                                             'sortTimeSeconds': 'number',
                                             'hardware': {'processor': 'string',
                                                          'memory': 'string',
                                                          'storage': 'string'},
                                             'participants': [{'name': 'string',
                                                               'affiliation': 'string'}]}},
                '_schema_class': 'BenchmarkRun',
                'e

In [18]:
def _is_benchmark_run_from(doc, year=2011):
    """Return True when `doc` carries an extracted BenchmarkRun entity for `year`.

    The entity is produced by an LLM, so it is not guaranteed to follow the
    schema: the "entity" property may be absent, may be the placeholder string
    "None", or may lack the "BenchmarkRun" / "year" keys. Any such document is
    treated as a non-match instead of raising and aborting the whole pipeline.

    Args:
        doc: A document exposing a `properties` mapping.
        year: Benchmark year to match (compared with `==`, so it must have the
            same type as the extracted value).

    Returns:
        bool: True only if the entity is present and its year equals `year`.
    """
    try:
        entity = doc.properties["entity"]
        # "None" (a string) is the placeholder seen when nothing was extracted.
        return entity != "None" and entity["BenchmarkRun"]["year"] == year
    except (KeyError, TypeError):
        # Missing keys, or an entity that is not subscriptable by string key:
        # the document cannot match, so skip it rather than fail the job.
        return False


# Keep only the benchmark runs from 2011 and print them.
docset.filter(_is_benchmark_run_from).show()

2023-11-09 12:22:58,285	INFO streaming_executor.py:93 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[ReadBinary->Map(BinaryScan._to_document)->Map(partition)->Map(extract_schema)->FlatMap(explode)] -> LimitOperator[limit=10] -> TaskPoolMapOperator[Map(extract_properties)->MapBatches(<lambda>)] -> LimitOperator[limit=20]
2023-11-09 12:22:58,286	INFO streaming_executor.py:94 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
2023-11-09 12:22:58,287	INFO streaming_executor.py:96 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

{'type': 'text',
 'text_representation': '\n'
                        'Zhipeng Tang,\n'
                        'Zhiqiang Lv,\n'
                        '\n'
                        'Bin Lu,\n'
                        'Yangyu Tao,\n'
                        'Chao Li,\n'
                        'Jingren Zhou,\n'
                        'Hong Tang\n'
                        '\n'
                        'Alibaba Group Inc <900 chars>',
 'properties': {'path': 's3://aryn-datasets-us-east-1/sort_benchmark/index.html',
                'title': 'Sort Benchmark Home Page',
                '_schema': {'BenchmarkRun': {'name': 'string',
                                             'year': 'integer',
                                             'recordsSorted': 'integer',
                                             'sortTimeSeconds': 'number',
                                             'hardware': {'processor': 'string',
                                                          'memory': 'stri