In [1]:
import os
from pathlib import Path

import sycamore
from sycamore.transforms.partition import ArynPartitioner
from sycamore.transforms.extract_schema import (
    OpenAIPropertyExtractor,
)
from sycamore.llms import OpenAI, OpenAIModels
from sycamore.data.element import Element
from sycamore.data.document import Document
from sycamore.functions import HuggingFaceTokenizer, OpenAITokenizer
from sycamore.llms import OpenAI, OpenAIModels
from sycamore.transforms.embed import SentenceTransformerEmbedder
from aryn_sdk.client.client import Client
from sycamore.transforms.embed import OpenAIEmbedder
from sycamore import MaterializeSourceMode

In [2]:
# Credentials are read from the environment so that no secret is stored in the
# notebook. The keys that used to be pasted here have been committed and should
# be treated as compromised (rotate them).
#   ARYN_API_KEY      - key passed to the docset client and the Aryn writer.
#   ARYN_PROD_API_KEY - key passed to the partitioner; falls back to ARYN_API_KEY.
# An unset variable yields None here rather than a stale literal.
# NOTE(review): confirm the Aryn clients either fall back to their own config
# lookup or fail with a clear auth error when given None.
aryn_api_key = os.environ.get("ARYN_API_KEY")
prod_api_key = os.environ.get("ARYN_PROD_API_KEY", aryn_api_key)

# Input location, target docset name and local cache directory.
paths = "files/actual_files/"
docset_name = "haystack_4_0_1"
materialize_dir = Path("materialize")

In [3]:
## Set your API keys before running: you'll need an ARYN_API_KEY and an OPENAI_API_KEY.
context = sycamore.init()

# Read the files under `paths` as binary PDF documents.
initial_docset = context.read.binary(
    paths,
    binary_format="pdf",
)

In [4]:
# Resolve the GPT-4o enum value once: its name configures the tokenizer and the
# value itself configures the LLM client.
gpt_4o = OpenAIModels.GPT_4O.value
tokenizer = OpenAITokenizer(gpt_4o.name)
llm = OpenAI(gpt_4o)

In [5]:
# JSON schema for the document-level fields extracted from each earnings call.
schema_json = {
    "type": "object",
    "properties": {
        "company_name": {
            "type": "string",
            "description": "name of the  company",
        },
        "company_ticker": {
            "type": "string",
            "description": "Ticker of the  company",
        },
        "quarter": {
            "type": "string",
            "description": "Quarter of the earnings call, it should be in the format of Q1, Q2, Q3, Q4",
        },
        "date": {
            "type": "string",
            "description": "The date of the earnings call",
        },
    },
}

In [6]:
# Partition the PDFs, reuse the copy stored under materialize/partitioned,
# split elements to at most 512 tokens, then run the LLM property extraction
# against the "earnings_call" schema.
partitioned_docset = (
    initial_docset
    .partition(partitioner=ArynPartitioner(aryn_api_key=prod_api_key))
    .materialize(
        path=materialize_dir / "partitioned",
        source_mode=MaterializeSourceMode.USE_STORED,
    )
    .split_elements(tokenizer=tokenizer, max_tokens=512)
    .extract_properties(
        property_extractor=OpenAIPropertyExtractor(
            llm=llm,
            schema=schema_json,
            schema_name="earnings_call",
        )
    )
)

In [7]:
def filter_out_cruft(elem: Element) -> bool:
    """Return True unless the element's type is one of the page-furniture types.

    Rejected types: Image, Page-footer, Footnote, Page-header, Caption.
    The comparison is case-sensitive.
    """
    cruft_types = ('Image', 'Page-footer', 'Footnote', 'Page-header', 'Caption')
    is_cruft = elem.type in cruft_types
    return not is_cruft

# Apply filter_out_cruft to the partitioned docset's elements; presumably only
# elements for which it returns True are kept (confirm against filter_elements docs).
filtered_docset = partitioned_docset.filter_elements(filter_out_cruft)

In [8]:
## Replace [^ ]*[^\S\n\t][^ ]*[^\S\n\t]--[^\S\n\t][^ ]*
import re
from sycamore.data import Document, Element
def markSpeakers(doc: Element) -> Element:
    """Tag an element whose text starts a new speaker turn.

    Three header shapes are recognised:

    * ``Operator`` - the whole text, ignoring surrounding whitespace.
    * ``Name -- Organisation -- Role`` - an external speaker.
    * ``Name -- Role`` - an internal speaker.

    For a recognised header the function sets ``speaker`` (True),
    ``speaker_name`` and ``speaker_role`` in ``doc.properties`` (plus
    ``speaker_external_org`` for the three-part form) and sets
    ``doc.data["_break"]`` to True. Anything else is left untouched.

    Args:
        doc: The element to inspect. It is mutated in place.

    Returns:
        The same element object that was passed in.
    """
    text = doc.text_representation
    if not text:
        return doc

    # Raw strings keep `\S` from being read as an (invalid) string escape;
    # `[^\S\n\t]` means "whitespace other than newline or tab".
    external_pattern = r'([^ ]*[^\S\n\t]){1,4}--[^\S\n\t].*--'
    internal_pattern = r'([^ ]*[^\S\n\t]){1,4}--.*'

    if text.strip() == 'Operator':
        doc.properties['speaker'] = True
        doc.properties['speaker_role'] = 'Operator'
        doc.properties['speaker_name'] = 'Operator'
        doc.data["_break"] = True
    elif re.match(external_pattern, text):
        first_sep = text.find('--')
        # Search from just past the first separator so the result is an absolute
        # index. The organisation is then exactly the text between the two
        # separators, even when no whitespace precedes the second one.
        second_sep = text.find('--', first_sep + 2)
        doc.properties['speaker_name'] = text[:first_sep].strip()
        doc.properties['speaker_external_org'] = text[first_sep + 2:second_sep].strip()
        doc.properties['speaker_role'] = text[second_sep + 2:].strip()
        doc.properties['speaker'] = True
        doc.data["_break"] = True
    elif re.match(internal_pattern, text):
        first_sep = text.find('--')
        doc.properties['speaker_name'] = text[:first_sep].strip()
        doc.properties['speaker_role'] = text[first_sep + 2:].strip()
        doc.properties['speaker'] = True
        doc.data["_break"] = True
    return doc

# Run markSpeakers over the elements of the filtered docset so speaker headers
# carry speaker_* properties and a "_break" marker.
speakers_marked_ds = filtered_docset.map_elements(markSpeakers)

In [9]:
from sycamore.transforms.merge_elements import MarkedMerger

# Merge elements with MarkedMerger; presumably it starts a new merged element at
# each "_break" marker set by markSpeakers (confirm against MarkedMerger docs).
merged_ds = speakers_marked_ds.merge(MarkedMerger())

In [10]:
def prop_to_entity(props: list[str]):
    """Build a mapper that moves selected properties under ``properties["entity"]``.

    Args:
        props: Names of top-level properties to relocate. Names that are not
            present on a given object are skipped.

    Returns:
        A function that takes a document or element (any object with a
        ``properties`` dict), updates it in place and returns it:

        * a missing ``"entity"`` starts as an empty dict;
        * a non-dict ``"entity"`` is kept as ``{"original_entity": <old value>}``;
        * each listed property that is present is removed from the top level
          and stored under the same name inside ``"entity"``.
    """
    def prop_to_entity_inner(doc_or_elt):
        properties = doc_or_elt.properties
        if "entity" not in properties:
            entity = {}
        elif isinstance(properties["entity"], dict):
            # Work on a shallow copy. If several objects hold the same dict
            # (e.g. a value propagated by reference from a parent document),
            # writing into it would overwrite every holder's fields.
            entity = dict(properties["entity"])
        else:
            entity = {"original_entity": properties["entity"]}
        properties["entity"] = entity

        for name in props:
            if name in properties:
                entity[name] = properties.pop(name)
        return doc_or_elt
    return prop_to_entity_inner

# 1. Move the document-level "earnings_call" property under "entity".
# 2. Spread the document's "path" and "entity" properties to its elements.
# 3. Move each element's speaker_* properties under that element's "entity".
rejiggered_ds = (
    merged_ds
    .map(prop_to_entity(["earnings_call"]))
    .spread_properties(["path", "entity"])
    .map_elements(
        prop_to_entity(
            ["speaker", "speaker_role", "speaker_name", "speaker_external_org"]
        )
    )
)

In [11]:
# Embedding model used when the docset is written out.
model_name = "text-embedding-3-small"

# Client for the Aryn test endpoint; create the target docset up front so its
# id is available to the writer.
myClient = Client(
    aryn_url="https://test-api.aryn.ai",
    aryn_api_key=aryn_api_key,
)
docset = myClient.create_docset(name=docset_name)

In [12]:
# Embed the elements with the OpenAI embedder, then write everything to the
# docset created above via the Aryn storage endpoint.
(
    rejiggered_ds
    .embed(embedder=OpenAIEmbedder(model_name=model_name))
    .write.aryn(
        aryn_url="https://test-api.aryn.ai/v1/storage",
        docset_id=docset.value.docset_id,
        aryn_api_key=aryn_api_key,
        autoschema=True,
    )
)

  rejiggered_ds.embed(embedder=OpenAIEmbedder(model_name=model_name)).write.aryn(aryn_url="https://test-api.aryn.ai/v1/storage", docset_id=docset.value.docset_id, aryn_api_key=aryn_api_key, autoschema=True)
  writer: Node = ArynWriter(self.plan, client_params=client_params, target_params=target_params, **kwargs)
2025-04-01 11:17:05,737	INFO worker.py:1832 -- Started a local Ray instance. View the dashboard at http://127.0.0.1:8265


sycamore.materialize - INFO - 2025-04-01 18:17:06,440 - Using materialize/partitioned as the cached source of data


(pid=41232) INFO:root:Spurious log 1: Verifying that log messages are propagated
2025-04-01 11:17:07,952	INFO streaming_executor.py:108 -- Starting execution of Dataset. Full logs are in /tmp/ray/session_2025-04-01_11-17-04_788603_41209/logs/ray-data
2025-04-01 11:17:07,952	INFO streaming_executor.py:109 -- Execution plan of Dataset: InputDataBuffer[Input] -> TaskPoolMapOperator[ReadBinary->FlatMap(Materialize._ray_to_document)->MapBatches(split_doc)->MapBatches(llm_map)->MapBatches(parse_json_and_cast)->MapBatches(process_doc)->MapBatches(process_doc)->MapBatches(merge_elements)->MapBatches(prop_to_entity_inner)->MapBatches(spread_properties)->MapBatches(process_doc)->MapBatches(OpenAIEmbedder)->MapBatches(_write_docs_tt)]


Running 0: 0.00 row [00:00, ? row/s]



- ReadBinary->FlatMap(Materialize._ray_to_document)->...->MapBatches(_write_docs_tt) 1: 0.00 row [00:00, ? row…

(ReadBinary->FlatMap(Materialize._ray_to_document)->MapBatches(split_doc)->MapBatches(llm_map)->MapBatches(parse_json_and_cast)->MapBatches(process_doc)->MapBatches(process_doc)->MapBatches(merge_elements)->MapBatches(prop_to_entity_inner)->MapBatches(spread_properties)->MapBatches(process_doc)->MapBatches(OpenAIEmbedder)->MapBatches(_write_docs_tt) pid=41235) INFO:httpx:HTTP Request: POST https://oai.helicone.ai/v1/chat/completions "HTTP/1.1 200 OK"
(pid=41237) INFO:root:Spurious log 1: Verifying that log messages are propagated [repeated 4x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/user-guides/configure-logging.html#log-deduplication for more options.)
(ReadBinary->FlatMap(Materialize._ray_to_document)->MapBatches(split_doc)->MapBatches(llm_map)->MapBatches(parse_json_and_cast)->MapBatches(process_doc)->MapBatches(process_doc)->MapBatches(merge_elements)->MapBatches(prop

In [13]:
# Re-fetch the docset by id after the write; `docset` is rebound to the fresh response.
docset = myClient.get_docset(docset_id=docset.value.docset_id)

In [14]:
# Display the docset metadata returned by the service (size, inferred query schema, ...).
docset.value

DocSetMetadata(account_id='224661388308', docset_id='aryn:ds-4tu7utxr3u21r0q15iz91zn', name='haystack_4_0_1', created_at=datetime.datetime(2025, 4, 1, 18, 17, 2, 147698), readonly=False, properties=None, size=92, query_schema=Schema(fields=[SchemaField(name='speaker_name', field_type='str', default=None, description=None, examples=None), SchemaField(name='speaker_role', field_type='str', default=None, description=None, examples=None), SchemaField(name='speaker', field_type='bool', default=None, description=None, examples=None), SchemaField(name='earnings_call.date', field_type='str', default=None, description=None, examples=None), SchemaField(name='earnings_call.quarter', field_type='str', default=None, description=None, examples=None), SchemaField(name='earnings_call.company_ticker', field_type='str', default=None, description=None, examples=None), SchemaField(name='earnings_call.company_name', field_type='str', default=None, description=None, examples=None), SchemaField(name='speaker

In [15]:
# Display the docset id on its own.
docset.value.docset_id

'aryn:ds-4tu7utxr3u21r0q15iz91zn'