In [None]:
import os
from pathlib import Path

import sycamore
from sycamore.transforms.partition import ArynPartitioner
from sycamore.transforms.extract_schema import (
    OpenAIPropertyExtractor,
)
from sycamore.llms import OpenAI, OpenAIModels
from sycamore.data.element import Element
from sycamore.data.document import Document
from sycamore.functions import HuggingFaceTokenizer, OpenAITokenizer
from sycamore.llms import OpenAI, OpenAIModels
from sycamore.transforms.embed import SentenceTransformerEmbedder
from aryn_sdk.client.client import Client
from sycamore.transforms.embed import OpenAIEmbedder
from sycamore import MaterializeSourceMode

In [None]:
# Notebook configuration.
# The Aryn API key is read from the ARYN_API_KEY environment variable so that no
# credential is stored in (or committed with) the notebook itself.
aryn_api_key = os.environ.get("ARYN_API_KEY", "")
if not aryn_api_key:
    # Warn instead of raising so the cells that do not talk to Aryn remain usable.
    print("WARNING: ARYN_API_KEY is not set; export it before running the cells that call the Aryn services.")
prod_api_key = aryn_api_key  # set this to a prod key while we're testing in test
# Location of the input PDFs handed to the reader.
paths = "files/actual_files/"
# Name given to the docset created through the Aryn client.
docset_name = "haystack_4_0_1"
# Root directory for materialized (cached) intermediate docsets.
materialize_dir = Path("materialize")

In [None]:
## Set your api-keys. You'll need an ARYN_API_KEY and an OPENAI_API_KEY
# Create the Sycamore context, then read the input at `paths` as binary
# documents declared to be in PDF format.
context = sycamore.init()
initial_docset = context.read.binary(paths, binary_format="pdf")

In [None]:
# Both objects are configured from the same OpenAIModels.GPT_4O entry: the
# tokenizer receives the model's name, the LLM wrapper receives the model value.
# The tokenizer is used later by split_elements, the LLM by the property extractor.
tokenizer = OpenAITokenizer(OpenAIModels.GPT_4O.value.name)
llm = OpenAI(OpenAIModels.GPT_4O.value)

In [None]:
# JSON-schema style description of the fields the property extractor is asked
# to fill in for each earnings call. Every field is declared as a string.
schema_json = {
    "type": "object",
    "properties": {
        "company_name": {
            "type": "string",
            "description": "name of the  company",
        },
        "company_ticker": {
            "type": "string",
            "description": "Ticker of the  company",
        },
        "quarter": {
            "type": "string",
            "description": "Quarter of the earnings call, it should be in the format of Q1, Q2, Q3, Q4",
        },
        "date": {
            "type": "string",
            "description": "The date of the earnings call",
        },
    },
}

In [None]:
# Document preparation pipeline, one stage per line:
#   1. partition the PDFs with the Aryn partitioner (uses prod_api_key),
#   2. materialize the partitioned result under materialize_dir / "partitioned"
#      (USE_STORED presumably reuses that stored copy on later runs - confirm),
#   3. split elements so that none exceeds 512 tokens of the GPT-4o tokenizer,
#   4. have the LLM extract the schema_json fields under the name "earnings_call".
partitioned_docset = (
    initial_docset
    .partition(partitioner=ArynPartitioner(aryn_api_key=prod_api_key))
    .materialize(
        path=materialize_dir / "partitioned",
        source_mode=MaterializeSourceMode.USE_STORED,
    )
    .split_elements(tokenizer=tokenizer, max_tokens=512)
    .extract_properties(
        property_extractor=OpenAIPropertyExtractor(
            llm=llm,
            schema=schema_json,
            schema_name="earnings_call",
        )
    )
)

In [None]:
def filter_out_cruft(elem: Element) -> bool:
    """Element predicate: False for layout-noise element types, True otherwise.

    An element is rejected when its ``type`` is one of 'Image', 'Page-footer',
    'Footnote', 'Page-header' or 'Caption'; any other type is kept.
    """
    cruft_types = ('Image', 'Page-footer', 'Footnote', 'Page-header', 'Caption')
    is_cruft = elem.type in cruft_types
    return not is_cruft

# Apply filter_out_cruft as the element filter on the partitioned docset.
filtered_docset = partitioned_docset.filter_elements(filter_out_cruft)

In [None]:
## Replace [^ ]*[^\S\n\t][^ ]*[^\S\n\t]--[^\S\n\t][^ ]*
import re
from sycamore.data import Document, Element
def markSpeakers(doc: "Element") -> "Element":
    """Tag an element whose text opens a speaker turn in an earnings-call transcript.

    Three layouts are recognised, checked in this order:

    * the text is just ``Operator`` (surrounding whitespace ignored),
    * ``Name -- Organisation -- Role``  (external speaker),
    * ``Name -- Role``                  (internal speaker),

    where ``Name`` is one to four space-terminated tokens at the start of the text.

    On a match the element is mutated in place: ``speaker`` is set to True,
    ``speaker_name`` and ``speaker_role`` are filled in (plus
    ``speaker_external_org`` for external speakers) in ``doc.properties``, and
    ``doc.data["_break"]`` is set to True.

    Args:
        doc: object exposing ``text_representation``, ``properties`` and ``data``.

    Returns:
        The same ``doc`` object; unchanged when the text is empty/None or does
        not match any of the layouts.
    """
    text = doc.text_representation
    if not text:
        return doc

    # Raw strings keep the regex escapes (\S, \n, \t) away from Python's own
    # string-escape processing; [^\S\n\t] means "whitespace other than newline/tab".
    if text.strip() == 'Operator':
        fields = {
            'speaker': True,
            'speaker_role': 'Operator',
            'speaker_name': 'Operator',
        }
    elif re.match(r'([^ ]*[^\S\n\t]){1,4}--[^\S\n\t].*--', text):
        first_sep = text.find('--')
        # Absolute index of the second separator. The regex above guarantees a
        # second, non-overlapping "--" exists. Slicing up to exactly this index
        # keeps the organisation intact even when no space precedes the separator.
        second_sep = text.find('--', first_sep + 2)
        fields = {
            'speaker_name': text[:first_sep].strip(),
            'speaker_external_org': text[first_sep + 2:second_sep].strip(),
            'speaker_role': text[second_sep + 2:].strip(),
            'speaker': True,
        }
    elif re.match(r'([^ ]*[^\S\n\t]){1,4}--.*', text):
        first_sep = text.find('--')
        fields = {
            'speaker_name': text[:first_sep].strip(),
            # Everything after the separator (including later lines) is the role.
            'speaker_role': text[first_sep + 2:].strip(),
            'speaker': True,
        }
    else:
        return doc

    doc.properties.update(fields)
    # Flag consumed by the marked merge step that follows in the notebook.
    doc.data["_break"] = True
    return doc

# Run markSpeakers over the elements of the filtered docset so speaker lines
# carry their speaker properties and the "_break" flag.
speakers_marked_ds = filtered_docset.map_elements(markSpeakers)

In [None]:
from sycamore.transforms.merge_elements import MarkedMerger

# Merge elements with MarkedMerger. Presumably a new merged element starts at
# each element flagged with data["_break"] by markSpeakers, giving one element
# per speaker turn - TODO confirm against the MarkedMerger documentation.
merged_ds = speakers_marked_ds.merge(MarkedMerger())

In [None]:
# This transform moves a bunch of properties to a nested dict properties.entity. 
# Useful for making sure some downstream processing works correctly
def prop_to_entity(props: list[str]):
    """Build a mapper that relocates selected properties under ``properties["entity"]``.

    Args:
        props: names of the top-level properties to move.

    Returns:
        A function taking a document or element. It makes sure
        ``properties["entity"]`` is a dict (a pre-existing non-dict value is
        preserved under the ``"original_entity"`` key), moves every property
        named in ``props`` that is present into that dict, and returns the
        same object it was given.
    """
    def prop_to_entity_inner(doc_or_elt):
        properties = doc_or_elt.properties

        # Make sure there is an "entity" slot, and that it is a dict.
        properties.setdefault("entity", {})
        if not isinstance(properties["entity"], dict):
            previous = properties.pop("entity")
            properties["entity"] = {"original_entity": previous}

        # Move each requested property that actually exists.
        for key in props:
            if key not in properties:
                continue
            moved = properties.pop(key)
            properties["entity"][key] = moved
        return doc_or_elt

    return prop_to_entity_inner

# Reshape properties for downstream processing:
#   1. on documents, move "earnings_call" under properties["entity"],
#   2. spread the "path" and "entity" properties,
#   3. on elements, move the speaker properties under properties["entity"].
rejiggered_ds = (
    merged_ds
    .map(prop_to_entity(["earnings_call"]))
    .spread_properties(["path", "entity"])
    .map_elements(
        prop_to_entity(["speaker", "speaker_role", "speaker_name", "speaker_external_org"])
    )
)

In [None]:
# Embedding model name; passed to OpenAIEmbedder when the docset is embedded.
model_name = "text-embedding-3-small"
# Client pointed at the Aryn test endpoint, authenticated with aryn_api_key.
myClient = Client(aryn_url="https://test-api.aryn.ai", aryn_api_key=aryn_api_key)
# Create the target docset; its id is used as the write destination later on.
# NOTE(review): re-running this cell calls create_docset again with the same
# name - verify whether that creates a duplicate docset.
docset = myClient.create_docset(name=docset_name)

In [None]:
# Embed with the configured OpenAI embedding model, then write the result to
# the docset created earlier through the Aryn test storage endpoint.
(
    rejiggered_ds
    .embed(embedder=OpenAIEmbedder(model_name=model_name))
    .write.aryn(
        aryn_url="https://test-api.aryn.ai/v1/storage",
        docset_id=docset.value.docset_id,
        aryn_api_key=aryn_api_key,
        autoschema=True,
    )
)

In [None]:
# Re-fetch the docset by id after the write and rebind `docset` to the
# fresh response.
docset = myClient.get_docset(docset_id=docset.value.docset_id)

In [None]:
# Display the value of the fetched docset response.
docset.value

In [None]:
# Display the docset id.
docset.value.docset_id