In [2]:
import sycamore
import os
from sycamore.transforms.partition import ArynPartitioner
from sycamore.transforms.extract_schema import (
    OpenAIPropertyExtractor,
)
from sycamore.llms import OpenAI, OpenAIModels
from sycamore.data.element import Element
from sycamore.data.document import Document
from sycamore.functions import HuggingFaceTokenizer, OpenAITokenizer
from sycamore.llms import OpenAI, OpenAIModels
from sycamore.transforms.embed import SentenceTransformerEmbedder

In [3]:
# Create the Sycamore context that the reads and transforms below run against.
context = sycamore.init()
# Local path to the input PDF file(s); replace the placeholder before running.
# NOTE(review): this comment previously said "SortBenchmark dataset", but the schema
# defined later in this notebook describes earnings calls — confirm the intended dataset.
paths = "<PUT_YOUR_PATH_HERE>"
# Read the inputs as binary documents, declaring their format as PDF.
initial_docset = context.read.binary(paths, binary_format="pdf")
# Alternative source (disabled): read an existing Aryn DocSet instead of local files.
#initial_docset = context.read.aryn(aryn_url="https://test-api.aryn.ai", docset_id="aryn:f-trcw7rui6kg2t9os03owzjf")

In [4]:
## Set your API keys here (for example via os.environ) before running the cells below.

In [5]:
# Build the tokenizer and the LLM client from the same GPT-4o model definition.
# `tokenizer` is passed to split_elements and `llm` to the property extractor later on.
tokenizer = OpenAITokenizer(OpenAIModels.GPT_4O.value.name)
llm = OpenAI(OpenAIModels.GPT_4O.value)

In [6]:
# JSON-schema-style description of the properties to extract from each document.
# Every field is a plain string; the descriptions are passed along with the schema.
schema_json = dict(
    type="object",
    properties=dict(
        company_name=dict(type="string", description="name of the  company"),
        company_ticker=dict(type="string", description="Ticker of the  company"),
        quarter=dict(
            type="string",
            description="Quarter of the earnings call, it should be in the format of Q1, Q2, Q3, Q4",
        ),
        date=dict(type="string", description="The date of the earnings call"),
    ),
)

In [7]:
def add_property_to_schema(doc: Document) -> Document:
    """Attach the earnings-call extraction schema to a document.

    Writes two entries into ``doc.properties`` (in place): ``_schema`` holding the
    schema dictionary and ``_schema_class`` holding ``"earnings_call"``.

    Returns:
        The same ``doc`` object that was passed in.
    """
    # Field name -> human-readable description; every field is typed as a string.
    field_descriptions = {
        "company_name": "name of the  company",
        "company_ticker": "Ticker of the  company",
        "quarter": "Quarter of the earnings call, it should be in the format of Q1, Q2, Q3, Q4",
        "date": "The date of the earnings call",
    }
    schema_json = {
        "type": "object",
        "properties": {
            field: {"type": "string", "description": text}
            for field, text in field_descriptions.items()
        },
    }

    doc.properties.update(_schema=schema_json, _schema_class="earnings_call")
    return doc

In [17]:
from sycamore import MaterializeSourceMode

# Root directory for materialized (cached) pipeline stages. It is read from the
# MATERIALIZE_DIR environment variable so the notebook is not tied to one user's
# home directory; the default is a "materialize" folder relative to the working directory.
# To reuse a cache created at another location, point MATERIALIZE_DIR at that folder.
MATERIALIZE_DIR = os.environ.get("MATERIALIZE_DIR", "materialize")

partitioned_docset = (
    # Partition each PDF into elements with the Aryn partitioner.
    initial_docset.partition(partitioner=ArynPartitioner())
    # Materialize the partitioned output; USE_STORED is intended to let re-runs read the
    # stored result instead of re-partitioning (see the Sycamore materialize docs).
    .materialize(
        path=os.path.join(MATERIALIZE_DIR, "partitioned_docset"),
        source_mode=MaterializeSourceMode.USE_STORED,
    )
    # Split elements using a 512-token limit measured with `tokenizer`.
    .split_elements(tokenizer=tokenizer, max_tokens=512)
    # Ask the LLM to extract the fields described by `schema_json`.
    .extract_properties(
        property_extractor=OpenAIPropertyExtractor(llm=llm, schema=schema_json, schema_name="earnings_calls")
    )
)

In [18]:

def filterInSpeakers(elem: Element) -> bool:
  """Return False for image, table, page-footer and footnote elements; True for all others."""
  excluded_types = ('Image', 'table', 'Page-footer', 'Footnote')
  return elem.type not in excluded_types

def removeOriginalElements(doc: Document) -> Document:
    """Remove the '_original_elements' entry from ``doc.properties`` if it exists.

    The document is modified in place and returned.
    """
    # pop with a default is a no-op when the key is absent.
    doc.properties.pop('_original_elements', None)
    return doc


# Keep only the elements accepted by filterInSpeakers.
filtered_Docset = partitioned_docset.filter_elements(filterInSpeakers)
# Then strip the '_original_elements' property from every document.
docset_no_orig_elements = filtered_Docset.map(removeOriginalElements)

In [19]:
## Replace [^ ]*[^\S\n\t][^ ]*[^\S\n\t]--[^\S\n\t][^ ]*
import re
from sycamore.data import Document
def markSpeakers(doc:Element) -> Element:
  """Tag an element as a speaker header and record who is speaking.

  Recognised forms of the element text:
    * ``Operator``                            - the call operator
    * ``First Last -- Organization -- Role``  - external speaker
    * ``First Last -- Role``                  - internal speaker

  On a match, ``speaker`` (True), ``speaker_name`` and ``speaker_role`` are written
  to ``doc.properties``; external speakers also get ``speaker_external_org``.
  All extracted values are stripped of surrounding whitespace.

  Elements whose text is missing, None or empty, or matches none of the forms,
  are returned unchanged. The element is modified in place and returned.
  """
  if 'text_representation' not in doc:
    return doc

  text = doc.data['text_representation']
  if not text:
    # Key present but None/empty: nothing to parse (re.match would raise on None).
    return doc

  # Raw strings so that \S, \n and \t are regex escapes, not (invalid) string escapes.
  external_speaker = re.match(r'[^ ]*[^\S\n\t][^ ]*[^\S\n\t]--[^\S\n\t].*--', text)
  internal_speaker = re.match(r'[^ ]*[^\S\n\t][^ ]*[^\S\n\t]--.*', text)
  doc_properties = doc.properties
  if text == 'Operator':
    doc_properties['speaker'] = True
    doc_properties['speaker_role'] = 'Operator'
    doc_properties['speaker_name'] = 'Operator'
  elif external_speaker:
    # The pattern guarantees two '--' separators, so this always yields three parts.
    # Splitting (instead of index arithmetic) keeps the organisation intact even when
    # there is no space before the second '--'.
    name, organization, role = (part.strip() for part in text.split('--', 2))
    doc_properties['speaker_name'] = name
    doc_properties['speaker_external_org'] = organization
    doc_properties['speaker_role'] = role
    doc_properties['speaker'] = True
  elif internal_speaker:
    # The pattern guarantees one '--' separator: name before it, role after it.
    name, role = (part.strip() for part in text.split('--', 1))
    doc_properties['speaker_name'] = name
    doc_properties['speaker_role'] = role
    doc_properties['speaker'] = True

  return doc

# Debug helper (disabled): preview the result of tagging speakers on the filtered DocSet.
#filtered_Docset.map_elements(markSpeakers).show()
# Apply markSpeakers to every element of every document.
speakersMarkedDocSet = docset_no_orig_elements.map_elements(markSpeakers)

In [20]:
def mergeDialogue(doc: Document) -> Document:
  """Fold the text that follows each speaker element into that speaker element.

  Walks ``doc.elements`` in order. Each element carrying a ``speaker`` property starts
  a new turn; the text of the non-speaker elements that follow is concatenated (with
  no separator added) and stored as the speaker element's ``text_representation``,
  replacing its original text. Elements before the first speaker are ignored, and
  the non-speaker elements themselves are left untouched.

  The final speaker's turn is written back after the loop, so the last speaker in
  a document gets its dialogue as well.

  Returns:
      The same ``doc``, with speaker elements modified in place.
  """
  last_speaker_elem = None
  dialogue_parts = []
  for elem in doc.elements:
    has_speaker_key = 'speaker' in elem.properties
    if has_speaker_key and last_speaker_elem is not None:
      # A new speaker begins: write the finished turn back to the previous speaker.
      last_speaker_elem.text_representation = ''.join(dialogue_parts)
      last_speaker_elem = elem
      dialogue_parts = []
    elif has_speaker_key and elem.properties['speaker']:
      # First speaker in the document.
      last_speaker_elem = elem
      dialogue_parts = []
    elif last_speaker_elem is not None and elem.text_representation:
      dialogue_parts.append(elem.text_representation)

  # Flush the final turn; no later speaker element exists to trigger the write above.
  if last_speaker_elem is not None:
    last_speaker_elem.text_representation = ''.join(dialogue_parts)
  return doc


# Apply mergeDialogue to each document of the speaker-tagged DocSet.
mergedDialogeSet = speakersMarkedDocSet.map(mergeDialogue)

In [21]:
def filterOnlySpeakers(elem: Element):
    """Return True when the element's properties contain a 'speaker' key."""
    element_properties = elem.properties
    is_speaker_element = 'speaker' in element_properties
    return is_speaker_element

# Drop every element that does not carry a 'speaker' property.
finalDocSet = mergedDialogeSet.filter_elements(filterOnlySpeakers)
# Debug helper (disabled): preview the final DocSet.
#finalDocSet.show()

In [22]:
from sycamore.transforms.embed import OpenAIEmbedder
from aryn_sdk.client.client import Client

# OpenAI embedding model name, used when embedding the final DocSet.
model_name = "text-embedding-3-small"

# The Aryn API key is read from the ARYN_API_KEY environment variable (raises KeyError
# if unset). Never commit credentials in a notebook.
# NOTE: a key was previously hardcoded in this cell; treat it as leaked and revoke/rotate it.
myClient = Client(aryn_url="https://test-api.aryn.ai", aryn_api_key=os.environ["ARYN_API_KEY"])
# Create the target DocSet that will receive the processed documents.
docset = myClient.create_docset(name="haystack_workshop_target_correct")

In [None]:
# Embed the final DocSet with OpenAI and write it to the Aryn DocSet referenced by `docset`.
# The API key comes from the ARYN_API_KEY environment variable rather than being
# hardcoded; a key that was previously committed here should be revoked/rotated.
finalDocSet.embed(embedder=OpenAIEmbedder(model_name=model_name)).write.aryn(
    aryn_url="https://test-api.aryn.ai/v1/storage",
    docset_id=docset.value.docset_id,
    aryn_api_key=os.environ["ARYN_API_KEY"],
)

  finalDocSet.embed(embedder=OpenAIEmbedder(model_name=model_name)).write.aryn(aryn_url="https://test-api.aryn.ai/v1/storage", docset_id=docset.value.docset_id, aryn_api_key="<REDACTED>")
  writer: Node = ArynWriter(self.plan, client_params=client_params, target_params=target_params, **kwargs)
  from .autonotebook import tqdm as notebook_tqdm
2025-03-26 16:46:26,934	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
2025-03-26 16:46:27,907	INFO worker.py:1832 -- Started a local Ray instance. View the dashboard at http://127.0.0.1:8265
2025-03-26 16:46:28,630	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the

In [None]:
# Re-fetch the DocSet by its id; this rebinds `docset` to the new response object.
docset = myClient.get_docset(docset_id=docset.value.docset_id)

In [None]:
# Display the payload of the fetched DocSet response as the cell output.
docset.value