In [None]:
import sycamore
import os
from pathlib import Path
from sycamore.transforms.partition import ArynPartitioner
from sycamore.transforms.extract_schema import (
    OpenAIPropertyExtractor,
)
from sycamore.llms import OpenAI, OpenAIModels
from sycamore.data.element import Element
from sycamore.data.document import Document
from sycamore.functions import HuggingFaceTokenizer, OpenAITokenizer
from sycamore.llms import OpenAI, OpenAIModels
from sycamore.transforms.embed import SentenceTransformerEmbedder
from sycamore.llms.prompts.default_prompts import LlmFilterMessagesJinjaPrompt
from sycamore.llms.prompts.prompts import JinjaPrompt
from sycamore.transforms.extract_entity import OpenAIEntityExtractor
from sycamore.query.execution.operations import summarize_data
from sycamore.transforms.summarize import MultiStepDocumentSummarizer, EtCetera, OneStepDocumentSummarizer
from sycamore.llms.llms import LLMMode
from sycamore import MaterializeSourceMode

In [None]:
# Read the Aryn API key from the environment so the secret does not have to be stored in the
# notebook; the original placeholder stays as the fallback when ARYN_API_KEY is unset or empty.
aryn_test_team_key = os.environ.get("ARYN_API_KEY") or "<REDACTED>"
# Identifier of the Aryn docset read by the cells below.
docset_id = "aryn:ds-bsjwetydnjpjmwwh6vunkno"
# Directory (relative to the working directory) used for materialized docsets.
materialize_dir = Path("materialize")

In [None]:
# Create the Sycamore context and point a reader at the docset stored behind the Aryn test API.
context = sycamore.init()
initial_docset = context.read.aryn(
    aryn_url="https://test-api.aryn.ai/v1/storage",
    docset_id=docset_id,
    aryn_api_key=aryn_test_team_key,
)

In [None]:
def remove_original_elements(doc: Document):
    """Drop the ``_original_elements`` entry from a document's properties.

    Args:
        doc: Document whose ``properties`` mapping is modified in place.

    Returns:
        The same ``doc`` object, so the function can be used with ``DocSet.map``.
    """
    # pop() with a default tolerates documents that never had the key;
    # the previous `del` raised KeyError for them and aborted the whole map.
    doc.properties.pop('_original_elements', None)
    return doc

def filter_parent_documents(doc: Document):
    """Return True when ``doc`` contains a ``"parent_id"`` key.

    Used as a ``DocSet.filter`` predicate after ``explode()``; presumably only the
    exploded child documents carry ``parent_id`` (TODO confirm against explode()).
    """
    has_parent_id = "parent_id" in doc
    return has_parent_id
    

# Element-level docset: strip `_original_elements`, materialize under materialize_dir/"read-back"
# with USE_STORED, spread the `earnings_calls` property, explode, and keep only the documents
# that contain a `parent_id` key.
exploded_docset2 = (
    initial_docset
    .map(remove_original_elements)
    .materialize(path=materialize_dir / "read-back", source_mode=MaterializeSourceMode.USE_STORED)
    .spread_properties(["earnings_calls"])
    .explode()
    .filter(filter_parent_documents)
)

# Document-level docset: same cleanup and property spreading, without materialize/explode.
removed_orig_docset = (
    initial_docset
    .map(remove_original_elements)
    .spread_properties(["earnings_calls"])
)

In [None]:
# Sanity check: number of documents in the exploded docset (this executes the pipeline).
exploded_docset2.count()

In [None]:


def writeout(d):
    """Append a document's text to ``alltext.txt`` and return the document unchanged.

    Intended for use with ``DocSet.map`` as a debugging side effect. The file is opened
    in append mode, so repeated runs keep adding to it.

    Args:
        d: Object exposing a ``text_representation`` attribute (a string or None).

    Returns:
        The same object ``d``.
    """
    # `with` guarantees the handle is closed even if a write fails; explicit UTF-8 avoids
    # locale-dependent encoding errors on non-ASCII transcript text.
    with open("alltext.txt", "a", encoding="utf-8") as f:
        # A missing text_representation is written as an empty line instead of raising TypeError.
        f.write(d.text_representation or "")
        f.write('\n')
    return d

# Dump every element's text via writeout, then count the elements whose text mentions
# "inflation" (case-insensitive). Elements without text are treated as empty strings so a
# None text_representation no longer raises AttributeError inside the filter.
(
    exploded_docset2
    .map(writeout)
    .filter(lambda x: 'inflation' in (x.text_representation or '').lower())
    .count()
)

In [None]:
from sycamore.llms import OpenAI, OpenAIModels
# OpenAI client (GPT-4o) used by the LLM-backed transforms in the cells below.
oai = OpenAI(OpenAIModels.GPT_4O)

In [None]:
## Cell to answer the question: tell me the number of customers MongoDB had at the end of Q1
# Extractor that asks the LLM for a "num_customers" entity using the documents' elements.
entity_extractor = OpenAIEntityExtractor(
    entity_name="num_customers",
    llm=oai,
    num_of_elements=10,
    field="text_representation",
    use_elements=True,
)

# Restrict to MongoDB (ticker MDB) Q1 calls before running the extraction.
mdb_docset = (
    removed_orig_docset
    .filter(
        lambda doc: doc.properties['earnings_calls']['company_ticker'] == 'MDB'
        and doc.properties['earnings_calls']['quarter'] == 'Q1'
    )
    .extract_entity(entity_extractor)
)

In [None]:
# Display the extracted "num_customers" property of the first matching document
# (.get yields None when the property is absent).
mdb_docset.take(1)[0].properties.get("num_customers")

In [None]:
## Cell to answer the question: tell me the first earnings call where the VMware acquisition was mentioned
# DateTimeStandardizer is imported here because this cell uses it before any other cell
# imports it; without this the cell raises NameError on a fresh kernel (Restart & Run All).
from sycamore.transforms import DateTimeStandardizer

# Using llm filter
avgo_docset = (
    removed_orig_docset
    # Broadcom (ticker AVGO) calls only.
    .filter(lambda doc: doc.properties['earnings_calls']['company_ticker'] == 'AVGO')
    .llm_filter(
        llm=oai,
        new_field="vmware_mentioned_confidence",
        prompt=LlmFilterMessagesJinjaPrompt.fork(filter_question="Does this text mention VMWare?"),
        llm_mode=LLMMode.ASYNC,
        keep_none=True)
    # Standardize the call date so the docset can be sorted on it; the sort key and the
    # 'day' lookup below presumably rely on fields that standardize() adds (TODO confirm).
    .map(lambda doc: DateTimeStandardizer.standardize(doc, key_path=["properties", "earnings_calls", "date"]))
    .sort(descending=False, field="properties.earnings_calls.dateTime")
)
# Earliest matching call after the ascending sort.
avgo_docset.take(1)[0].properties['earnings_calls']['day']

In [None]:
## Cell to answer the question: tell me the first earnings call where the VMware acquisition was mentioned
from sycamore.transforms import DateTimeStandardizer

# Using plain filter
def filterVMware(elem: Element) -> bool:
    """Return True when the element's text contains the exact substring ``"VMware"``.

    The match is case-sensitive. Elements whose ``text_representation`` is None are
    rejected instead of raising ``TypeError`` on the ``in`` test.
    """
    text = elem.text_representation
    return text is not None and "VMware" in text

vwmare_docset_sorted = (
    removed_orig_docset
    # Broadcom (ticker AVGO) calls only.
    .filter(lambda doc: doc.properties['earnings_calls']['company_ticker'] == 'AVGO')
    # Keep only the elements that mention VMware ...
    .filter_elements(filterVMware)
    # ... and drop calls left with no elements. filter_elements prunes elements but keeps
    # the documents themselves, so without this step the earliest AVGO call would be
    # returned even if it never mentions VMware.
    .filter(lambda doc: len(doc.elements) > 0)
    # Standardize the call date so the docset can be sorted chronologically.
    .map(lambda doc: DateTimeStandardizer.standardize(doc, key_path=["properties", "earnings_calls", "date"]))
    .sort(descending=False, field="properties.earnings_calls.dateTime")
)
# Earliest call (ascending sort) that still has at least one VMware-mentioning element.
vwmare_docset_sorted.take(1)[0].properties['earnings_calls']['day']

In [None]:
## Cell to answer the question: summarize the impact of the VMware acquisition on Broadcom's earnings
# Per-call prompt: the template renders every element's text of one document after the question.
myprompt = JinjaPrompt(
    system="You are a helpful financial assistant.",
    user="""Here's an earnings call. Please answer the question {{ question }}
            {% for elt in doc.elements %}
            {{ elt.text_representation }}
            {% endfor %}""",
    question="Summarize the impact of the vmware acquisition on broadcom's earnings",
)

# Broadcom calls, reduced to their VMware-mentioning elements, each summarized by the LLM
# into the "acquisition_impact_summary" output field.
vmware_acquistion_summary = (
    removed_orig_docset
    .filter(lambda doc: doc.properties['earnings_calls']['company_ticker'] == 'AVGO')
    .filter_elements(filterVMware)
    .llm_map(
        prompt=myprompt,
        output_field="acquisition_impact_summary",
        llm=oai,
        llm_mode=LLMMode.ASYNC,
    )
)


In [None]:
# Combine the per-call VMware summaries into one answer across quarters.
question = "Summarize the impact of the vmware acquisition on broadcom's earnings. Give a 2 line summary for each quarter."
data_desc = "Broadcom earnings calls"

oai = OpenAI(OpenAIModels.GPT_4O)
oaitk = OpenAITokenizer(OpenAIModels.GPT_4O.value.name, max_tokens=100_000)
summarizer = MultiStepDocumentSummarizer(
    llm=oai,
    llm_mode=LLMMode.ASYNC,
    question=question,
    data_description=data_desc,
    # llm_map stores its output under doc.properties, so the field needs the dotted
    # "properties." path; the bare name does not resolve to the per-call summaries.
    fields=['properties.acquisition_impact_summary'],
    tokenizer=oaitk
)
summary = summarize_data(
    llm=oai,
    question=question,
    data_description=data_desc,
    input_data=[vmware_acquistion_summary],
    docset_summarizer=summarizer
)
print(summary)

In [None]:
## Cell to answer the question: tell me how Intuit is integrating Intuit Assist (their new AI offering) into existing products
# Per-call prompt: the user template loops over doc.elements and renders each element's
# text_representation after the question supplied via the `question` keyword.
intuit_prompt = JinjaPrompt(
    system="You are a helpful financial assistant.",
    user="""Here's an earnings call. Please answer the question {{ question }}
            {% for elt in doc.elements %}
            {{ elt.text_representation }}
            {% endfor %}""",
    question="Summarize how Intuit Assist is being integrated into Intuit's existing products"
)

def filterForAssist(elem: Element) -> bool:
    """Return True when the element's text contains the exact substring ``"Assist"``.

    The match is case-sensitive. Elements whose ``text_representation`` is None are
    rejected instead of raising ``TypeError`` on the ``in`` test.
    """
    text = elem.text_representation
    return text is not None and "Assist" in text

# Intuit (ticker INTU) calls, reduced to their "Assist"-mentioning elements, each summarized
# by the LLM into the "intuit_assist_summary" output field.
intuit_assist_summary = (
    removed_orig_docset
    .filter(lambda doc: doc.properties['earnings_calls']['company_ticker'] == 'INTU')
    .filter_elements(filterForAssist)
    .llm_map(
        prompt=intuit_prompt,
        output_field="intuit_assist_summary",
        llm=oai,
        llm_mode=LLMMode.ASYNC,
    )
)

In [None]:
# Combine the per-call Intuit Assist summaries into one answer across quarters.
question = "Summarize how Intuit is integrating its AI assistant, Intuit Assist into its existing products. Give a 3 line summary for each quarter on the progress of the integration."
data_desc = "Intuit Assist integration"

oai = OpenAI(OpenAIModels.GPT_4O)
oaitk = OpenAITokenizer(OpenAIModels.GPT_4O.value.name, max_tokens=100_000)
summarizer = MultiStepDocumentSummarizer(
    llm=oai,
    llm_mode=LLMMode.ASYNC,
    question=question,
    data_description=data_desc,
    # llm_map stores its output under doc.properties, so the field needs the dotted
    # "properties." path; the bare name does not resolve to the per-call summaries.
    fields=['properties.intuit_assist_summary'],
    tokenizer=oaitk
)
summary = summarize_data(
    llm=oai,
    question=question,
    data_description=data_desc,
    input_data=[intuit_assist_summary],
    docset_summarizer=summarizer
)
print(summary)

In [None]:
## Cell to answer the question 'Return all the companies that mentioned inflation and return a count of the number of times inflation was mentioned'

# Using a plain filter for elements containing the word 'inflation' (case-insensitive).
# The count per company is the number of matching elements, not the number of occurrences
# of the word. Elements without text are treated as empty strings so a None
# text_representation no longer raises AttributeError inside the filter.
inflation_mentioned = (
    exploded_docset2
    .filter(lambda element: 'inflation' in (element.text_representation or '').lower())
    .groupby_count(field='properties.earnings_calls.company_name')
)
inflation_mentioned.show()

In [None]:
## Answering the same question with an LLM filter. This is super slow. TODO: figure out why.
inflation_filter_prompt = LlmFilterMessagesJinjaPrompt.fork(
    filter_question="Does this text mention inflation?",
    use_elements=False,
)
im = (
    exploded_docset2
    .llm_filter(
        llm=oai,
        new_field="inflation_mentioned_confidence",
        prompt=inflation_filter_prompt,
        llm_mode=LLMMode.ASYNC,
    )
    .groupby_count(field='properties.earnings_calls.company_name')
)

# One (count, key) pair per group, printed as a tab-separated table.
counts = [(d.properties['count'], d.properties['key']) for d in im.take_all()]
for c, k in counts:
    print(f"{c}\t|\t{k}")


In [None]:
## Cell to answer the question: summarize all the mergers and acquisitions that were mentioned
from sycamore.transforms import DateTimeStandardizer


# Per-call prompt asking for the impact of mergers and acquisitions on earnings.
# NOTE: this rebinds `myprompt`, which the VMware summary cell above also defines, so the
# value seen by later cells depends on execution order.
myprompt = JinjaPrompt(
    system="You are a helpful financial assistant.",
    user="""Here's an earnings calls report. Where applicable, please {{ question }}
            {% for elt in doc.elements %}
            {{ elt.text_representation }}
            {% endfor %}""",
    question="summarize the impact of mergers and acquisitions on earnings. If acquisitons are not discussed, leave the field empty."
)

# Using plain filter
def filterAcquisitionOrMerger(elem: Element) -> bool:
    """Return True when the element's text mentions "acquisition" or "merger".

    The match is case-insensitive. Elements whose ``text_representation`` is None are
    rejected instead of raising ``AttributeError`` on ``.lower()``.
    """
    text = elem.text_representation
    if text is None:
        return False
    # Lowercase once and test both keywords against the same string.
    lowered = text.lower()
    return "acquisition" in lowered or "merger" in lowered

# Step 1: keep only the elements that mention an acquisition or merger.
_ma_elements_docset = removed_orig_docset.filter_elements(filterAcquisitionOrMerger)

# Step 2: have the LLM summarize each call into the "acquisition_impact_summary" output field.
_ma_summarized_docset = _ma_elements_docset.llm_map(
    prompt=myprompt,
    output_field="acquisition_impact_summary",
    llm=oai,
    llm_mode=LLMMode.ASYNC,
)

# Step 3: LLM-filter on the generated summary (read from properties.acquisition_impact_summary).
acquistion_summary = _ma_summarized_docset.llm_filter(
    llm=oai,
    new_field="acquisition_mentioned_confidence",
    prompt=LlmFilterMessagesJinjaPrompt.fork(filter_question="Are there specific financial details mentioned about how an acquisition impacted earnings?"),
    llm_mode=LLMMode.ASYNC,
    field="properties.acquisition_impact_summary",
    keep_none=True,
)



In [None]:
# Combine the per-call merger/acquisition summaries into one answer across companies.
question = "For each of the companies mentioned please summarize the impact of mergers and acquisitions on earnings. Give a quarter by quarter summary."
data_desc = "Acquisition Earnings"

# Fresh LLM client and a tokenizer capped at 100k tokens for the summarizer.
oai = OpenAI(OpenAIModels.GPT_4O)
oaitk = OpenAITokenizer(OpenAIModels.GPT_4O.value.name, max_tokens=100_000)

summarizer = MultiStepDocumentSummarizer(
    llm=oai,
    llm_mode=LLMMode.ASYNC,
    question=question,
    data_description=data_desc,
    fields=["properties.acquisition_impact_summary"],
    tokenizer=oaitk,
)

summary = summarize_data(
    llm=oai,
    question=question,
    data_description=data_desc,
    input_data=[acquistion_summary],
    docset_summarizer=summarizer,
)
print(summary)