In [None]:
# First, let's explore the data. By now you should have downloaded some pdf files, so let's look at one of them.
# This also tests that you have poppler installed.
from error_messages import *
from IPython.display import display
from pathlib import Path
import os

# Locate the sample transcript relative to the directory the notebook is running from.
repo_root = Path.cwd()
pdf_dir = repo_root.joinpath("files", "earnings_calls")
one_pdf = pdf_dir.joinpath("broadcom-avgo-q1-2024-earnings-call-transcript.pdf")
# Bare last expression so the notebook echoes the resolved path.
one_pdf

In [None]:
from pdf2image import convert_from_path
## Print the first two pages of the Broadcom earnings call.
try:
    # Only rasterize the pages we actually display; without a page range
    # pdf2image renders every page of the transcript before we index into it.
    ims = convert_from_path(one_pdf, first_page=1, last_page=2)
    # Loop instead of indexing ims[0]/ims[1] so a one-page PDF does not raise IndexError.
    for im in ims:
        display(im)
except Exception as e:
    # This cell doubles as the poppler check: call the workshop's poppler error
    # helper, then print the underlying exception so the real cause stays visible.
    poppler_failed_error()
    print(e)

You'll notice that this earnings call is a transcript of a conversation between several different people. This document specifically focuses on Broadcom's earnings in Q1 of 2024. In this particular quarter, Broadcom's VMware acquisition is a hot topic, and analysts are asking the CEO (Hock Tan) and the CFO (Kirsten Spears) about Broadcom's strategy behind the VMware acquisition. The rest of this notebook will walk you through how to use Aryn to programmatically answer the following question:

1. What details did the CFO, Kirsten Spears, discuss about the VMware acquisition?

## Partitioning

As you probably noticed, the PDF is just a bunch of unstructured text that's not easy to process. If we want to answer this question (and many others), the first step is to turn this document into a more structured format. Here, we'll use Aryn DocParse which breaks a document up into cohesive _elements_. There are a number of strategies for doing this, but we've found success with visual models. For this workshop we'll use Aryn DocParse for this first step. 

In [None]:
# Get started with aryn_sdk. 
# This will also make sure your credentials are set correctly.
from aryn_sdk.partition import partition_file

try:
    # Send the transcript to DocParse; keep both the full response and its element list.
    data = partition_file(one_pdf)
    elements = data["elements"]
except Exception as err:
    # Call the workshop's API-key error helper first, then print the underlying error.
    aryn_no_api_key_error()
    print(err)

In [None]:
# We can visualize the elements by drawing the bounding boxes onto the pdf. aryn_sdk has a function for that.
from aryn_sdk.partition import draw_with_boxes

# draw_with_boxes returns an indexable collection of images (presumably one per page);
# keep all of them for later cells and show the first one here.
graffitied_pages = draw_with_boxes(one_pdf, data)
first_page_with_boxes = graffitied_pages[0]
display(first_page_with_boxes)

Here, we've printed page 1 of the Broadcom earnings call. If you scroll through, you'll notice several bounding boxes that denote the elements that DocParse detected. Each element contains a bunch of information. Core information includes `type`, `bbox`, and `text_representation`. Additional information is stored in a `properties` dict, such as the page number the element is on. Let's look at the JSON representation of the first element that DocParse detected. 

In [None]:
# Dump the first element DocParse detected so we can see which fields an element carries.
first_element = elements[0]
print(first_element)

You'll notice that DocParse detected an image at the top of the page and it returned some information about that element such as its bounding box etc. Let's have a quick quiz to introduce elements. I've created a bunch of functions that operate on the list of elements returned by the partitioner. Your job is to implement them.

In [None]:
def number_of_page_headers(elts: list[dict]) -> int:
    """Return the number of elements of type 'Page-header'.

    Exercise stub: replace the ``raise`` below with your own implementation.

    Args:
        elts: the list of element dicts returned by the partitioner.

    Returns:
        How many elements have the type 'Page-header'. Capitalization matters
        (the check cell below hints at this), so match the string exactly.
    """

    raise NotImplementedError("Finish this yourself")
        
    
def number_of_elements_after_page_4(elts: list[dict]) -> int:
    """Return the number of elements that fall after page 4.

    Exercise stub: replace the ``raise`` below with your own implementation.

    Page numbers are 1-indexed and 'after page 4' does not include page 4 itself.
    The page number is stored in each element's 'properties' dict; print an
    element to confirm the exact key name before relying on it.

    Args:
        elts: the list of element dicts returned by the partitioner.

    Returns:
        The count of elements located on pages after page 4.
    """
    
    raise NotImplementedError("Finish this yourself")

def number_of_vmware_mentions(elts: list[dict]) -> int:
    """Return the number of elements whose text mentions 'vmware'.

    Exercise stub: replace the ``raise`` below with your own implementation.

    The match is case insensitive, so count 'VMware' and 'vmware' alike.
    Note: some elements do not have a 'text_representation' key, so guard
    against a missing key before inspecting the text.

    Args:
        elts: the list of element dicts returned by the partitioner.

    Returns:
        The number of elements (not the number of occurrences) that mention 'vmware'.
    """
    
    raise NotImplementedError("Finish this yourself")
    

def number_of_elements_that_cover_a_third_of_the_page(elts: list[dict]) -> int:
    """Return the number of elements whose bounding box covers more than a third of the page.

    Exercise stub: replace the ``raise`` below with your own implementation.

    For this you'll need the bbox property. bboxes are represented as 4 floats, [x1, y1, x2, y2]. Each
    coordinate ranges from 0 to 1, representing the fraction of the page (left-to-right for x, top-to-bottom for y)
    where the point lies. So [0, 0, 1, 1] is the whole page, and [0, 0.5, 0.5, 1] is the lower-left quadrant.

    An element covers a third of the page if its area, (x2 - x1) * (y2 - y1), is greater than 1/3.

    Args:
        elts: the list of element dicts returned by the partitioner.

    Returns:
        The count of elements whose bbox area is greater than 1/3.
    """

    raise NotImplementedError("Finish this yourself")


# Check the quiz answers. Each answer is computed once and reused in the failure
# message, so a failing check reports exactly the value that was compared.
page_header_count = number_of_page_headers(elements)
assert page_header_count == 2, f"Got {page_header_count}. Make sure your capitalization is correct."

elements_after_page_4 = number_of_elements_after_page_4(elements)
assert elements_after_page_4 == 232, f"Got {elements_after_page_4}. If you got 241, 'after page 4' does not include page 4, and page numbers are 1-indexed."

# The hint now matches the function's contract: the match is case insensitive.
vmware_mention_count = number_of_vmware_mentions(elements)
assert vmware_mention_count == 24, f"Got {vmware_mention_count}. A 'vmware mention' is defined as an element whose text contains the string 'vmware', ignoring case (so 'VMware' counts too)."

third_of_page_count = number_of_elements_that_cover_a_third_of_the_page(elements)
assert third_of_page_count == 1, f"Got {third_of_page_count}"

## Sycamore basics

By now you have a basic sense of the data model - a Document is made up of Elements which represent logical chunks of the Document, and contain additional metadata about themselves.
The next step is to scale this past one document to many, and this is where Sycamore comes in. Sycamore adds a data structure called a DocSet, which is a set of Documents.
Each Document in the DocSet contains the list of Elements that it comprises, and a bunch of metadata as well (for instance, the name of the file the document came from).

Now let's use Sycamore to read the Broadcom earnings call into a  DocSet.

In [None]:
import sycamore
# Create a Sycamore context and read our single PDF into a DocSet as raw binary.
context = sycamore.init()
pdf_path_str = str(one_pdf)
pdf_docset = context.read.binary(paths=pdf_path_str, binary_format="pdf")

# Inspect what the reader produced.
pdf_docset.show()

Our docset has a single Document in it, with a 'properties' dict containing some metadata, an 'elements' list that is currently empty, a doc_id, lineage_id, type, and binary_representation, which contains the binary of the original PDF. Now let's use Sycamore to send the PDF to DocParse and get the elements as we did before. To do so, we'll run the `partition` transform.

In [None]:
from sycamore.transforms.partition import ArynPartitioner

# If the earlier cell did not report an API-key problem, you can ignore this note.
# Otherwise, pass aryn_api_key="<YOUR KEY>" to ArynPartitioner in case the environment didn't pick it up.
partitioner = ArynPartitioner()
partitioned_docset = pdf_docset.partition(partitioner)

# Cap the number of elements shown; the full listing makes for an obnoxiously large output cell.
partitioned_docset.show(num_elements=3)

In [None]:
#Let's view the bounding boxes drawn on the pdf using sycamore
from sycamore.utils.pdf_utils import show_pages

# Display the pages of the partitioned DocSet using Sycamore's show_pages helper.
show_pages(partitioned_docset)

Now, to answer our questions about the discussion in the earnings calls, you'll notice that only certain elements actually matter. In fact, outside of the `Section-header` and `Text` elements, we can just filter out all the other elements. What you may have noticed is that we're now just applying a set of successive transforms to our DocSet after loading it in. First we partitioned it, and now we're filtering some elements. In fact, the recommended way to work with Sycamore is to write a pipeline of transforms on a DocSet as you process it. So now let's apply a `filter_elements` transform on our partitioned_docset to remove all the unnecessary elements.

In [None]:
from sycamore.data import Element

def filter_in_discussions(elt: Element) -> bool:
    """Tell whether an element belongs to the spoken discussion.

    Args:
        elt: the element to test; only its ``type`` attribute is read.

    Returns:
        True when the element's type is 'Section-header' or 'Text', False otherwise.
    """
    kind = elt.type
    return kind == "Section-header" or kind == "Text"

# docset.filter_elements takes a predicate function that maps Elements to bools.
# For each element in a document, the element is kept only if predicate(element) is True,
# so filtered_ds retains just the 'Section-header' and 'Text' elements.
filtered_ds = partitioned_docset.filter_elements(filter_in_discussions)

### Chunking
Great! Now we've introduced the data model of Sycamore and discussed how to use it to call DocParse to break up a complex document into its separate components. Let's now go back to the particular question we were trying to answer: "What details did the CFO, Kirsten Spears, discuss about the VMware acquisition?" If you inspect page 4, you'll notice that in the last element that DocParse detected, Spears actually mentions the answer.

In [None]:
display(graffitied_pages[3])

Now, for our question answering system to be able to detect that this is the element where Kirsten Spears discusses the VMware acquisition, we'll need a way to associate the "speaker element" that is a few paragraphs above it with this last element. The way to do that is through chunking. Sycamore implements a number of chunking strategies (documentation [here](https://sycamore.readthedocs.io/en/stable/sycamore/APIs/low_level_transforms/merge_elements.html)).
For this workshop we will use the `MarkedMerger` as it is the most customizable.

As mentioned before, to be able to answer questions like the one about Kirsten Spears, we'll chunk so that each speaker 'block' becomes a single chunk. In our partitioning we have split the text into paragraphs, but we'd like
to squish all those paragraphs together, breaking the blocks wherever there's a new speaker. With a little bit of effort we can detect the lines that introduce speakers with regexes - one for external
speakers and one for internal speakers, as the formatting is very consistent (this applies across all the documents in the dataset, don't worry):

```python
external_re = '([^ ]*[^\S\n\t]){1,4}--[^\S\n\t].*--' # A name (1-4 words long) followed by -- followed by anything followed by --
internal_re = '([^ ]*[^\S\n\t]){1,4}--.*'            # A name (1-4 words long) followed by -- followed by anything
```

The `MarkedMerger` is set up perfectly to work with this. It will step through the elements, merging them together one by one, unless it sees one of two 'marks' in the data:

- on a "_drop" mark it drops the element and continues merging
- on a "_break" mark it finalizes the merged element and uses this one to start merging up a new element

In [None]:
import re
from sycamore.transforms.merge_elements import MarkedMerger


def markSpeakers(elt: Element) -> Element:
    """Annotate elements that introduce a speaker so MarkedMerger breaks a new chunk there.

    Three kinds of speaker lines are recognised, checked in this order:

    * the bare line ``Operator``;
    * an external speaker, ``<name> -- <organisation> -- <role>``;
    * an internal speaker, ``<name> -- <role>``.

    For a recognised line the element gets ``properties['speaker_name']``,
    ``properties['speaker_role']``, ``properties['speaker'] = True`` and
    ``data['_break'] = True``; external speakers also get
    ``properties['speaker_external_org']``. Any other element, including one
    with empty or missing text, is returned untouched.

    Args:
        elt: the element to inspect; it is modified in place.

    Returns:
        The same element object that was passed in.
    """
    text = elt.text_representation
    if not text:
        return elt

    # Raw strings keep `\S`, `\n` and `\t` as regex escapes rather than (invalid) Python
    # string escapes; the compiled patterns are the same as before.
    # A name (1-4 words long) followed by -- followed by anything followed by --
    external_speaker = re.match(r'([^ ]*[^\S\n\t]){1,4}--[^\S\n\t].*--', text)
    # A name (1-4 words long) followed by -- followed by anything
    internal_speaker = re.match(r'([^ ]*[^\S\n\t]){1,4}--.*', text)

    if text.strip() == 'Operator':
        elt.properties['speaker_name'] = 'Operator'
        elt.properties['speaker_role'] = 'Operator'
        elt.properties['speaker'] = True
        elt.data["_break"] = True
    elif external_speaker:
        first_sep = text.find('--')
        # Locate the second separator directly. The previous offset arithmetic dropped the
        # last character of the organisation when no whitespace preceded the second '--'.
        second_sep = text.find('--', first_sep + 2)
        elt.properties['speaker_name'] = text[:first_sep].strip()
        elt.properties['speaker_external_org'] = text[first_sep + 2:second_sep].strip()
        elt.properties['speaker_role'] = text[second_sep + 2:].strip()
        elt.properties['speaker'] = True
        elt.data["_break"] = True
    elif internal_speaker:
        first_sep = text.find('--')
        elt.properties['speaker_name'] = text[:first_sep].strip()
        elt.properties['speaker_role'] = text[first_sep + 2:].strip()
        elt.properties['speaker'] = True
        elt.data["_break"] = True
    return elt

# Tag every speaker line, then let MarkedMerger fold each speaker's paragraphs into one element.
speakers_marked_ds = filtered_ds.map_elements(markSpeakers)
speaker_block_merger = MarkedMerger()
merged_ds = speakers_marked_ds.merge(speaker_block_merger)

Now, to answer the question "What details did the CFO, Kirsten Spears, discuss about the VMware acquisition?", let's run a simple filter_elements transform on the last docset.

In [None]:
# Inspect the merged DocSet produced by the MarkedMerger step.
merged_ds.show()

In [None]:
def _is_kirsten_on_vmware(e):
    """Keep only speaker-tagged elements spoken by Kirsten Spears whose text mentions 'vmware'."""
    props = e.properties
    if 'speaker' not in props:
        return False
    if props['speaker_name'] != 'Kirsten Spears':
        return False
    return 'vmware' in e.text_representation.lower()


ks_acquisition = merged_ds.filter_elements(_is_kirsten_on_vmware)
ks_acquisition.show()

In [None]:
## Only one element survived the filter, so read its text and check that the VMware acquisition shows up!
ks_docs = ks_acquisition.take_all()
ks_docs[0]['elements'][0].text_representation