In [19]:
from sycamore.data import Document
from sycamore.llms import OpenAI, OpenAIModels
from sycamore.transforms import COALESCE_WHITESPACE
from sycamore.transforms.partition import SycamorePartitioner
from sycamore.transforms.summarize_images import SummarizeImages
from sycamore.transforms.merge_elements import GreedySectionMerger
from sycamore.utils.time_trace import timetrace
from sycamore.utils.cache import Cache
import sycamore

import pickle
import json
import sys

# Sycamore context; the `ctx.read.binary(...)` pipelines in later cells are built from it.
ctx = sycamore.init()
# LLM client constructed from the GPT_4O model value; extract_table_as_properties
# passes this module-level object to llm_generate_with_retries.
llm = OpenAI(OpenAIModels.GPT_4O.value)

In [2]:
@timetrace("LLMGen")
def llm_generate_with_retries(llm, prompt_kwargs, llm_kwargs, max_retries=5):
    """Call ``llm.generate`` and decode its ``.content`` as JSON, retrying on failure.

    Any exception raised by the LLM call or by JSON decoding is printed and the
    call is attempted again, up to ``max_retries`` attempts in total.

    Args:
        llm: Object exposing ``generate(prompt_kwargs=..., llm_kwargs=...)`` whose
            result has a ``content`` attribute holding a JSON string.
        prompt_kwargs: Forwarded unchanged to ``llm.generate``.
        llm_kwargs: Forwarded unchanged to ``llm.generate``.
        max_retries: Total number of attempts to make.

    Returns:
        The value produced by ``json.loads`` on the first successful response.
        If ``max_retries`` is less than 1 no attempt is made and ``None`` is
        returned.

    Raises:
        Exception: The error from the final attempt, once every attempt has failed.
    """
    # Count down so that 0 marks the last permitted attempt.
    for attempts_left in range(max_retries - 1, -1, -1):
        try:
            raw_content = llm.generate(prompt_kwargs=prompt_kwargs, llm_kwargs=llm_kwargs).content
            return json.loads(raw_content)
        except Exception as err:
            print(err)
            if attempts_left == 0:
                raise err
    return None

"""
For table elements, ask the LLM to extract a JSON-formatted key-value object from the table's CSV string.
"""
@timetrace("ExtractProp")
def extract_table_as_properties(doc: Document) -> Document:
    """Populate table-element properties of ``doc`` from LLM-extracted key-value pairs.

    For every element whose ``type`` is ``"table"`` and whose ``table`` is set,
    the table's ``to_csv()`` output is appended to a fixed instruction prompt and
    sent to the module-level ``llm`` via ``llm_generate_with_retries``. The parsed
    JSON object is merged into that element's ``properties``. A copy of the first
    such table's properties is also stored under ``doc.properties["entity"]``.

    Args:
        doc: Document to process. It is modified in place.

    Returns:
        The same ``doc``. A document with no elements, or with no table
        elements, is returned without an ``"entity"`` property being added.

    Raises:
        Exception: Whatever ``llm_generate_with_retries`` re-raises after its
            final failed attempt.
    """
    PROMPT = """
    You are given a csv representing either a single column, or multi-column table.
    Instructions:
    1. Parse the table and return a flattened JSON object representing the key-value pairs of properties defined in the table.
    2. Do not return nested objects, keep the dictionary only 1 level deep. The only valid value types are numbers, strings, and lists.
    3. If you find multiple fields defined in a row, feel free to split them into separate properties.
    4. Use camelCase for the key names
    5. For fields where the values are in standard measurement units like miles, nautical miles, knots, celsius
       - include the unit in the key name and only set the numeric value as the value.
       - e.g. "Wind Speed: 9 knots" should become windSpeedInKnots: 9, "Temperature: 3°C" should become temperatureInC: 3
    """
    llm_kwargs = {"response_format": {"type": "json_object"}}

    # Progress logging: tolerate documents without a path and terminate each
    # entry with a newline so consecutive paths do not run together on stderr.
    path = doc.properties.get("path")
    if path is not None:
        sys.stderr.write(f"{path}\n")

    # Always hand back the document itself (never None), even when there is
    # nothing to process, so the declared return type holds.
    if not doc.elements:
        return doc

    # The first table's properties double as document-level properties.
    top_level_table = None
    for element in doc.elements:
        if element.type != "table" or element.table is None:
            continue
        if top_level_table is None:
            top_level_table = element

        prompt_kwargs = {"prompt": PROMPT + "\n" + element.table.to_csv()}
        new_props = llm_generate_with_retries(llm, prompt_kwargs, llm_kwargs, max_retries=5)
        if new_props:
            element.properties.update(new_props)
        else:
            # NOTE(review): this fallback looks like a debugging placeholder; it is
            # kept for compatibility — confirm whether anything relies on it.
            element.properties.update({"Foo": "Bar"})

    # A document may contain no tables at all; only set "entity" when one was found.
    if top_level_table is not None:
        doc.properties["entity"] = top_level_table.properties.copy()
    return doc

In [None]:
# Read every PDF under s3_path, partition it with SycamorePartitioner, apply the
# COALESCE_WHITESPACE regex replacement, then run table-property extraction.
s3_path = "s3://aryn-datasets-us-east-1/financebench-large/3M_1/"
docset = (
    ctx.read.binary(s3_path, binary_format="pdf")
    .partition(partitioner=SycamorePartitioner(extract_table_structure=True, use_ocr=True, extract_images=True), num_gpus=0.1)
    .regex_replace(COALESCE_WHITESPACE)
    # map() needs the per-document function; calling it with no argument fails.
    .map(extract_table_as_properties)
)

# Materialise the pipeline and keep only each document's properties and elements.
extracted = [
    {"properties": d.properties, "elements": d.elements}
    for d in docset.take_all()
]

# Use a context manager so the pickle file is flushed and closed deterministically.
with open('extracted_1.pickle', 'wb') as pickle_file:
    pickle.dump(extracted, pickle_file)

In [None]:
# Print at most the first 100 elements of the first extracted document.
for i, e in zip(range(100), extracted[0]["elements"]):
    print(e)

In [16]:
# Build the same read -> partition -> whitespace-replacement pipeline for a
# single PDF under the ntsb-large prefix, one step at a time.
s3_path = "s3://aryn-datasets-us-east-1/ntsb-large/100/193893.pdf"
docset = ctx.read.binary(s3_path, binary_format="pdf")
docset = docset.partition(
    partitioner=SycamorePartitioner(extract_table_structure=True, use_ocr=True, extract_images=True),
    num_gpus=0.1,
)
docset = docset.regex_replace(COALESCE_WHITESPACE)

In [None]:
from sycamore.utils.pdf_utils import show_pages
# Pass the current `docset` to show_pages with limit=20 and keep the result.
# NOTE(review): presumably this renders the partitioned pages for visual
# inspection, with `limit` capping how many are shown — confirm against the Sycamore docs.
sample_pages = show_pages(docset, limit=20)