In [1]:
from dotenv import load_dotenv

load_dotenv(".env")

import json
import os

import sycamore
from sycamore.connectors.file.file_writer import JSONEncodeWithUserDict
from sycamore.evaluation import EvaluationDataPoint
from sycamore.evaluation.pipeline import EvaluationPipeline
from sycamore.transforms.embed import SentenceTransformerEmbedder
from sycamore.transforms.query import OpenSearchQueryExecutor
from sycamore.evaluation.subtasks import SubtaskExecutor
from sycamore.functions import HuggingFaceTokenizer
from sycamore.llms import OpenAI, OpenAIModels
from sycamore.transforms import COALESCE_WHITESPACE
from sycamore.transforms.merge_elements import GreedyTextElementMerger
from sycamore.transforms.partition import ArynPartitioner
from sycamore.connectors.file.materialized_scan import DocScan
from sycamore.docset import DocSet
import sycamore

from ray.data import ActorPoolStrategy

## Ingestion

In [None]:
# Using the Amazon 2017 10K PDF

! curl -o amazon_2017_10k.pdf https://d18rn0p25nwr6d.cloudfront.net/CIK-0001018724/02da2a86-7bb9-4bcb-95ae-4ce27ea5e3bc.pdf

In [2]:
# Local PDF to ingest and the OpenSearch index it is written to / queried from.
path = "amazon_2017_10k.pdf"
index = "sample0"

# LLM client; passed to SubtaskExecutor in the "With Subtasks" section.
openai_llm = OpenAI(OpenAIModels.GPT_4O.value)

# The tokenizer (used when merging elements) and the embedder are both
# configured from the same Hugging Face model name.
hf_model = "sentence-transformers/all-mpnet-base-v2"
tokenizer = HuggingFaceTokenizer(hf_model)
embedder = SentenceTransformerEmbedder(model_name=hf_model, batch_size=100)

# Sycamore context used by the read and DocSet cells below; the cell output
# shows that this call starts a local Ray instance.
context = sycamore.init()

2024-08-08 20:37:28,820	INFO worker.py:1740 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8266 [39m[22m


In [3]:
# Connection settings for the OpenSearch cluster on localhost:9200.
os_client_args = {
    "hosts": [{"host": "localhost", "port": 9200}],
    "http_compress": True,
    # Credentials are read from the environment (the first cell loads `.env`), so
    # real secrets never have to be committed with the notebook. The fallbacks
    # keep the previous admin/admin behaviour for an unmodified local cluster.
    "http_auth": (
        os.environ.get("OPENSEARCH_USER", "admin"),
        os.environ.get("OPENSEARCH_PASSWORD", "admin"),
    ),
    "use_ssl": True,
    # NOTE(review): TLS verification is switched off, presumably because the local
    # cluster uses a self-signed certificate. Do not reuse these settings against
    # a non-local cluster.
    "verify_certs": False,
    "ssl_assert_hostname": False,
    "ssl_show_warn": False,
    "timeout": 120,
}

# Index definition passed to the OpenSearch writer: a k-NN enabled index with a
# single `embedding` vector field.
index_settings = {
    "body": {
        "settings": {
            "index.knn": True,
            "number_of_shards": 5,
            "number_of_replicas": 1,
        },
        "mappings": {
            "properties": {
                "embedding": {
                    # assumes the embedder configured earlier emits 768-dim
                    # vectors — TODO confirm if the model is changed
                    "dimension": 768,
                    "method": {
                        "engine": "faiss",
                        "space_type": "l2",
                        "name": "hnsw",
                        "parameters": {},
                    },
                    "type": "knn_vector",
                },
            },
        },
    },
}

In [4]:
# Ingestion pipeline. Nothing runs here; the Ray execution plan only appears in
# the output of the write cell that follows.
ds = (
    context.read.binary(path, binary_format="pdf")
    # Partition the PDF, with table-structure extraction and OCR enabled.
    .partition(
        partitioner=ArynPartitioner(
            extract_table_structure=True,
            threshold=0.35,
            use_ocr=True,
        ),
        num_gpus=0.1,
        compute=ActorPoolStrategy(size=1),
    )
    .regex_replace(COALESCE_WHITESPACE)
    # 512 is presumably the per-chunk token limit for merged elements — TODO confirm.
    .merge(merger=GreedyTextElementMerger(tokenizer, 512))
    .spread_properties(["path", "company", "year", "doc-type"])
    .explode()
    .embed(embedder=embedder, num_gpus=0.1)
)

In [6]:
# Write the DocSet to the OpenSearch index using the connection and index
# settings defined earlier. The execution plan logged in this cell's output
# runs from ReadBinary through OsrchWrite, i.e. the whole ingestion pipeline
# executes here.
ds.write.opensearch(
    os_client_args=os_client_args,
    index_name=index,
    index_settings=index_settings,
)

(pid=202065) INFO:root:Spurious log 1: Verifying that log messages are propogated
(pid=202194) INFO:root:Spurious log 1: Verifying that log messages are propogated [repeated 2x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/user-guides/configure-logging.html#log-deduplication for more options.)
(MapWorker(Map(BinaryScan._to_document)->MapBatches(BaseMapTransformCallable___wrap)) pid=202194) INFO:sycamore.utils.time_trace:loading_model wall:  4.599 user:  0.923sys:  0.172 rss_mib: 763.176
(MapWorker(Map(BinaryScan._to_document)->MapBatches(BaseMapTransformCallable___wrap)) pid=202194) INFO:sycamore.utils.time_trace:load_model_start wall:  4.599 user:  0.923sys:  0.172 rss_mib: 763.176
(MapWorker(Map(BinaryScan._to_document)->MapBatches(BaseMapTransformCallable___wrap)) pid=202194) The `max_size` parameter is deprecated and will be removed in v4.26. Please specify in `size['longe

2024-08-08 20:09:10,876	INFO streaming_executor.py:112 -- Starting execution of Dataset. Full logs are in /tmp/ray/session_2024-08-08_20-09-00_792061_176511/logs/ray-data
2024-08-08 20:09:10,876	INFO streaming_executor.py:113 -- Execution plan of Dataset: InputDataBuffer[Input] -> TaskPoolMapOperator[ReadBinary] -> ActorPoolMapOperator[Map(BinaryScan._to_document)->MapBatches(BaseMapTransformCallable___wrap)] -> TaskPoolMapOperator[MapBatches(regex_replace)->MapBatches(merge_elements)->MapBatches(spread_properties)->MapBatches(explode)] -> ActorPoolMapOperator[MapBatches(BaseMapTransformCallable__SentenceTransformerEmbedder)] -> TaskPoolMapOperator[MapBatches(OsrchWrite)]


- ReadBinary->SplitBlocks(16) 1:   0%|          | 0/1 [00:00<?, ?it/s]

- Map(BinaryScan._to_document)->MapBatches(BaseMapTransformCallable___wrap) 2:   0%|          | 0/1 [00:00<?, …

- MapBatches(regex_replace)->MapBatches(merge_elements)->MapBatches(spread_properties)->MapBatches(explode) 3:…

- MapBatches(BaseMapTransformCallable__SentenceTransformerEmbedder) 4:   0%|          | 0/1 [00:00<?, ?it/s]

- MapBatches(OsrchWrite) 5:   0%|          | 0/1 [00:00<?, ?it/s]

Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

## Querying

#### Without Subtasks

In [29]:
# Evaluation data point for the plain (no-subtask) run.
# The question is taken from FinanceBench.

document = EvaluationDataPoint()
document.question = (
    "What is Amazon's FY2017 days payable outstanding (DPO)? "
    "DPO is defined as: 365 * (average accounts payable between FY2016 and FY2017) / "
    "(FY2017 COGS + change in inventory between FY2016 and FY2017). "
    "Round your answer to two decimal places. "
    "Address the question by using the line items and information shown within "
    "the balance sheet and the P&L statement."
)
document.ground_truth_answer = "93.86"
document.doc_id = "demo_nosubtask"
document.properties["subtasks_reqd"] = False

# Wrap the single data point in a DocSet so it can be handed to pipeline.execute().
input_docset = DocSet(context, DocScan([document]))

In [30]:
# Retrieval / generation settings handed to EvaluationPipeline (and, later in
# the notebook, to SubtaskExecutor).
os_config = {
    "size": 10,
    "neural_search_k": 200,
    # presumably an ID specific to the author's OpenSearch cluster — TODO confirm
    # before running against a different cluster
    "embedding_model_id": "hlAX5Y8BnK-z0ftijBv_",
    "search_pipeline": "hybrid_rag_pipeline",
    "llm": "gpt-4o",
    # NOTE(review): a string, whereas "size" is an int — confirm the consumer expects that
    "context_window": "10",
}

# Experiment record: metadata plus, further down, the per-query results.
# NOTE(review): os_client_args carries the "http_auth" credentials, so strip
# them before persisting or sharing `data`.
data = {
    "experiment_name": "sample subtask test",
    "description": "gpt-4o",
    "created_by": "aanyapratapneni",
    "index": index,
    "os_client_args": os_client_args,
    "os_config": os_config,
}

# Evaluation pipeline with no metrics configured; only the generated answer is
# inspected in the next cell.
pipeline = EvaluationPipeline(
    index=index,
    os_config=os_config,
    metrics=[],
    query_executor=OpenSearchQueryExecutor(os_client_args),
    embedder=embedder,
    knn_query=True,
)

# execute() returns a sequence; its first item is materialised with take_all()
# and stored as the query-level results.
query_level_metrics = pipeline.execute(input_docset)[0]
data["query_level_data"] = query_level_metrics.take_all()

2024-08-08 20:58:24,030	INFO streaming_executor.py:112 -- Starting execution of Dataset. Full logs are in /tmp/ray/session_2024-08-08_20-37-25_809660_203369/logs/ray-data
2024-08-08 20:58:24,030	INFO streaming_executor.py:113 -- Execution plan of Dataset: InputDataBuffer[Input] -> TaskPoolMapOperator[MapBatches(_build_opensearch_query)] -> TaskPoolMapOperator[MapBatches(query)] -> TaskPoolMapOperator[MapBatches(_process_queries)]


- MapBatches(_build_opensearch_query) 1:   0%|          | 0/1 [00:00<?, ?it/s]

- MapBatches(query) 2:   0%|          | 0/1 [00:00<?, ?it/s]

- MapBatches(_process_queries) 3:   0%|          | 0/1 [00:00<?, ?it/s]

Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

COMPLETE


[{'properties': {'subtasks_reqd': False}, 'elements': [], 'lineage_id': '857762ae-7612-4e99-a230-5eb3ba1046e6', 'type': 'EvaluationDataPoint', 'question': "What is Amazon's FY2017 days payable outstanding (DPO)? DPO is defined as: 365 * (average accounts payable between FY2016 and FY2017) / (FY2017 COGS + change in inventory between FY2016 and FY2017). Round your answer to two decimal places. Address the question by using the line items and information shown within the balance sheet and the P&L statement.", 'ground_truth_answer': '93.86', 'doc_id': 'demo_q1', 'index': 'sample0', 'query': {'_source': {'excludes': ['embedding']}, 'size': 10, 'query': {'knn': {'embedding': {'vector': [0.018365908414125443, -0.032008253037929535, -0.04999585822224617, -0.015410860069096088, 0.043805524706840515, 0.03495189547538757, 0.020780600607395172, -0.030526503920555115, 0.023441841825842857, -0.008030596189200878, 0.044345673173666, 0.01119137927889824, 0.04476682469248772, 0.04529480263590813, -0.0

In [32]:
# Show the generated answer for the first (only) query; print() keeps the
# line breaks in the text readable.
print(data["query_level_data"][0]["generated_answer"])

To calculate Amazon's FY2017 Days Payable Outstanding (DPO), we need the following information:

1. **Accounts Payable (AP) for FY2016 and FY2017**:
   - AP for FY2016: $16,459 million
   - AP for FY2017: $22,481 million

2. **Cost of Goods Sold (COGS) for FY2017**:
   - COGS for FY2017: $118,573 million

3. **Inventory for FY2016 and FY2017**:
   - Inventory for FY2016: $11,461 million
   - Inventory for FY2017: $16,047 million

4. **Change in Inventory**:
   - Change in Inventory = Inventory for FY2017 - Inventory for FY2016
   - Change in Inventory = $16,047 million - $11,461 million = $4,586 million

5. **Average Accounts Payable**:
   - Average AP = (AP for FY2016 + AP for FY2017) / 2
   - Average AP = ($16,459 million + $22,481 million) / 2 = $19,470 million

6. **Total COGS + Change in Inventory**:
   - Total = COGS for FY2017 + Change in Inventory
   - Total = $118,573 million + $4,586 million = $123,159 million

7. **DPO Calculation**:
   - DPO = 365 * (Average AP / Total COGS

#### With Subtasks

In [33]:
# Fiscal year the question targets; the two preceding years are derived from it.
year = 2017

# Same question as the no-subtask run, this time flagged as requiring subtasks.
document = EvaluationDataPoint()
document.question = (
    "What is Amazon's FY2017 days payable outstanding (DPO)? "
    "DPO is defined as: 365 * (average accounts payable between FY2016 and FY2017) / "
    "(FY2017 COGS + change in inventory between FY2016 and FY2017). "
    "Round your answer to two decimal places. "
    "Address the question by using the line items and information shown within "
    "the balance sheet and the P&L statement."
)
document.ground_truth_answer = "93.86"
document.doc_id = "demo_subtask"
document.properties["subtasks_reqd"] = True
# Keys here match the {company} / {year} / {year1} / {year2} placeholders used
# in the subtask instruction templates.
document.properties["subtask_filters"] = {
    "company": "Amazon",
    "year": year,
    "year1": year - 1,
    "year2": year - 2,
}

input_docset = DocSet(context, DocScan([document]))

In [34]:
# Configuration passed to SubtaskExecutor. It has three parts:
#   task_descriptions    - task key -> short description of the kind of question
#   task_formulas        - task key -> formulas and/or extra instructions
#   subtask_instructions - formula variable -> question template for that value
# Variable suffixes follow the template placeholders: 0 -> {year}, 1 -> {year1},
# 2 -> {year2}. The placeholder names match the keys of the "subtask_filters"
# property set on the data point.
subtask_info = {
    "task_descriptions": {
        "INVENTORY_TURNOVER": "Questions about inventory turnover ratio",
        "DAYS_PAYABLE": "Questions about days payable outstanding",
        "2YR_GROSS_MARG": "Questions about gross margin profile relative to the previous year",
        "CASH_CONV_CYCLE": "Questions about cash conversion cycle",
        "FREE_CASH_FLOW": "Questions about free cash flow",
        "CAPEX": "Questions about only the capital expenditure of a company and no additional information or metrics",
    },
    "task_formulas": {
        "INVENTORY_TURNOVER": {
            "formulas": ["COGS0/INV0"],
        },
        "DAYS_PAYABLE": {
            "formulas": ["365*AVERAGE(ACCPAY0, ACCPAY1)/(COGS0+INV0-INV1)"],
        },
        "2YR_GROSS_MARG": {
            "formulas": [
                "CURRENT YEAR GROSS MARGIN = ((REV0-COGS0)/REV0)",
                "PREVIOUS YEAR GROSS MARGIN = ((REV1-COGS1)/REV1)",
            ],
            "instructions": "Use these values to answer the following question.",
        },
        "CASH_CONV_CYCLE": {
            "formulas": [
                "[365*(AVERAGE(INV0, INV1)/(COGS0))] + [365*(AVERAGE(TRADEREC0, TRADEREC1)/(REV0))] - [365*(AVERAGE(ACCPAY0, ACCPAY1)/(COGS0+INV0-INV1))]",
            ],
        },
        "FREE_CASH_FLOW": {
            "formulas": ["CASHOPS-CAPEX0"],
        },
        # CAPEX has no formula, only guidance on synonyms.
        "CAPEX": {
            "instructions": "Synonyms for capital expenditure are: capital spending; purchases of property, plant, and equipment (PP&E); acquisition expenses.",
        },
    },
    "subtask_instructions": {
        # The REV* templates previously began with a stray space, unlike every
        # other template; it has been removed so all prompts are formatted alike.
        "REV0": "What is the total revenue of {company} in {year}? This value can be found on the income statement.",
        "REV1": "What is the total revenue of {company} in {year1}? This value can be found on the income statement.",
        "REV2": "What is the total revenue of {company} in {year2}? This value can be found on the income statement.",
        "CAPEX0": "What is the capital expenditure of {company} in {year}? This value can be found in the cash flow statement. Synonyms for capital expenditure are: capital spending; purchases of property, plant, and equipment (PP&E); acquisition expenses.",
        "INV0": "What is the total inventories of {company} in {year}? This value can be found on the consolidated balance sheet.",
        "INV1": "What is the total inventories of {company} in {year1}? This value can be found on the consolidated balance sheet.",
        "COGS0": "What is the total cost of goods sold of {company} for {year}? This value can be found on the income statements. Cost of goods sold is also referred to as 'total cost of revenue' or 'cost of sales'.",
        "COGS1": "What is the total cost of goods sold of {company} for {year1}? This value can be found on the income statements. Cost of goods sold is also referred to as 'total cost of revenue' or 'cost of sales'.",
        "ACCPAY0": "What is the accounts payable for {company} in {year}? This value can be found on the balance sheet.",
        "ACCPAY1": "What is the accounts payable for {company} in {year1}? This value can be found on the balance sheet.",
        "TRADEREC0": "What is the trade receivables for {company} in {year}? This value can be found on the balance sheet.",
        "TRADEREC1": "What is the trade receivables for {company} in {year1}? This value can be found on the balance sheet.",
        "CASHOPS": "What is the cash from operations for {company} in {year}? This value can be found on the cash flow statement.",
    },
}

In [35]:
# Step 1: run the subtask executor over the question to produce the subtask
# documents that the evaluation pipeline consumes below.
# NOTE(review): the first positional argument is left as None; its meaning is
# not visible in this notebook.
subtask_exec = SubtaskExecutor(
    None,
    subtask_info,
    index,
    os_config,
    query_executor=OpenSearchQueryExecutor(os_client_args),
    embedder=embedder,
    knn_query=True,
    llm=openai_llm,
)
# A bare `subtask_docs` expression used to follow this line. IPython only
# displays a cell's last expression, so it showed nothing and has been dropped.
subtask_docs = subtask_exec.execute(input_docset)

# Step 2: same pipeline as the no-subtask run, plus the subtask results.
pipeline = EvaluationPipeline(
    index=index,
    os_config=os_config,
    metrics=[],
    query_executor=OpenSearchQueryExecutor(os_client_args),
    embedder=embedder,
    knn_query=True,
    subtask_docs=subtask_docs,
)

# NOTE: this overwrites the "Without Subtasks" results stored under the same key.
query_level_metrics = pipeline.execute(input_docset)[0]
data["query_level_data"] = query_level_metrics.take_all()

2024-08-08 20:59:20,539	INFO streaming_executor.py:112 -- Starting execution of Dataset. Full logs are in /tmp/ray/session_2024-08-08_20-37-25_809660_203369/logs/ray-data
2024-08-08 20:59:20,539	INFO streaming_executor.py:113 -- Execution plan of Dataset: InputDataBuffer[Input] -> TaskPoolMapOperator[MapBatches(_get_formulas)->MapBatches(_get_subtasks)->MapBatches(process_doc)]


- MapBatches(_get_formulas)->MapBatches(_get_subtasks)->MapBatches(process_doc) 1:   0%|          | 0/1 [00:00…

Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

<IPython.core.display.HTML object>ches(_get_subtasks)->MapBatches(process_doc) pid=204151) 
<IPython.core.display.HTML object>ches(_get_subtasks)->MapBatches(process_doc) pid=204151) 


(MapBatches(_get_formulas)->MapBatches(_get_subtasks)->MapBatches(process_doc) pid=204151) INFO:httpx:HTTP Request: POST https://api.openai.com/v1/completions "HTTP/1.1 200 OK"


<IPython.core.display.HTML object>ches(_get_subtasks)->MapBatches(process_doc) pid=204151) 
<IPython.core.display.HTML object>ches(_get_subtasks)->MapBatches(process_doc) pid=204151) 


(MapBatches(_get_formulas)->MapBatches(_get_subtasks)->MapBatches(process_doc) pid=204151) INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
(MapBatches(_get_formulas)->MapBatches(_get_subtasks)->MapBatches(process_doc) pid=204151) INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2
Batches: 100%|██████████| 1/1 [00:00<00:00, 18.15it/s]>MapBatches(process_doc) pid=204151) 
(MapBatches(_get_formulas)->MapBatches(_get_subtasks)->MapBatches(process_doc) pid=204151) INFO:opensearch:POST https://localhost:9200/sample0/_search?search_pipeline=hybrid_rag_pipeline [status:200 request:0.690s]
Batches:   0%|          | 0/1 [00:00<?, ?it/s]btasks)->MapBatches(process_doc) pid=204151) 
Batches: 100%|██████████| 1/1 [00:00<00:00, 19.27it/s]>MapBatches(process_doc) pid=204151) 
(MapBatches(_get_formulas)->MapBatches(_get_subtasks)->MapBatches(process_doc) pid=204151) INFO:opensearch:POST https://localho

2024-08-08 20:59:24,896	INFO streaming_executor.py:112 -- Starting execution of Dataset. Full logs are in /tmp/ray/session_2024-08-08_20-37-25_809660_203369/logs/ray-data
2024-08-08 20:59:24,896	INFO streaming_executor.py:113 -- Execution plan of Dataset: InputDataBuffer[Input] -> TaskPoolMapOperator[MapBatches(_build_opensearch_query)] -> TaskPoolMapOperator[MapBatches(query)] -> TaskPoolMapOperator[MapBatches(_process_queries)]


- MapBatches(_build_opensearch_query) 1:   0%|          | 0/1 [00:00<?, ?it/s]

- MapBatches(query) 2:   0%|          | 0/1 [00:00<?, ?it/s]

- MapBatches(_process_queries) 3:   0%|          | 0/1 [00:00<?, ?it/s]

Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

COMPLETE
FINAL_DUMPED


In [36]:
# Show the generated answer from the subtask-assisted run.
print(data["query_level_data"][0]["generated_answer"])

To calculate Amazon's FY2017 Days Payable Outstanding (DPO), follow these steps:

1. **Calculate the average accounts payable:**
   \[
   \text{Average Accounts Payable} = \frac{\text{ACCPAY0} + \text{ACCPAY1}}{2} = \frac{34,616 + 25,309}{2} = 29,962.5 \text{ million}
   \]

2. **Calculate the change in inventory:**
   \[
   \text{Change in Inventory} = \text{INV0} - \text{INV1} = 16,047 - 11,461 = 4,586 \text{ million}
   \]

3. **Calculate the denominator (COGS + change in inventory):**
   \[
   \text{Denominator} = \text{COGS0} + \text{Change in Inventory} = 111,934 + 4,586 = 116,520 \text{ million}
   \]

4. **Calculate the DPO:**
   \[
   \text{DPO} = 365 \times \frac{\text{Average Accounts Payable}}{\text{Denominator}} = 365 \times \frac{29,962.5}{116,520} \approx 93.79 \text{ days}
   \]

Therefore, Amazon's FY2017 Days Payable Outstanding (DPO) is approximately **93.79 days**.
