In [None]:
from dotenv import load_dotenv
import os

# Load environment variables from a local ".env" file before the sycamore imports below.
# Add OPENAI_API_KEY to that file; it is presumably what the OpenAI LLM client created
# later in this notebook uses to authenticate.
load_dotenv(".env")

import sycamore
from sycamore.evaluation import EvaluationDataPoint
from sycamore.evaluation.pipeline import EvaluationPipeline
from sycamore.transforms.embed import SentenceTransformerEmbedder
from sycamore.transforms.query import OpenSearchQueryExecutor
from sycamore.evaluation.subtasks import SubtaskExecutor
from sycamore.functions import HuggingFaceTokenizer
from sycamore.llms import OpenAI, OpenAIModels
from sycamore.transforms import COALESCE_WHITESPACE
from sycamore.transforms.merge_elements import GreedyTextElementMerger
from sycamore.transforms.partition import ArynPartitioner
from sycamore.connectors.file.materialized_scan import DocScan
from sycamore.docset import DocSet
import sycamore


## Ingestion

In [None]:
# Sample document: Amazon's 2017 Form 10-K (PDF).
# The shell command below saves it in the working directory as amazon_2017_10k.pdf
# (needs `curl` on PATH and network access).

! curl -o amazon_2017_10k.pdf https://d18rn0p25nwr6d.cloudfront.net/CIK-0001018724/02da2a86-7bb9-4bcb-95ae-4ce27ea5e3bc.pdf

In [None]:
# Local PDF to ingest (the file name written by the curl cell) and the OpenSearch index
# that ingestion writes to and the queries read from.
path = "amazon_2017_10k.pdf"
index = "sample0"

# LLM handle; it is passed to SubtaskExecutor in the "With Subtasks" section.
openai_llm = OpenAI(OpenAIModels.GPT_4O.value)

# One Hugging Face model name drives both the tokenizer (used by the element merger during
# ingestion) and the embedder (used during ingestion and by the query pipelines).
hf_model = "sentence-transformers/all-mpnet-base-v2"
tokenizer = HuggingFaceTokenizer(hf_model)
embedder = SentenceTransformerEmbedder(model_name=hf_model, batch_size=100)

# Sycamore context shared by every DocSet created in this notebook.
context = sycamore.init()

In [None]:
# Pick the OpenSearch host name. The presence of "/.dockerenv" is taken to mean the notebook
# runs inside the sycamore Jupyter container, where the service is reached as "opensearch";
# otherwise OpenSearch is assumed to be listening on localhost.
if os.path.exists("/.dockerenv"):
    opensearch_host = "opensearch"
    print("Assuming we are in a sycamore jupyter container, using opensearch for opensearch host")
else:
    opensearch_host = "localhost"
    print("Assuming we are running outside of a container, using localhost for opensearch host")

# Connection arguments handed to the OpenSearch writer and to OpenSearchQueryExecutor.
os_client_args = {
    "hosts": [{"host": opensearch_host, "port": 9200}],
    "http_compress": True,
    # Credentials are read from the environment (for example from the .env file loaded at the
    # top of the notebook) so real passwords never have to be committed. When the variables
    # are unset this falls back to the admin/admin pair the notebook used previously.
    "http_auth": (
        os.environ.get("OPENSEARCH_USER", "admin"),
        os.environ.get("OPENSEARCH_PASSWORD", "admin"),
    ),
    "use_ssl": True,
    # TLS is on, but certificate and host-name verification are disabled and the related
    # warnings are silenced -- acceptable only for a local/dev cluster.
    "verify_certs": False,
    "ssl_assert_hostname": False,
    "ssl_show_warn": False,
    "timeout": 120,
}

# Settings used when the OpenSearch index is created: a k-NN enabled index whose
# "embedding" field is mapped as a knn_vector.
index_settings = {
    "body": {
        "settings": {
            "index.knn": True,
            "number_of_shards": 5,
            "number_of_replicas": 1,
        },
        "mappings": {
            "properties": {
                "embedding": {
                    # Presumably has to match the vector size produced by the embedder
                    # configured earlier -- confirm if the embedding model is changed.
                    "dimension": 768,
                    "method": {
                        "engine": "faiss",
                        "space_type": "l2",
                        "name": "hnsw",
                        "parameters": {},
                    },
                    "type": "knn_vector",
                },
            },
        },
    },
}

In [None]:
# Ingestion pipeline: read the PDF as binary, partition it with ArynPartitioner (table
# structure extraction and OCR enabled), coalesce whitespace, merge elements with the
# greedy text merger (512 is presumably the per-chunk token budget), then spread
# properties, explode and embed.
# NOTE(review): no cell in this notebook assigns "company", "year" or "doc-type" to the
# source document before spread_properties -- confirm those properties exist here.
ds = (
    context.read.binary(path, binary_format="pdf")
    .partition(
        partitioner=ArynPartitioner(
            extract_table_structure=True,
            threshold=0.35,
            use_ocr=True,
        ),
        num_gpus=0.1,
        parallelism=1,
    )
    .regex_replace(COALESCE_WHITESPACE)
    .merge(merger=GreedyTextElementMerger(tokenizer, 512))
    .spread_properties(["path", "company", "year", "doc-type"])
    .explode()
    .embed(embedder=embedder, num_gpus=0.1)
)

In [None]:
# Write the embedded DocSet to the OpenSearch index named by `index`, using the connection
# arguments and index settings defined in the earlier configuration cell.
# NOTE(review): re-running this cell presumably indexes the same chunks a second time --
# confirm, and delete the index first if duplicates matter.
ds.write.opensearch(
    os_client_args=os_client_args,
    index_name=index,
    index_settings=index_settings,
)

## Querying

#### Without Subtasks

In [None]:
# Question taken from FinanceBench

# Data point for the baseline run: the question, its expected answer, and a flag saying
# that no subtasks are required.
document = EvaluationDataPoint()
document.question = (
    "What is Amazon's FY2017 days payable outstanding (DPO)? "
    "DPO is defined as: 365 * (average accounts payable between FY2016 and FY2017) / "
    "(FY2017 COGS + change in inventory between FY2016 and FY2017). "
    "Round your answer to two decimal places. "
    "Address the question by using the line items and information shown within "
    "the balance sheet and the P&L statement."
)
document.ground_truth_answer = "93.86"
document.doc_id = "demo_nosubtask"
document.properties["subtasks_reqd"] = False

# Wrap the single data point in a DocSet, the input type the pipeline is executed on.
input_docset = DocSet(context, DocScan([document]))

In [None]:
# Query-side settings passed to both EvaluationPipeline runs and to SubtaskExecutor.
# NOTE(review): "embedding_model_id" looks like an ID issued by one particular OpenSearch
# deployment -- confirm it exists on the cluster you are querying.
# NOTE(review): "context_window" is a string while "size" and "neural_search_k" are ints;
# presumably that is what the consumer expects -- verify.
os_config = {
    "size": 10,
    "neural_search_k": 200,
    "embedding_model_id": "hlAX5Y8BnK-z0ftijBv_",
    "search_pipeline": "hybrid_rag_pipeline",
    "llm": "gpt-4o",
    "context_window": "10",
}

# Experiment record: descriptive metadata plus the index and configuration used.
# The query results are attached later under the "query_level_data" key.
# Note that it embeds os_client_args (including its http_auth entry), so avoid dumping
# this dict into shared output.
data = {
    "experiment_name": "sample subtask test",
    "description": "gpt-4o",
    "created_by": "aanyapratapneni",
    "index": index,
    "os_client_args": os_client_args,
    "os_config": os_config,
}

# Baseline evaluation pipeline (no subtask documents): queries `index` through an
# OpenSearchQueryExecutor, with the notebook's embedder supplied and knn_query enabled.
# No metrics are configured (empty list).
pipeline = EvaluationPipeline(
    index=index,
    os_config=os_config,
    metrics=[],
    query_executor=OpenSearchQueryExecutor(os_client_args),
    embedder=embedder,
    knn_query=True,
)

# execute() returns an indexable result; only its first element is used here and its
# records are materialised into the experiment record.
query_level_metrics = pipeline.execute(input_docset)[0]
data["query_level_data"] = query_level_metrics.take_all()

In [None]:
# Show the generated answer stored on the first query-level record.
print(data["query_level_data"][0]["generated_answer"])

#### With Subtasks

In [None]:
# Fiscal year the question is about; the two preceding years are derived from it below.
year = 2017

# Same question and expected answer as the baseline data point, but flagged as requiring
# subtasks.
document = EvaluationDataPoint()
document.question = (
    "What is Amazon's FY2017 days payable outstanding (DPO)? "
    "DPO is defined as: 365 * (average accounts payable between FY2016 and FY2017) / "
    "(FY2017 COGS + change in inventory between FY2016 and FY2017). "
    "Round your answer to two decimal places. "
    "Address the question by using the line items and information shown within "
    "the balance sheet and the P&L statement."
)
document.ground_truth_answer = "93.86"
document.doc_id = "demo_subtask"
document.properties["subtasks_reqd"] = True
# Presumably these values fill the {company}/{year}/{year1}/{year2} placeholders used by
# the subtask prompt templates -- confirm against SubtaskExecutor.
document.properties["subtask_filters"] = {
    "company": "Amazon",
    "year": year,
    "year1": year - 1,
    "year2": year - 2,
}

# Wrap the single data point in a DocSet, the input type the executors are run on.
input_docset = DocSet(context, DocScan([document]))

In [None]:
# Subtask definitions handed to SubtaskExecutor. Three sections:
#   task_descriptions    -- task id -> one-line description of the kind of question it covers.
#   task_formulas        -- task id -> optional "formulas" (list of strings) and/or
#                           "instructions" (free text). Terms such as COGS0 or INV1 in a
#                           formula match keys of subtask_instructions.
#   subtask_instructions -- term -> prompt template with {company}, {year}, {year1} and
#                           {year2} placeholders. A trailing 0/1/2 on a term corresponds to
#                           {year}/{year1}/{year2} in its template.
# Every template starts directly with the question text (no leading whitespace), so all
# formatted prompts are uniform.
subtask_info = {
    "task_descriptions": {
        "INVENTORY_TURNOVER": "Questions about inventory turnover ratio",
        "DAYS_PAYABLE": "Questions about days payable outstanding",
        "2YR_GROSS_MARG": "Questions about gross margin profile relative to the previous year",
        "CASH_CONV_CYCLE": "Questions about cash conversion cycle",
        "FREE_CASH_FLOW": "Questions about free cash flow",
        "CAPEX": "Questions about only the capital expenditure of a company and no additional information or metrics",
    },

    "task_formulas": {
        "INVENTORY_TURNOVER": {
            "formulas": ["COGS0/INV0"]
        },
        "DAYS_PAYABLE": {
            "formulas": ["365*AVERAGE(ACCPAY0, ACCPAY1)/(COGS0+INV0-INV1)"]
        },
        "2YR_GROSS_MARG": {
            "formulas": ["CURRENT YEAR GROSS MARGIN = ((REV0-COGS0)/REV0)", "PREVIOUS YEAR GROSS MARGIN = ((REV1-COGS1)/REV1)"],
            "instructions": "Use these values to answer the following question."
        },
        "CASH_CONV_CYCLE": {
            "formulas": ["[365*(AVERAGE(INV0, INV1)/(COGS0))] + [365*(AVERAGE(TRADEREC0, TRADEREC1)/(REV0))] - [365*(AVERAGE(ACCPAY0, ACCPAY1)/(COGS0+INV0-INV1))]"]
        },
        "FREE_CASH_FLOW": {
            "formulas": ["CASHOPS-CAPEX0"]
        },
        # CAPEX has no formula: only extra guidance text.
        "CAPEX": {
            "instructions": "Synonyms for capital expenditure are: capital spending; purchases of property, plant, and equipment (PP&E); acquisition expenses."
        }
    },

    "subtask_instructions": {
        "REV0": "What is the total revenue of {company} in {year}? This value can be found on the income statement.",
        "REV1": "What is the total revenue of {company} in {year1}? This value can be found on the income statement.",
        "REV2": "What is the total revenue of {company} in {year2}? This value can be found on the income statement.",
        "CAPEX0": "What is the capital expenditure of {company} in {year}? This value can be found in the cash flow statement. Synonyms for capital expenditure are: capital spending; purchases of property, plant, and equipment (PP&E); acquisition expenses.",
        "INV0": "What is the total inventories of {company} in {year}? This value can be found on the consolidated balance sheet.",
        "INV1": "What is the total inventories of {company} in {year1}? This value can be found on the consolidated balance sheet.",
        "COGS0": "What is the total cost of goods sold of {company} for {year}? This value can be found on the income statements. Cost of goods sold is also referred to as 'total cost of revenue' or 'cost of sales'.",
        "COGS1": "What is the total cost of goods sold of {company} for {year1}? This value can be found on the income statements. Cost of goods sold is also referred to as 'total cost of revenue' or 'cost of sales'.",
        "ACCPAY0": "What is the accounts payable for {company} in {year}? This value can be found on the balance sheet.",
        "ACCPAY1": "What is the accounts payable for {company} in {year1}? This value can be found on the balance sheet.",
        "TRADEREC0": "What is the trade receivables for {company} in {year}? This value can be found on the balance sheet.",
        "TRADEREC1": "What is the trade receivables for {company} in {year1}? This value can be found on the balance sheet.",
        "CASHOPS": "What is the cash from operations for {company} in {year}? This value can be found on the cash flow statement.",
    }
}

In [None]:
# Stage 1: run the subtask executor over the question.
# The first positional argument is left as None; presumably it is an optional alternative
# source for the subtask definitions, which are passed directly as `subtask_info` here --
# confirm against SubtaskExecutor's signature.
subtask_exec = SubtaskExecutor(
    None,
    subtask_info,
    index,
    os_config,
    query_executor=OpenSearchQueryExecutor(os_client_args),
    embedder=embedder,
    knn_query=True,
    llm=openai_llm,
)

# A bare `subtask_docs` expression used to follow this line; because it was not the last
# statement of the cell it displayed nothing, so it has been dropped.
subtask_docs = subtask_exec.execute(input_docset)

# Stage 2: same pipeline configuration as the baseline run, plus the subtask results.
pipeline = EvaluationPipeline(
    index=index,
    os_config=os_config,
    metrics=[],
    query_executor=OpenSearchQueryExecutor(os_client_args),
    embedder=embedder,
    knn_query=True,
    subtask_docs=subtask_docs,
)

query_level_metrics = pipeline.execute(input_docset)[0]
# This replaces whatever an earlier run stored under the same key, so the print cell
# that follows reports the subtask-assisted answer.
data["query_level_data"] = query_level_metrics.take_all()

In [None]:
# Show the generated answer stored on the first query-level record.
print(data["query_level_data"][0]["generated_answer"])