In [None]:
!pip install sycamore-ai[opensearch]
# DocPrep code uses the Sycamore document ETL library: https://github.com/aryn-ai/sycamore 

In [None]:
!apt-get install poppler-utils

In [None]:
import pyarrow.fs
import sycamore
import json
import os
from opensearchpy import OpenSearch
from sycamore.functions.tokenizer import OpenAITokenizer
from sycamore.llms import OpenAIModels, OpenAI
from sycamore.transforms import COALESCE_WHITESPACE
from sycamore.transforms.merge_elements import GreedyTextElementMerger
from sycamore.transforms.partition import ArynPartitioner
from sycamore.transforms.embed import OpenAIEmbedder
from sycamore.materialize_config import MaterializeSourceMode
from sycamore.utils.pdf_utils import show_pages
from sycamore.transforms.summarize_images import SummarizeImages
from sycamore.context import ExecMode

In [None]:
# It's best to store API keys in a configuration file or set them as environment variables.  
# For quick testing, you can define them here:
#
# os.environ["ARYN_API_KEY"] = "YOUR_ARYN_API_KEY"
# os.environ["OS_USER_NAME"] = "YOUR_OPENSEARCH_USER_NAME"
# os.environ["OS_PASSWORD"] = "YOUR_OPENSEARCH_PASSWORD"
# os.environ["OPENAI_API_KEY"] = "YOUR_OPENAI_API_KEY"

In [None]:
# Sycamore evaluates lazily for efficiency: the ETL pipeline described below only runs
# when a cell calls one of the functions that force execution.

# Embedding model and its parameters
model_name = "text-embedding-3-small"
max_tokens = 8191
dimensions = 1536

# Input documents. Configure your AWS credentials here if the bucket is private.
paths = ["s3://aryn-public/ntsb/59.pdf"]
fsys = pyarrow.fs.S3FileSystem(region="us-east-1", anonymous=True)

# Sycamore context, followed by the tokenizer for the embedding model
ctx = sycamore.init(ExecMode.LOCAL)
tokenizer = OpenAITokenizer(model_name)

# Stage 1: read the PDFs as binary documents
raw_ds = ctx.read.binary(paths, binary_format="pdf", filesystem=fsys)

# Stage 2: partition each document, extracting tables and images along the way
partitioner = ArynPartitioner(
    threshold="auto",
    use_ocr=True,
    extract_table_structure=True,
    extract_images=True,
)
partitioned_ds = raw_ds.partition(partitioner=partitioner)

# Stage 3: cache the partitioner output with materialize. If you change upstream code or
# the input files, switch USE_STORED to RECOMPUTE so that a new cache is created.
cached_ds = partitioned_ds.materialize(
    path="./materialize/partitioned",
    source_mode=MaterializeSourceMode.USE_STORED,
)

# Stage 4: merge elements into larger chunks
merger = GreedyTextElementMerger(
    tokenizer=tokenizer,
    max_tokens=max_tokens,
    merge_across_pages=False,
)
merged_ds = cached_ds.merge(merger=merger)

# Stage 5: split any element that is still too big to embed
ds = merged_ds.split_elements(tokenizer=tokenizer, max_tokens=max_tokens)

# Run the pipeline
ds.execute()

# Display the first 3 pages after chunking
show_pages(ds, limit=3)


In [None]:
# Step 1: copy the listed document properties onto each Document's sub-elements
propagated_ds = ds.spread_properties(["path", "entity"])

# Step 2: convert every Element into a Document of its own
exploded_ds = propagated_ds.explode()

# Step 3: embed each Document. You can change the embedding model; make sure your target
# vector index is configured with the matching number of dimensions.
embedded_ds = exploded_ds.embed(embedder=OpenAIEmbedder(model_name=model_name))

# To learn more about DocSet transforms, visit https://sycamore.readthedocs.io/en/latest/sycamore/transforms.html


In [None]:
index_name = "docprep-test-index-2"

# Read the OpenSearch credentials up front so that a missing variable fails here with a
# clear message rather than later with an opaque client or authentication error.
os_user_name = os.getenv("OS_USER_NAME")
os_password = os.getenv("OS_PASSWORD")
if not os_user_name or not os_password:
    raise RuntimeError(
        "Set the OS_USER_NAME and OS_PASSWORD environment variables before writing to OpenSearch."
    )

# Configure the OpenSearch client arguments
os_client_args = {
    "hosts": [{"host": "search-aryn-blog-test-kmzf2omtmydwhsnhov6xlj7y5m.us-east-1.es.amazonaws.com", "port": 443}],
    "http_auth": (os_user_name, os_password),
    # Verify the server's TLS certificate so the credentials above are only sent to an
    # authenticated endpoint. Disable this only for a local test cluster that uses a
    # self-signed certificate.
    "verify_certs": True,
    "use_ssl": True,
}

# Configure the settings and mappings for the OpenSearch index
index_settings = {
    "body": {
        "settings": {
            # Enable k-NN search on this index
            "index.knn": True,
        },
        "mappings": {
            "properties": {
                "embedding": {
                    "type": "knn_vector",
                    # Must equal the output size of the embedding model used upstream
                    "dimension": dimensions,
                    "method": {"name": "hnsw", "engine": "faiss"},
                },
            },
        },
    },
}

# Write the docset to the specified OpenSearch index
embedded_ds.write.opensearch(
    os_client_args=os_client_args,
    index_name=index_name,
    index_settings=index_settings,
)

In [None]:
# Confirm that the data was loaded by reading the chunks back with a DocSet query
match_all_query = {"query": {"match_all": {}}}
query_docs = ctx.read.opensearch(
    os_client_args=os_client_args,
    index_name=index_name,
    query=match_all_query,
)
# Show the retrieved documents, leaving the embedding vectors out of the display
query_docs.show(show_embedding=False)