##### In this example, we will run a Sycamore job that processes PDF documents and writes its output to target vector and keyword indexes in OpenSearch.

##### The Aryn Partitioner in this job is configured to use the Aryn Partitioning Service to provide fast, GPU-powered performance. Go to [aryn.ai/sign-up](https://aryn.ai/sign-up) to get a free API key for the service. This is the recommended configuration.

##### You can also run the Aryn Partitioner locally by setting `use_partitioning_service` to `False`. Although the Aryn Partitioner can run on a CPU, an NVIDIA GPU is recommended for good performance.



In [None]:
import sycamore
import os
# Initializing Sycamore, which also initializes Ray underneath
context = sycamore.init()

# S3 file path to the SortBenchmark dataset
paths = "s3://aryn-public/sort-benchmark/pdf/"

# Creating a DocSet by reading the files at `paths` as binary PDF documents
docset = context.read.binary(
    paths,
    binary_format="pdf",
    parallelism=1,
)

In [None]:
from sycamore.utils.aryn_config import ArynConfig, _DEFAULT_PATH
# Stop early with a pointer to the config file location if no Aryn API key is configured.
assert ArynConfig.get_aryn_api_key() != "", (
    f"Unable to find aryn API key.  Looked in {_DEFAULT_PATH}"
)

If the above assertion fails, you can either set the environment variable `ARYN_API_KEY` and restart Jupyter,
or create a YAML file at the path given in the assertion error that looks like:

```
aryn_token: "YOUR-ARYN-API-KEY"
```

It is unsafe, but if neither of those options works, you can put the key directly in this notebook with
```
import os
os.environ["ARYN_API_KEY"] = "UNSAFE-ARYN-API-KEY-LOCATION" 
```

but beware that it is easy to accidentally commit the notebook file and have it include your key.

In [None]:
from sycamore.transforms.partition import ArynPartitioner

# We are using ArynPartitioner to partition the documents.
# Sycamore supports pluggable partitioners for different formats.
partitioner = ArynPartitioner()
docset = docset.partition(partitioner=partitioner)

In [None]:
from sycamore.transforms.extract_entity import OpenAIEntityExtractor
from sycamore.llms import OpenAIModels, OpenAI


# The following prompt templates will be used to extract the relevant entities
#
# Few-shot examples for title extraction. Each example lists numbered
# "ELEMENT n:" lines of text, then a separator line of "=" characters, then
# the quoted title expected for that group of elements. The string is passed
# as `prompt_template` to the "title" OpenAIEntityExtractor below.
title_prompt_template = """
    ELEMENT 1: Jupiter's Moons
    ELEMENT 2: Ganymede 2020
    ELEMENT 3: by Audi Lauper and Serena K. Goldberg. 2011
    ELEMENT 4: From Wikipedia, the free encyclopedia
    ELEMENT 5: Ganymede, or Jupiter III, is the largest and most massive natural satellite of Jupiter as well as in the Solar System, being a planetary-mass moon. It is the largest Solar System object without an atmosphere, despite being the only moon of the Solar System with a magnetic field. Like Titan, it is larger than the planet Mercury, but has somewhat less surface gravity than Mercury, Io or the Moon.
    =========
    "Ganymede 2020"

    ELEMENT 1: FLAVR: Flow-Agnostic Video Representations for Fast Frame Interpolation
    ELEMENT 2: Tarun Kalluri * UCSD
    ELEMENT 3: Deepak Pathak CMU
    ELEMENT 4: Manmohan Chandraker UCSD
    ELEMENT 5: Du Tran Facebook AI
    ELEMENT 6: https://tarun005.github.io/FLAVR/
    ELEMENT 7: 2 2 0 2
    ELEMENT 8: b e F 4 2
    ELEMENT 9: ]
    ELEMENT 10: V C . s c [
    ========
    "FLAVR: Flow-Agnostic Video Representations for Fast Frame Interpolation"

    """

# Few-shot examples for author extraction. Each example lists numbered
# "ELEMENT n:" lines of text, then a separator line of "=" characters, then
# the comma-separated author names expected for that group of elements.
# The expected answers must spell each name exactly as it appears in the
# elements, since the examples demonstrate copying names from the text.
author_prompt_template = """
    ELEMENT 1: Jupiter's Moons
    ELEMENT 2: Ganymede 2020
    ELEMENT 3: by Audi Lauper and Serena K. Goldberg. 2011
    ELEMENT 4: From Wikipedia, the free encyclopedia
    ELEMENT 5: Ganymede, or Jupiter III, is the largest and most massive natural satellite of Jupiter as well as in the Solar System, being a planetary-mass moon. It is the largest Solar System object without an atmosphere, despite being the only moon of the Solar System with a magnetic field. Like Titan, it is larger than the planet Mercury, but has somewhat less surface gravity than Mercury, Io or the Moon.
    =========
    Audi Lauper, Serena K. Goldberg

    ELEMENT 1: FLAVR: Flow-Agnostic Video Representations for Fast Frame Interpolation
    ELEMENT 2: Tarun Kalluri * UCSD
    ELEMENT 3: Deepak Pathak CMU
    ELEMENT 4: Manmohan Chandraker UCSD
    ELEMENT 5: Du Tran Facebook AI
    ELEMENT 6: https://tarun005.github.io/FLAVR/
    ELEMENT 7: 2 2 0 2
    ELEMENT 8: b e F 4 2
    ELEMENT 9: ]
    ELEMENT 10: V C . s c [
    ========
    Tarun Kalluri, Deepak Pathak, Manmohan Chandraker, Du Tran

    """

# We are using OpenAIEntityExtractor which utilizes OpenAI and gpt-3.5-turbo model.
# You can write your own EntityExtractor as well.

# The OpenAI API key is read from the OPENAI_API_KEY environment variable;
# set it before running this cell.
openai = OpenAI(
    OpenAIModels.GPT_3_5_TURBO.value,
    api_key=os.environ.get("OPENAI_API_KEY"),
)

# Extract the "title" entity first, using the title few-shot template.
title_extractor = OpenAIEntityExtractor("title", llm=openai, prompt_template=title_prompt_template)
docset = docset.extract_entity(entity_extractor=title_extractor)

# Then extract the "authors" entity, using the author few-shot template.
authors_extractor = OpenAIEntityExtractor("authors", llm=openai, prompt_template=author_prompt_template)
docset = docset.extract_entity(entity_extractor=authors_extractor)

In [None]:
from sycamore.transforms.embed import SentenceTransformerEmbedder

# Explode the documents and add sketches before embedding.
docset = docset.explode().sketch()

# We are using SentenceTransformerEmbedder to embed the content of each document; it
# uses the SentenceTransformer model named below. You can write your own Embedder as well.
embedder = SentenceTransformerEmbedder(
    batch_size=10_000,
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)
docset = docset.embed(embedder=embedder)

In [None]:
# Connection arguments handed to the OpenSearch writer as `os_client_args`.
# They target an instance on localhost:9200 over SSL, with certificate and
# hostname verification turned off and the "admin"/"admin" credentials.
openSearch_client_args = dict(
    hosts=[dict(host="localhost", port=9200)],
    http_compress=True,
    http_auth=("admin", "admin"),
    use_ssl=True,
    verify_certs=False,
    ssl_assert_hostname=False,
    ssl_show_warn=False,
    timeout=120,
)

# Index-level settings: k-NN enabled, two shards, one replica.
knn_index_options = {
    "index.knn": True,
    "number_of_shards": 2,
    "number_of_replicas": 1,
}

# Mapping for the "embeddings" field: a 384-dimensional knn_vector using the
# hnsw method on the faiss engine.
# NOTE(review): the dimension presumably has to match the output size of the
# embedding model used earlier in this notebook; confirm if the model changes.
embeddings_field = {
    "type": "knn_vector",
    "dimension": 384,
    "method": {"name": "hnsw", "engine": "faiss"},
}

# Full index definition passed to the OpenSearch writer as `index_settings`.
index_settings = {
    "body": {
        "settings": knn_index_options,
        "mappings": {
            "properties": {
                "embeddings": embeddings_field,
                "text": {"type": "text"},
            }
        },
    }
}

# Write the DocSet to the "sort-benchmark" OpenSearch index, using the client
# arguments and index settings defined above.
opensearch_write_kwargs = {
    "os_client_args": openSearch_client_args,
    "index_name": "sort-benchmark",
    "index_settings": index_settings,
}
docset.write.opensearch(**opensearch_write_kwargs)