##### In this example, we write the output of a Sycamore job that processes PDF and HTML files to target vector and keyword indexes in OpenSearch.

##### The Aryn Partitioner in this job is configured to use the Aryn Partitioning Service to provide fast, GPU-powered performance. Go to [aryn.ai/sign-up](https://aryn.ai/sign-up) to get a free API key for the service. This is the recommended configuration.

##### You can also run the Aryn Partitioner locally by setting `use_partitioning_service` to `False`. Although the Aryn Partitioner can run on a CPU, an NVIDIA GPU is recommended for good performance.

In [None]:
from pathlib import Path
import datetime
import math
import numpy
import os
import requests
import stat
import sys
import time

import sycamore
from sycamore.functions import HuggingFaceTokenizer, TextOverlapChunker
from sycamore.llms import OpenAI, OpenAIModels
from sycamore.transforms.embed import SentenceTransformerEmbedder
from sycamore.transforms.extract_entity import OpenAIEntityExtractor
from sycamore.transforms.merge_elements import GreedyTextElementMerger
from sycamore.transforms.partition import ArynPartitioner, HtmlPartitioner

# Name of the OpenSearch index that the import functions in this notebook write to.
index = "demoindex_0"

In [None]:
# _DEFAULT_PATH is imported so the failure message can name the location that was checked.
from sycamore.utils.aryn_config import ArynConfig, _DEFAULT_PATH
# Stop early when no Aryn API key is available; this check treats an empty string as "no key".
assert ArynConfig.get_aryn_api_key() != "", f"Unable to find aryn API key.  Looked in {_DEFAULT_PATH}"

If the above assertion fails, you can either set the environment variable ARYN_API_KEY and restart Jupyter,
or create a YAML file at the path named in the assertion error that looks like:

```
aryn_token: "YOUR-ARYN-API-KEY"
```

It is unsafe, but if neither of those options works, you can put the key in this notebook with:
```
import os
os.environ["ARYN_API_KEY"] = "UNSAFE-ARYN-API-KEY-LOCATION" 
```

but beware that it is easy to accidentally commit the notebook file and have it include your key.

In [None]:
%%bash
# Download the sample PDF and HTML documents that the import cells below read.
# Abort at the first failing command so a failed download is not silently ignored.
set -e
mkdir -p default-prep-data/pdf default-prep-data/html
# --fail: exit non-zero on an HTTP error instead of saving the server's error page as the document.
# --location: follow redirects rather than saving the redirect response body.
curl --fail --location https://sortbenchmark.org/ELSAR2022.pdf -o default-prep-data/pdf/elsar.pdf
curl --fail --location https://sortbenchmark.org -o default-prep-data/html/home.html

In [None]:
# Choose the OpenSearch host name: the presence of "/.dockerenv" is taken as the
# sign that this notebook is running inside a container.
opensearch_host = "opensearch" if os.path.exists("/.dockerenv") else "localhost"
if opensearch_host == "opensearch":
    print("Assuming we are in a sycamore jupyter container, using opensearch for opensearch host")
else:
    print("Assuming we are running outside of a container, using localhost for opensearch host")
    
def get_os_client_args():
    """Build the keyword arguments used to connect to OpenSearch.

    The connection targets port 9200 on the module-level ``opensearch_host``,
    uses SSL with certificate and hostname verification switched off, sends
    the ``admin``/``admin`` credential pair, enables HTTP compression, and
    sets a timeout value of 120.

    Returns:
        dict: a newly created argument mapping on every call.
    """
    endpoint = {"host": opensearch_host, "port": 9200}
    return dict(
        hosts=[endpoint],
        http_compress=True,
        http_auth=("admin", "admin"),
        use_ssl=True,
        verify_certs=False,
        ssl_assert_hostname=False,
        ssl_show_warn=False,
        timeout=120,
    )

def get_index_settings():
    """Return the settings and field mappings for the target OpenSearch index.

    The index has k-NN enabled, 5 shards and 1 replica. Four fields are mapped
    as plain text (``text``, ``title``, ``searchable_text``, ``url``) and two
    as 384-dimensional ``knn_vector`` fields (``embedding``,
    ``title_embedding``) using the FAISS engine with HNSW and L2 distance.

    Returns:
        dict: ``{"body": {"settings": ..., "mappings": ...}}``, built afresh
        on every call.
    """

    def text_field():
        return {"type": "text"}

    def knn_vector_field():
        # Built per call so the two vector fields never share a dict object.
        return {
            "dimension": 384,
            "method": {"engine": "faiss", "space_type": "l2", "name": "hnsw", "parameters": {}},
            "type": "knn_vector",
        }

    properties = {
        "text": text_field(),
        "embedding": knn_vector_field(),
        "title": text_field(),
        "searchable_text": text_field(),
        "title_embedding": knn_vector_field(),
        "url": text_field(),
    }
    index_level = {"index.knn": True, "number_of_shards": 5, "number_of_replicas": 1}
    return {"body": {"settings": index_level, "mappings": {"properties": properties}}}

In [None]:
def get_title_context_template():
    """Return the few-shot prompt text used when extracting a document title.

    The text holds two worked examples. Each one lists numbered ``ELEMENT``
    lines, then a separator line of ``=`` characters, then the expected title
    wrapped in double quotes.

    Returns:
        str: the prompt template, including its leading indentation and the
        blank lines between examples.
    """
    # ruff: noqa: E501
    return """
        ELEMENT 1: Jupiter's Moons
        ELEMENT 2: Ganymede 2020
        ELEMENT 3: by Audi Lauper and Serena K. Goldberg. 2011
        ELEMENT 4: From Wikipedia, the free encyclopedia
        ELEMENT 5: Ganymede, or Jupiter III, is the largest and most massive natural satellite of Jupiter as well as in the Solar System, being a planetary-mass moon. It is the largest Solar System object without an atmosphere, despite being the only moon of the Solar System with a magnetic field. Like Titan, it is larger than the planet Mercury, but has somewhat less surface gravity than Mercury, Io or the Moon.
        =========
        "Ganymede 2020"

        ELEMENT 1: FLAVR: Flow-Agnostic Video Representations for Fast Frame Interpolation
        ELEMENT 2: Tarun Kalluri * UCSD
        ELEMENT 3: Deepak Pathak CMU
        ELEMENT 4: Manmohan Chandraker UCSD
        ELEMENT 5: Du Tran Facebook AI
        ELEMENT 6: https://tarun005.github.io/FLAVR/
        ELEMENT 7: 2 2 0 2
        ELEMENT 8: b e F 4 2
        ELEMENT 9: ]
        ELEMENT 10: V C . s c [
        ========
        "FLAVR: Flow-Agnostic Video Representations for Fast Frame Interpolation"

        """

def get_author_context_template():
    """Return the few-shot prompt text used when extracting document authors.

    The text holds two worked examples. Each one lists numbered ``ELEMENT``
    lines, then a separator line of ``=`` characters, then the expected
    answer as a comma-separated list of author names.

    Returns:
        str: the prompt template, including its leading indentation and the
        blank lines between examples.
    """
    # ruff: noqa: E501
    # The first expected answer must spell the name exactly as ELEMENT 3 gives
    # it ("Audi Lauper"), so the example does not demonstrate altering names.
    return """
            ELEMENT 1: Jupiter's Moons
            ELEMENT 2: Ganymede 2020
            ELEMENT 3: by Audi Lauper and Serena K. Goldberg. 2011
            ELEMENT 4: From Wikipedia, the free encyclopedia
            ELEMENT 5: Ganymede, or Jupiter III, is the largest and most massive natural satellite of Jupiter as well as in the Solar System, being a planetary-mass moon. It is the largest Solar System object without an atmosphere, despite being the only moon of the Solar System with a magnetic field. Like Titan, it is larger than the planet Mercury, but has somewhat less surface gravity than Mercury, Io or the Moon.
            =========
            Audi Lauper, Serena K. Goldberg

            ELEMENT 1: FLAVR: Flow-Agnostic Video Representations for Fast Frame Interpolation
            ELEMENT 2: Tarun Kalluri * UCSD
            ELEMENT 3: Deepak Pathak CMU
            ELEMENT 4: Manmohan Chandraker UCSD
            ELEMENT 5: Du Tran Facebook AI
            ELEMENT 6: https://tarun005.github.io/FLAVR/
            ELEMENT 7: 2 2 0 2
            ELEMENT 8: b e F 4 2
            ELEMENT 9: ]
            ELEMENT 10: V C . s c [
            ========
            Tarun Kalluri, Deepak Pathak, Manmohan Chandraker, Du Tran

            """

In [None]:
def import_pdf(paths):
    """Partition, enrich, embed and index PDF documents into OpenSearch.

    The pipeline reads the inputs as PDF binaries, partitions them with
    ``ArynPartitioner``, merges elements with a greedy text merger capped at
    256 tokens, extracts ``title`` and ``authors`` entities through OpenAI,
    spreads the ``path`` and ``title`` properties, explodes, sketches and
    embeds with ``sentence-transformers/all-MiniLM-L6-v2``, then writes to
    the OpenSearch index named by the module-level ``index``.

    Args:
        paths: a single path (``str`` or ``pathlib.Path``) or a collection of
            paths, passed on to ``ctx.read.binary``.

    Returns:
        None. A warning is printed and nothing is indexed when ``paths`` is
        empty or names an empty directory.
    """
    # A truthiness test instead of ``len(paths)``: ``pathlib.Path`` defines no
    # ``__len__``, so ``len()`` raised TypeError for the very Path inputs that
    # the directory check below is written to accept.
    if not paths:
        print("WARNING: import_pdf called with empty paths")
        return

    if isinstance(paths, (str, Path)):
        directory = Path(paths)
        # is_dir() is already False for a missing path, and any() stops at the
        # first entry instead of listing the whole directory.
        if directory.is_dir() and not any(directory.iterdir()):
            print("WARNING: import_pdf called with empty directory")
            return

    openai_llm = OpenAI(OpenAIModels.GPT_3_5_TURBO_INSTRUCT.value)
    tokenizer = HuggingFaceTokenizer("sentence-transformers/all-MiniLM-L6-v2")
    merger = GreedyTextElementMerger(tokenizer, 256)

    ctx = sycamore.init(exec_mode=sycamore.EXEC_LOCAL)
    (
        ctx.read.binary(paths, binary_format="pdf", filter_paths_by_extension=False)
        .partition(
            partitioner=ArynPartitioner(),
        )
        .merge(merger)
        .extract_entity(
            entity_extractor=OpenAIEntityExtractor(
                "title", llm=openai_llm, prompt_template=get_title_context_template()
            )
        )
        .extract_entity(
            entity_extractor=OpenAIEntityExtractor(
                "authors", llm=openai_llm, prompt_template=get_author_context_template()
            )
        )
        .spread_properties(["path", "title"])
        .explode()
        .sketch()
        .embed(
            embedder=SentenceTransformerEmbedder(batch_size=100, model_name="sentence-transformers/all-MiniLM-L6-v2")
        )
        .write.opensearch(os_client_args=get_os_client_args(), index_name=index, index_settings=get_index_settings())
    )

In [None]:
def import_html(paths):
    """Partition, embed and index HTML documents into OpenSearch.

    The pipeline reads the inputs as HTML binaries, partitions them with
    ``HtmlPartitioner`` (table extraction on, overlapping text chunks of 4000
    tokens with a 400-token overlap), spreads the ``path`` and ``title``
    properties, explodes, sketches and embeds with
    ``sentence-transformers/all-MiniLM-L6-v2``, then writes to the OpenSearch
    index named by the module-level ``index``.

    Args:
        paths: a single path (``str`` or ``pathlib.Path``) or a collection of
            paths, passed on to ``ctx.read.binary``.

    Returns:
        None. A warning is printed and nothing is indexed when ``paths`` is
        empty or names an empty directory.
    """
    # A truthiness test instead of ``len(paths)``: ``pathlib.Path`` defines no
    # ``__len__``, so ``len()`` raised TypeError for the very Path inputs that
    # the directory check below is written to accept.
    if not paths:
        print("WARNING: import_html called with empty paths")
        return

    if isinstance(paths, (str, Path)):
        directory = Path(paths)
        # is_dir() is already False for a missing path, and any() stops at the
        # first entry instead of listing the whole directory.
        if directory.is_dir() and not any(directory.iterdir()):
            print("WARNING: import_html called with empty directory")
            return

    ctx = sycamore.init(exec_mode=sycamore.EXEC_LOCAL)
    (
        ctx.read.binary(paths, binary_format="html", filter_paths_by_extension=False)
        .partition(
            partitioner=HtmlPartitioner(
                extract_tables=True,
                text_chunker=TextOverlapChunker(chunk_token_count=4000, chunk_overlap_token_count=400),
            )
        )
        .spread_properties(["path", "title"])
        .explode()
        .sketch()
        .embed(
            embedder=SentenceTransformerEmbedder(batch_size=100, model_name="sentence-transformers/all-MiniLM-L6-v2")
        )
        .write.opensearch(os_client_args=get_os_client_args(), index_name=index, index_settings=get_index_settings())
    )

In [None]:
# Run the HTML import over the directory that the download cell filled with home.html.
import_html("default-prep-data/html")

In [None]:
# Run the PDF import over the directory that the download cell filled with elsar.pdf.
import_pdf("default-prep-data/pdf")

In [None]:
# Tell the reader where to query the indexed documents. When "/etc/opt/aryn"
# is absent, a localhost URL is printed, using plain http only if the SSL
# environment variable is exactly "0".
if not os.path.isdir("/etc/opt/aryn"):
    protocol = "http" if os.environ.get("SSL") == "0" else "https"
    print(f"Visit {protocol}://localhost:3000 and use the {index} index to query these documents in the UI")
else:
    print(f"Visit the UI at the URL provided at stack creation and use index {index} to query these documents")