##### In this example, we will run a Sycamore job that processes PDFs stored in S3 and write its output to target vector and keyword indexes in OpenSearch.

##### The Aryn Partitioner in this job is configured to use the Aryn Partitioning Service to provide fast, GPU-powered performance. Go to [aryn.ai/sign-up](https://aryn.ai/sign-up) to get a free API key for the service. This is the recommended configuration.

##### You can also run the Aryn Partitioner locally by setting `use_partitioning_service` to `False`. Although the Aryn Partitioner can run on a CPU, an NVIDIA GPU is recommended for good performance.

In [None]:
from sycamore.data import Document
from sycamore.functions import HuggingFaceTokenizer
from sycamore.llms import OpenAI, OpenAIModels
from sycamore.transforms.extract_schema import OpenAISchemaExtractor, OpenAIPropertyExtractor
from sycamore.transforms.extract_entity import OpenAIEntityExtractor
from sycamore.transforms.merge_elements import GreedyTextElementMerger
from sycamore.transforms.partition import ArynPartitioner
import sycamore
import os 

In [None]:
# Fail fast when no Aryn API key is configured: the check rejects an empty key
# and the error message reports the config path that was searched.
from sycamore.utils.aryn_config import ArynConfig, _DEFAULT_PATH
assert ArynConfig.get_aryn_api_key() != "", f"Unable to find aryn API key.  Looked in {_DEFAULT_PATH}"

If the above assertion fails, you can either set the environment variable `ARYN_API_KEY` and restart Jupyter,
or create a YAML file at the path shown in the assertion error that looks like:

```
aryn_token: "YOUR-ARYN-API-KEY"
```

It is unsafe, but if neither of those options works, you can set the key directly in this notebook with
```
import os
os.environ["ARYN_API_KEY"] = "UNSAFE-ARYN-API-KEY-LOCATION" 
```

but beware that it is easy to accidentally commit the notebook file and have it include your key.

In [None]:
from dateutil import parser


def convert_timestamp(doc: Document) -> Document:
    """Derive normalized date fields from the extracted accident timestamp.

    The raw timestamp is read from ``doc.properties['entity']`` under the key
    ``dateAndTime`` or, failing that, ``dateTime``. When a usable string is
    found, two fields are added to the same ``entity`` dict:

    * ``day``: the calendar date in ISO 8601 form (``YYYY-MM-DD``).
    * ``isoDateTime``: the full timestamp in ISO 8601 form. A timestamp that
      carries a UTC offset keeps that offset; a naive one gets a ``Z`` suffix.

    Args:
        doc: Document whose ``properties`` may contain an ``entity`` dict.

    Returns:
        The same ``doc`` object, updated in place. It is returned unchanged
        when there is no ``entity`` dict or no non-empty timestamp string.

    Raises:
        ValueError, OverflowError: propagated from ``dateutil.parser.parse``
            when the timestamp string cannot be interpreted as a date.
    """
    entity = doc.properties.get('entity')
    if not entity:
        return doc

    raw_date = entity.get('dateAndTime') or entity.get('dateTime')
    # The key can be present with a null / non-string value; treat that the
    # same as a missing timestamp instead of crashing on ``.replace``.
    if not isinstance(raw_date, str) or not raw_date.strip():
        return doc

    # "Local" is stripped before parsing; fuzzy mode lets the parser skip any
    # other tokens it does not recognize.
    parsed_date = parser.parse(raw_date.replace("Local", ""), fuzzy=True)
    entity['day'] = parsed_date.date().isoformat()

    iso_date_time = parsed_date.isoformat()
    # Compare against None rather than truthiness: a zero offset (UTC) is a
    # falsy timedelta, and isoformat() has already emitted "+00:00" for it.
    if parsed_date.utcoffset() is None:
        iso_date_time += "Z"
    entity['isoDateTime'] = iso_date_time

    return doc

### Data loading

In [None]:
# Input location of the PDFs read by the partitioning cell below.
s3_path = "s3://aryn-public/ntsb/"
# Location passed to materialize() after partitioning.
partition_materialize_path = "s3://aryn-public/materialize/notebooks/partition-ntsb/2024-10-11"

# LLM handed to the schema and property extractors in the extraction cell.
llm = OpenAI(OpenAIModels.GPT_3_5_TURBO.value)
# Tokenizer handed to GreedyTextElementMerger in the extraction cell.
tokenizer = HuggingFaceTokenizer("thenlper/gte-small")

# Sycamore context from which every read in this notebook starts.
ctx = sycamore.init()

In [None]:
# Read the PDFs under s3_path as binary documents, partition them with the
# Aryn Partitioner, and pass the result through the materialize location.
# NOTE(review): MATERIALIZE_USE_STORED presumably reuses data already stored at
# that path instead of recomputing it; confirm against the Sycamore docs.
docset = (
    ctx.read.binary(s3_path, binary_format="pdf")
    .partition(partitioner=ArynPartitioner())
    .materialize(path=partition_materialize_path, source_mode=sycamore.MATERIALIZE_USE_STORED) # avoid expensive re-partitioning step
    )
# Run the pipeline defined above.
docset.execute()

### Data extraction

In [None]:
# Extend the partitioned docset with, in order: schema extraction, property
# extraction (both using `llm` and num_of_elements=35), merging of text elements
# with GreedyTextElementMerger (tokenizer, 300), and the convert_timestamp
# mapping that adds 'day' and 'isoDateTime' to each document's 'entity' dict.
# The result goes through a local materialize path.
docset = (
    docset
    .extract_batch_schema(schema_extractor=OpenAISchemaExtractor("FlightAccidentReport", llm=llm, num_of_elements=35))
    .extract_properties(property_extractor=OpenAIPropertyExtractor(llm=llm, num_of_elements=35))
    .merge(GreedyTextElementMerger(tokenizer, 300))
    .map(convert_timestamp)
    .materialize("tmp/metadata-extraction/post-convert-timestamp", source_mode=sycamore.MATERIALIZE_USE_STORED) # avoid recomputation after take
    )

In [None]:
# Pull documents from the docset into a local collection; the following cells
# index into it and iterate over it for inspection.
sample_docs = docset.take()

In [None]:
# Display the '_schema' property of the first sampled document.
sample_docs[0].properties['_schema']

In [None]:
# Display the 'entity' property of the first sampled document (the dict that
# convert_timestamp reads and updates).
sample_docs[0].properties['entity']

In [None]:
# For every sampled document, print its path followed by a fixed set of
# entity fields in a left-aligned column; absent fields print as 'None'.
for sample in sample_docs:
    print(f"Path: {sample.properties['path']}")
    extracted = sample.properties['entity']
    for field in ("dateTime", "day", "aircraft", "location"):
        print(f"{field: <25} {extracted.get(field, 'None')}")
    print()

### Indexing

In [None]:
from sycamore.transforms.embed import SentenceTransformerEmbedder
import os

# Choose the OpenSearch host name from the presence of /.dockerenv: inside a
# container the service is addressed as "opensearch", otherwise as "localhost".
opensearch_host, host_message = (
    ("opensearch", "Assuming we are in a Sycamore Jupyter container, using opensearch for OpenSearch host")
    if os.path.exists("/.dockerenv")
    else ("localhost", "Assuming we are running outside of a container, using localhost for OpenSearch host")
)
print(host_message)

# Name of the OpenSearch index the final cell writes to.
index = "ntsb_demoindex0"
# Connection settings passed to the OpenSearch writer as `os_client_args`.
# NOTE(review): the admin/admin credentials and `verify_certs: False` look
# intended for a local demo cluster only; confirm before pointing this at a
# real deployment.
os_client_args = {
    "hosts": [{"host": opensearch_host, "port": 9200}],
    "http_compress": True,
    "http_auth": ("admin", "admin"),
    "use_ssl": True,
    "verify_certs": False,
    "ssl_assert_hostname": False,
    "ssl_show_warn": False,
    "timeout": 120,
}

# k-NN method configuration for the vector field.
hnsw_method = {
    "engine": "faiss",
    "space_type": "l2",
    "name": "hnsw",
    "parameters": {},
}

# Mapping of the "embedding" field as a 384-dimensional knn_vector.
embedding_mapping = {
    "dimension": 384,
    "method": hnsw_method,
    "type": "knn_vector",
}

# Index creation body: k-NN enabled index settings plus the field mappings.
index_settings = {
    "body": {
        "settings": {
            "index.knn": True,
            "number_of_shards": 5,
            "number_of_replicas": 1,
        },
        "mappings": {
            "properties": {
                "embedding": embedding_mapping,
            },
        },
    },
}

In [None]:
# Prepare the docset for indexing: spread the "entity" and "path" properties,
# explode, sketch, then embed with a SentenceTransformer model.
# NOTE(review): spread_properties presumably copies the listed document-level
# properties onto each element, and explode presumably turns elements into
# standalone documents; confirm against the Sycamore docs.
# The vector size produced by the embedding model has to match the `dimension`
# (384) declared for the "embedding" field in index_settings.
docset = (
    docset
    .spread_properties(["entity", "path"])
    .explode()
    .sketch()
    .embed(embedder=SentenceTransformerEmbedder(batch_size=100, model_name="sentence-transformers/all-MiniLM-L6-v2"))
)


In [None]:
# Write the embedded docset to OpenSearch, using the connection settings, index
# name and index settings defined in the Indexing configuration cell.
docset.write.opensearch(
    os_client_args=os_client_args,
    index_name=index,
    index_settings=index_settings,
)