The Aryn Partitioner in this job is configured to use the Aryn Partitioning Service to provide fast, GPU-powered performance. Go to [aryn.ai/sign-up](https://aryn.ai/sign-up) to get a free API key for the service. This is the recommended configuration.

You can also run the Aryn Partitioner locally by setting `use_partitioning_service` to `False`. Though you can use CPU to run the Aryn Partitioner, it is recommended to use an NVIDIA GPU for good performance.

In [None]:
import pyarrow.fs
import sycamore
from sycamore.functions.tokenizer import HuggingFaceTokenizer
from sycamore.llms import OpenAIModels, OpenAI
from sycamore.transforms import COALESCE_WHITESPACE
from sycamore.transforms.merge_elements import MarkedMerger
from sycamore.transforms.partition import ArynPartitioner
from sycamore.transforms.extract_schema import OpenAISchemaExtractor, OpenAIPropertyExtractor
from sycamore.transforms.embed import SentenceTransformerEmbedder
from opensearchpy import OpenSearch

In [None]:
# Define a function used for cleaning and formatting later on in the notebook

from sycamore.data.document import Document
from dateutil import parser
def convert_timestamp(doc: Document) -> Document:
    """Normalize the timestamp stored in ``doc.properties['entity']``.

    The raw value is read from the ``dateAndTime`` key, falling back to
    ``dateTime``. It is parsed and written back as separate ``day``, ``month``
    and ``year`` entries plus an ISO-8601 ``dateTime`` string.

    Args:
        doc: Document whose ``properties['entity']`` dict holds the extracted fields.

    Returns:
        The same document, modified in place. It is returned untouched when
        neither key holds a non-empty value.

    Raises:
        KeyError: if ``doc.properties`` has no ``'entity'`` entry.
        Exception: whatever ``parser.parse`` raises for an unparseable value
            is propagated unchanged.
    """
    entity = doc.properties['entity']

    # A key may be present but hold None or "" (nothing was extracted);
    # treat that the same as a missing key instead of crashing on .replace().
    raw_date = entity.get('dateAndTime') or entity.get('dateTime')
    if not raw_date:
        return doc

    # Drop the literal "Local" marker before parsing.
    raw_date = raw_date.replace("Local", "")
    parsed_date = parser.parse(raw_date, fuzzy=True)

    entity['day'] = parsed_date.day
    entity['month'] = parsed_date.month
    entity['year'] = parsed_date.year

    # Compare against None rather than testing truthiness: a UTC timestamp has
    # an offset of timedelta(0), which is falsy, and would otherwise be
    # rendered as the invalid "...+00:00Z".
    if parsed_date.utcoffset() is not None:
        entity['dateTime'] = parsed_date.isoformat()
    else:
        # Naive timestamps carry no offset; they are suffixed with "Z",
        # i.e. reported as UTC.
        entity['dateTime'] = parsed_date.isoformat() + "Z"

    return doc

In [None]:
# Fail fast if no Aryn API key is configured; the assertion message names the
# path (_DEFAULT_PATH) that was checked.
from sycamore.utils.aryn_config import ArynConfig, _DEFAULT_PATH
assert ArynConfig.get_aryn_api_key() != "", f"Unable to find aryn API key.  Looked in {_DEFAULT_PATH}"

If the above assertion fails, you can either set the environment variable `ARYN_API_KEY` and restart Jupyter,
or create a YAML file at the path shown in the assertion error that looks like:

```
aryn_token: "YOUR-ARYN-API-KEY"
```

It is unsafe, but if neither of those options works, you can set the key directly in this notebook with:
```
import os
os.environ["ARYN_API_KEY"] = "UNSAFE-ARYN-API-KEY-LOCATION" 
```

but beware that it is easy to accidentally commit the notebook file and have it include your key.

In [None]:
# Input: PDFs under the public NTSB prefix, read from S3 without credentials.
paths = ["s3://aryn-public/NTSB/"]
fsys = pyarrow.fs.S3FileSystem(region="us-east-1", anonymous=True)

llm = OpenAI(OpenAIModels.GPT_3_5_TURBO.value)
tokenizer = HuggingFaceTokenizer("thenlper/gte-small")

ctx = sycamore.init()

# Build the pipeline components up front so the chain below reads as a plain list of steps.
partitioner = ArynPartitioner(extract_images=True, extract_table_structure=True)
schema_extractor = OpenAISchemaExtractor("FlightAccidentReport", llm=llm, num_of_elements=35)
property_extractor = OpenAIPropertyExtractor(llm=llm, num_of_elements=35)
doc_embedder = SentenceTransformerEmbedder(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    batch_size=100,
)

ds = (
    ctx.read.binary(paths, binary_format="pdf", filesystem=fsys)
    # Partition with the Aryn partitioner remotely, pulling out tables and images.
    .partition(partitioner=partitioner)
    # Get rid of spurious whitespace characters
    .regex_replace(COALESCE_WHITESPACE)
    # Automatically determine a schema of additional metadata to extract from Documents
    .extract_batch_schema(schema_extractor=schema_extractor)
    # Extract the metadata specified by that schema
    .extract_properties(property_extractor=property_extractor)
    # Merge elements into larger chunks
    .mark_bbox_preset(tokenizer=tokenizer)
    .merge(merger=MarkedMerger())
    # Convert extracted timestamps to better-structured form using convert_timestamp
    .map(convert_timestamp)
    # Copy document properties to each Document's sub-elements
    .spread_properties(["path", "entity"])
    # Split elements that are too big to embed
    .split_elements(tokenizer=tokenizer, max_tokens=512)
    # Convert all Elements to Documents
    .explode()
    # Generate a series of hashes to represent each Document. For use with near-duplicate detection
    .sketch()
    # Embed each Document
    .embed(embedder=doc_embedder)
)

In [None]:
# Write the DocSet to a persistent OpenSearch index:
# - on a specific cluster (as specified by os_client_args)
# - into a specific index (as specified by index_name)
# - with specific index settings (as specified by index_settings)
# Note: an OpenSearch instance must already be running for this to work. For more information on
# how to set one up, refer to https://opensearch.org/docs/latest/install-and-configure/install-opensearch/index/
index_name = "demo_index"

# NOTE(review): admin/admin credentials and disabled certificate verification
# look like local-demo settings; do not reuse them against a real cluster.
os_client_args = {
    "hosts": [{"host": "localhost", "port": 9200}],
    "http_compress": True,
    "http_auth": ("admin", "admin"),
    "use_ssl": True,
    "verify_certs": False,
    "ssl_assert_hostname": False,
    "ssl_show_warn": False,
    "timeout": 120,
}

# k-NN vector field for the embeddings. The dimension presumably has to match
# the length of the vectors produced by the embed step; verify if the model changes.
embedding_mapping = {
    "type": "knn_vector",
    "dimension": 384,
    "method": {"name": "hnsw", "engine": "faiss"},
}

index_settings = {
    "body": {
        "settings": {
            "index.knn": True,
            "number_of_shards": 5,
            "number_of_replicas": 1,
        },
        "mappings": {
            "properties": {
                "embedding": embedding_mapping,
                "text": {"type": "text"},
            }
        },
    }
}

ds.write.opensearch(
    os_client_args=os_client_args,
    index_name=index_name,
    index_settings=index_settings,
)

In [None]:
# Open an OpenSearch client with the same connection settings used for the write,
# so we can run queries against the index
client = OpenSearch(**os_client_args)

# Printing the cluster info doubles as a quick connectivity check
cluster_info = client.info()
print(cluster_info)

In [None]:
# Sanity check: confirm that plain text search works against the index

result = client.search(
    index=index_name,
    body={
        "query": {"match": {"text_representation": "traffic"}},
        # Only return the text field of each matching document
        "_source": ["text_representation"],
        "size": 5,
    },
)
# Print each raw hit as returned by OpenSearch
for hit in result['hits']['hits']:
    print(hit)
    print("-" * 50)  # Separator between results

In [None]:
# To run nearest-neighbor (NN) search, the question has to be embedded as a vector
# that can be compared with the stored document vectors. Load the embedding model
# here; it is wrapped in a LangChain-compatible class below.
from langchain_core.embeddings import Embeddings
from sentence_transformers import SentenceTransformer

minilm = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

class Embedder(Embeddings):
    """Expose a model that has an ``encode`` method through the ``Embeddings`` interface."""

    def __init__(self, llm):
        # Despite the parameter name, this is the embedding model: any object
        # whose encode(text) result supports .tolist().
        self.llm = llm

    def embed_query(self, question):
        """Encode a single question and return the result of ``.tolist()``."""
        return self.llm.encode(question).tolist()

    def embed_documents(self, documents):
        """Encode each document separately and return the vectors in input order."""
        vectors = []
        for text in documents:
            vectors.append(self.llm.encode(text).tolist())
        return vectors


embedder = Embedder(minilm)

In [None]:
# Let us now run NN queries using OpenSearch. First, we define a small function to pretty-print the results. The question is
# embedded with the `embedder` defined above, and OpenSearch returns the documents whose stored vectors are nearest to it.

def pretty_response(response):
    """Print the id, text and score of every hit in a search response.

    Args:
        response: Search response dict. Each entry of
            ``response['hits']['hits']`` must provide ``_id``, ``_score`` and
            ``_source['text_representation']``.

    Raises:
        KeyError: if any of those keys is missing.
    """
    for hit in response['hits']['hits']:
        # Named doc_id rather than id so the builtin id() is not shadowed.
        doc_id = hit['_id']
        score = hit['_score']
        text = hit['_source']['text_representation']
        print(f"\nID: {doc_id}\nSummary: {text}\nScore: {score}")

# Embed the question, then ask OpenSearch for its 10 nearest neighbors on the "embedding" field
question_vector = embedder.embed_query("How do I prevent accidents?")
knn_query = {
    "query": {
        "knn": {
            "embedding": {
                "vector": question_vector,
                "k": 10,
            }
        }
    }
}
response = client.search(index=index_name, body=knn_query)
pretty_response(response)

# Keep the text of the first hit in the response
top_hit_summary = response['hits']['hits'][0]['_source']['text_representation']

In [None]:
## Before running RAG on the OpenSearch vector store with LangChain, we must add a `source` field
## to the metadata of each document.

# Update script: create the metadata map if it is absent, then set its source entry
script = {
    "source": """
    if (ctx._source.metadata == null) {
        ctx._source.metadata = new HashMap();
    }
    ctx._source.metadata.source = 'default';
    """
}

# Apply the script to every document in the index. refresh=True makes the
# changes visible to search as soon as the call returns, so a follow-up
# update_by_query does not start from a stale snapshot of the index.
response = client.update_by_query(
    index=index_name,
    body={
        "query": {
            "match_all": {}
        },
        "script": script
    },
    refresh=True,
)

In [None]:
## The LangChain OpenSearch integration does not currently allow nested text fields and wants the
## embedding stored in a field named "vector_field". We therefore add top-level "vector_field" and
## "text" fields to the mapping and copy `embedding` and `text_representation` into them.

langchain_fields = {
    "properties": {
        "vector_field": {
            "type": "knn_vector",
            "dimension": 384,
            "method": {"name": "hnsw", "engine": "faiss"},
        },
        "text": {"type": "text"},
    }
}
client.indices.put_mapping(index=index_name, body=langchain_fields)

# Script that copies the two source fields into the new fields on each document
copy_fields_script = {
    "source": """ctx._source.vector_field = ctx._source.embedding;
        ctx._source.text = ctx._source.text_representation;""",
    "lang": "painless",
}

# No query is given in the body; refresh=True is passed so the index is refreshed after the update
response = client.update_by_query(
    index=index_name,
    body={"script": copy_fields_script},
    refresh=True,
)


In [None]:
# Finally, we build a retrieval-augmented QA chain over the index and ask the model a question about the data.
# Note: you must have the HTTP certificates for OpenSearch installed for this to work correctly.

import os

from langchain_community.vectorstores import OpenSearchVectorSearch
from langchain_openai import ChatOpenAI
from langchain.chains import RetrievalQAWithSourcesChain

# The vector store receives its endpoint via opensearch_url, so forward every
# client option except "hosts". Build a filtered copy rather than popping from
# os_client_args: that dict is shared with earlier cells, and mutating it would
# change what they see when re-run.
vector_store_args = {key: value for key, value in os_client_args.items() if key != "hosts"}
vector_store = OpenSearchVectorSearch(
    opensearch_url="https://localhost:9200",
    index_name=index_name,
    embedding_function=embedder,
    **vector_store_args,
)

# NOTE: this rebinds `llm`, which earlier in the notebook referred to the
# sycamore OpenAI wrapper used by the ingestion pipeline.
llm = ChatOpenAI(
    openai_api_key=os.environ.get("OPENAI_API_KEY"),
    model_name='gpt-3.5-turbo',
    temperature=0.3,
)
qa = RetrievalQAWithSourcesChain.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vector_store.as_retriever(),
    verbose=True,
)
qa.invoke({"question": "How many accidents happened?"})