In [5]:
import pyarrow.fs
from ray.data import ActorPoolStrategy
import sycamore
from elasticsearch import Elasticsearch
from sycamore.functions.tokenizer import HuggingFaceTokenizer
from sycamore.llms import OpenAIModels, OpenAI
from sycamore.transforms import COALESCE_WHITESPACE
from sycamore.transforms.merge_elements import MarkedMerger
from sycamore.transforms.partition import UnstructuredPdfPartitioner
from sycamore.transforms.extract_schema import OpenAISchemaExtractor, OpenAIPropertyExtractor
from sycamore.transforms.embed import SentenceTransformerEmbedder

In [2]:
# Define a function used for cleaning and formatting later on in the notebook

from sycamore.data.document import Document
from dateutil import parser
def convert_timestamp(doc: Document) -> Document:
    if "dateAndTime" not in doc.properties['entity'] and "dateTime" not in doc.properties['entity']:
        return doc
    raw_date: str = doc.properties['entity'].get('dateAndTime') or doc.properties['entity'].get('dateTime')
    raw_date = raw_date.replace("Local", "")
    parsed_date = parser.parse(raw_date, fuzzy=True)
    extracted_date = parsed_date.date()
    doc.properties['entity']['day'] = extracted_date.day
    doc.properties['entity']['month'] = extracted_date.month
    doc.properties['entity']['year'] = extracted_date.year
    if parsed_date.utcoffset():
        doc.properties['entity']['dateTime'] = parsed_date.isoformat()
    else:
        doc.properties['entity']['dateTime'] = parsed_date.isoformat() + "Z"
    
    return doc

In [3]:
paths = ["s3://aryn-public/NTSB/"]
fsys = pyarrow.fs.S3FileSystem(region="us-east-1", anonymous=True)

llm = OpenAI(OpenAIModels.GPT_3_5_TURBO.value)
tokenizer = HuggingFaceTokenizer("thenlper/gte-small")

ctx = sycamore.init()

ds = (
    ctx.read.binary(paths, binary_format="pdf", filesystem=fsys)
    # Parition with the Aryn partitioner, pulling out tables and images. Set an appropriate compute to gain the maximum parallelism and performance
    # from the machine, ActorPoolStrategy(size=4) worked best for our case here
    .partition(partitioner=UnstructuredPdfPartitioner(), compute=ActorPoolStrategy(size=4))
    # Get rid of spurious whitespace charaters
    .regex_replace(COALESCE_WHITESPACE)
    # Automatically determine a schema of additional metadata to extract from Documents
    .extract_batch_schema(schema_extractor=OpenAISchemaExtractor("FlightAccidentReport", llm=llm, num_of_elements=35))
    # Extract the metadata specified by that schema
    .extract_properties(property_extractor=OpenAIPropertyExtractor(llm=llm, num_of_elements=35))
    # Merge elements into larger chunks
    .mark_bbox_preset(tokenizer=tokenizer)
    .merge(merger=MarkedMerger())
    # Convert extracted timestamps to better-structured form using the function above
    .map(convert_timestamp)
    # Copy document properties to each Document's sub-elements
    .spread_properties(["path", "entity"])
    # Split elements that are too big to embed
    .split_elements(tokenizer=tokenizer, max_tokens=512)
    # Convert all Elements to Documents
    .explode()
    # Generate a series of hashes to represent each Document. For use with near-duplicate detection
    .sketch()
    # Embed each Document
    .embed(embedder=SentenceTransformerEmbedder(model_name="sentence-transformers/all-MiniLM-L6-v2", batch_size=100))
)

In [17]:
# Write to a persistent Elasticsearch Index. Note: You must have a specified elasticsearch instance running for this to work. For more information on how to set
# one up, refer to https://www.elastic.co/guide/en/elasticsearch/reference/current/install-elasticsearch.html
# - into a specific database (as specified by url) 
# - into a specific index (as specified by t)
url = "http://localhost:9201"
index_name = "test_index"
ds.write.elasticsearch(url=url, index_name=index_name)

2024-07-15 17:59:15,539	INFO streaming_executor.py:112 -- Starting execution of Dataset. Full logs are in /tmp/ray/session_2024-07-15_17-37-44_746435_11753/logs/ray-data
2024-07-15 17:59:15,539	INFO streaming_executor.py:113 -- Execution plan of Dataset: InputDataBuffer[Input] -> TaskPoolMapOperator[ReadBinary] -> ActorPoolMapOperator[Map(BinaryScan._to_document)->MapBatches(BaseMapTransformCallable___wrap)] -> ActorPoolMapOperator[MapBatches(regex_replace)->MapBatches(BaseMapTransformCustom__Extract)] -> TaskPoolMapOperator[MapBatches(extract_properties)->MapBatches(sort_by_page_bbox)->MapBatches(mark_drop_tiny)->MapBatches(mark_drop_header_and_footer)->MapBatches(mark_break_page)->MapBatches(mark_break_by_column)->MapBatches(mark_break_by_tokens)->MapBatches(merge_elements)->MapBatches(convert_timestamp)->MapBatches(spread_properties)->MapBatches(split_doc)->MapBatches(explode)->MapBatches(sketcher)->MapBatches(SentenceTransformerEmbedder)->MapBatches(elastic_document_writer)]


- ReadBinary->SplitBlocks(5) 1:   0%|          | 0/5 [00:00<?, ?it/s]

- Map(BinaryScan._to_document)->MapBatches(BaseMapTransformCallable___wrap) 2:   0%|          | 0/5 [00:00<?, …

- MapBatches(regex_replace)->MapBatches(BaseMapTransformCustom__Extract) 3:   0%|          | 0/5 [00:00<?, ?it…

- MapBatches(extract_properties)->MapBatches(sort_by_page_bbox)->MapBatches(mark_drop_tiny)->MapBatches(mark_d…

Running 0:   0%|          | 0/5 [00:00<?, ?it/s]

<IPython.core.display.HTML object>e)->MapBatches(BaseMapTransformCustom__Extract)) pid=21457) 
<IPython.core.display.HTML object>e)->MapBatches(BaseMapTransformCustom__Extract)) pid=21457) 
<IPython.core.display.HTML object>e)->MapBatches(BaseMapTransformCustom__Extract)) pid=21457) 
<IPython.core.display.HTML object>e)->MapBatches(BaseMapTransformCustom__Extract)) pid=21457) 




In [9]:
# Let's initialize the Elasticsearch client and connect it to the database to perform queries
client = Elasticsearch(
    url
)

print(client.info())

{'name': '3b979cc72e9d', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'rNXKhq5STYelYaU1xEN_rA', 'version': {'number': '8.14.2', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '2afe7caceec8a26ff53817e5ed88235e90592a1b', 'build_date': '2024-07-01T22:06:58.515911606Z', 'build_snapshot': False, 'lucene_version': '9.10.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'}


In [28]:
# Check for text search within Elasticsearch working

result = client.search(index=index_name, body={
    "query": {
        "match": {
            "properties.text_representation": "traffic"
        }
    },
    "_source": ["properties.text_representation"],
    "size": 5
})

# Process and print the results
for hit in result['hits']['hits']:
    text = hit['_source']['properties']['text_representation']
    print(text)
    print("-" * 50)  # Separator between results

Meteorological Information and Flight Plan
Conditions at Accident Site:
Visual (VMC)
Condition of Light:
Day
Observation Facility, Elevation:
Observation Time:
Distance from Accident Site:
Direction from Accident Site:
Lowest Cloud Condition:
Visibility
Lowest Ceiling:
Visibility (RVR):
Wind Speed/Gusts:
9 knots /
Turbulence Type Forecast/Actual:
Wind Direction:
360°
Turbulence Severity Forecast/Actual:
Altimeter Setting:
Temperature/Dew Point:
Precipitation and Obscuration:
Departure Point:
McMurdo (NZPG)
Type of Flight Plan Filed:
None
Destination:
McMurdo (NZPG)
Type of Clearance:
None
Departure Time:
Type of Airspace:
Unknown
Airport Information
Airport:
Pegasus McMurdo, Antarctica NZPG
Runway Surface Type:
Ice
Airport Elevation:
18 ft msl
Runway Surface Condition:
Ice
Runway Used:
33
IFR Approach:
None
Runway Length/Width:
10000 ft / 200 ft
VFR Approach/Landing:
Go around;Traffic pattern
Wreckage and Impact Information
Crew Injuries:
Aircraft Damage:
Substantial
Passenger Injuries

In [49]:
# For queries, let's define an embedding function for the question that helps us easily run NN search 
# by comparing the two vectors and generating optimal results
from sentence_transformers import SentenceTransformer
minilm = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

class Embedder():
    def __init__(self, llm):
        self.llm = llm
    
    def embed_query(self, question):
        v = self.llm.encode(question).tolist()
        return v

embedder = Embedder(minilm)



In [72]:
# Let us now run NN queries using Elasticsearch. First, we define a small function to pretty print the results and define an embedding function 
# for the question that helps us easily run NN search by comparing the two vectors and generating optimal results

def pretty_response(response):
    for hit in response['hits']['hits']:
        id = hit['_id']
        score = hit['_score']
        text = hit['_source']['properties']['text_representation']
        pretty_output = (f"\nID: {id}\nSummary: {text}\nScore: {score}")
        print(pretty_output)

response = client.search(
  index = index_name,
  knn={
      "field": "embeddings",
      "query_vector": embedder.embed_query("How do I prevent accidents?"),
      "k": 10,
      "num_candidates": 10
    }
)
pretty_response(response)

top_hit_summary = response['hits']['hits'][0]['_source']['properties']['text_representation']


ID: f98c5018-86ad-426e-9f57-22acb6aef49a
Summary: The National Transportation Safety Board determines the probable cause(s) of this accident to be:
Page 1 of 5

Score: 0.70878434

ID: e9cb97c7-c3df-42e7-b2d0-1aab568182db
Summary: The National Transportation Safety Board determines the probable cause(s) of this accident to be:
Page 1 of 5

Score: 0.70878434

ID: 0031397f-88af-4a61-b940-70113ee2be63
Summary: The National Transportation Safety Board determines the probable cause(s) of this accident to be: THE PILOT'S CONTINUED FLIGHT INTO ADVERSE WEATHER CONDITIONS. FACTORS WERE THE ICING CONDITIONS PREVAILING AT THE DESTINATION AIRPORT, AND THE PILOT'S INABILITY TO MAINTAIN VISUAL LOOKOUT DUE TO WINDSHIELD ICING.
Page 1 of 7

Score: 0.68199754

ID: 2adc9b9c-7ba7-4bb6-aa1b-847f78fcab95
Summary: The National Transportation Safety Board determines the probable cause(s) of this accident to be: THE PILOT'S CONTINUED FLIGHT INTO ADVERSE WEATHER CONDITIONS. FACTORS WERE THE ICING CONDITIONS PR

In [54]:
## Before trying to begin RAG on this vector store, we must add a source field in the metadata of each document. We use the script below to do so

# Define the update script
script = {
    "source": """
    if (ctx._source.metadata == null) {
        ctx._source.metadata = new HashMap();
    }
    ctx._source.metadata.source = 'default';
    """
}

# Perform the update
response = client.update_by_query(
    index=index_name,
    body={
        "query": {
            "match_all": {}
        },
        "script": script
    }
)

In [81]:
# Finally, we initialize a RAG agent and ask the model a question about the data

from langchain_elasticsearch import ElasticsearchStore
from langchain_openai import ChatOpenAI  
from langchain.chains import RetrievalQAWithSourcesChain  
import os

vector_store = ElasticsearchStore(index_name=index_name, es_connection=client, vector_query_field="embeddings", query_field="properties.text_representation",
                                   embedding=embedder)

llm = ChatOpenAI(  
    openai_api_key=os.environ.get("OPENAI_API_KEY"),  
    model_name='gpt-3.5-turbo',  
    temperature=0.2
)  
qa = RetrievalQAWithSourcesChain.from_chain_type(  
    llm=llm,  
    chain_type="stuff",  
    retriever=vector_store.as_retriever() , verbose=True
)  
qa.invoke({"question": "What are some probable causes of accidents?"})



> Entering new RetrievalQAWithSourcesChain chain...

> Finished chain.


{'question': 'What are some probable causes of accidents?',
 'answer': "I don't know.\n",
 'sources': ''}