# NTSB demo

Sycamore is a **Document Processing System** = **ETL** + **Query Planning/Retrieval**.

https://aryn-public.s3.amazonaws.com/ntsb/22.pdf

**ETL steps**:
- extract Text + Structure: titles, section headers, text paragraphs, figures, tables and their cells, captions, page headers and footers, footnotes
- summarize images
- extract data from tables
- standardize locations and dates

**Query Planning/Retrieval**:
Having these structural elements and information extracted gives consumers of these documents much better control over what is embedded and how, leading to better retrieval in a RAG system.
For example, we can focus the retrieval on certain sections or eliminate them altogether.

**Question answering**:
Sycamore can also extract information from a document. We’ll see how it can extract location and dates from NTSB docs, but also specific elements like aircraft type.  

Once the data is transformed, we can ask questions on it.

In [None]:
import sycamore 
from sycamore.transforms.partition import ArynPartitioner
from sycamore.transforms.summarize_images import SummarizeImages
from sycamore.transforms import (AssignDocProperties, 
                                ExtractTableProperties, 
                                StandardizeProperty, 
                                USStateStandardizer,
                                DateTimeStandardizer)

from sycamore.llms import OpenAI 
from sycamore.utils.aryn_config import ArynConfig, _DEFAULT_PATH
from sycamore.utils.pdf_utils import show_pages, enumerate_images_and_tables, display_page_and_table_properties
from sycamore.materialize import MaterializeSourceMode  
import json
from pathlib import Path

In [None]:
# Fail fast when the configured Aryn API key is empty; the message reports
# the default config path (_DEFAULT_PATH) that was consulted.
assert ArynConfig.get_aryn_api_key() != "", f"Unable to find aryn API key.  Looked in {_DEFAULT_PATH}"

In [None]:
# Initialize Sycamore; the returned context is what the pipeline below reads documents through.
ctx = sycamore.init()

In [None]:
# Source PDF for the demo, stored in the public Aryn S3 bucket.
s3_path = "s3://aryn-public/ntsb/22.pdf"
# LLM client handed to the table-property extraction step further down.
llm = OpenAI("gpt-4o-mini")
# Directory (under the current working directory) that the materialize steps write to.
materialize_dir = Path.cwd() / "cache"


In [None]:
# Build the base docset: read the PDF, partition it, summarize its images,
# and materialize the result under the cache directory.
partitioner = ArynPartitioner(
    extract_table_structure=True,
    use_ocr=True,
    extract_images=True,
)
summarized_path = f"{materialize_dir}/docset_summarized"

docset = (
    ctx.read.binary(s3_path, binary_format="pdf")
    # Partition the PDF, extracting table structure and images along the way.
    .partition(partitioner)
    # Summarize each image element.
    .transform(SummarizeImages)
    # Materialize each document (source mode IF_PRESENT).
    .materialize(path=summarized_path, source_mode=MaterializeSourceMode.IF_PRESENT)
)

In [None]:
# Render the partitioned pages of the docset in the notebook (limit=25).
show_pages(docset, limit=25)

In [None]:
# Execute the pipeline, collect the documents, and hand them to the helper
# that enumerates their image and table elements.
enumerate_images_and_tables(docset.take_all())

In [None]:
# Prompt that asks the LLM to turn a table's comma-separated text into a flat
# JSON object of properties.
# The unit-handling sub-bullets sit directly under instruction 5. They were
# previously separated from it by instruction 6, which made them read as part
# of "return only the json object".
prompt_LLM = """
            You are given a text string where columns are separated by comma representing either a single column, 
            or multi-column table each new line is a new row.
            Instructions:
            1. Parse the table and return a flattened JSON object representing the key-value pairs of properties 
            defined in the table.
            2. Do not return nested objects, keep the dictionary only 1 level deep. The only valid value types 
            are numbers, strings, and lists.
            3. If you find multiple fields defined in a row, feel free to split them into separate properties.
            4. Use camelCase for the key names
            5. For fields where the values are in standard measurement units like miles, 
            nautical miles, knots, celsius
            - include the unit in the key name and only set the numeric value as the value.
            - e.g. "Wind Speed: 9 knots" should become windSpeedInKnots: 9, 
            "Temperature: 3°C" should become temperatureInC: 3
            6. return only the json object between ``` 
            """

# NOTE: the materialize step below uses source mode IF_PRESENT, so output
# already stored under docset_extrTblprop is presumably reused. Remove that
# directory to re-run extraction after changing the prompt.
ds_extracted = (
    docset
    # Extract properties from tables and store them as key/value pairs under
    # "table_props" on the respective table elements.
    .map(
        lambda doc: ExtractTableProperties.extract_table_properties(
            doc, property_name="table_props", llm=llm, prompt_LLM=prompt_LLM
        )
    )
    # Materialize the documents with the extracted table properties.
    .materialize(
        path=f"{materialize_dir}/docset_extrTblprop",
        source_mode=MaterializeSourceMode.IF_PRESENT,
    )
)

ds_extracted.show(limit=6)


In [None]:
# Promote the "table_props" of the first table element to document-level properties.
ds_prop1 = ds_extracted.map(
    lambda doc: AssignDocProperties.assign_doc_properties(
        doc, element_type="table", property_name="table_props"
    )
)
ds_prop1.show(limit=6, show_elements=False)

In [None]:

# Normalize the location and date/time properties with the USStateStandardizer
# and DateTimeStandardizer transforms, then materialize the result.
ds_normd = (
    ds_prop1
    # Convert state abbreviations in the location to their full names.
    .map(lambda doc: USStateStandardizer.standardize(doc, key_path=["properties", "entity", "location"]))
    # Convert the date/time value into a common format.
    .map(lambda doc: DateTimeStandardizer.standardize(doc, key_path=["properties", "entity", "dateTime"]))
    # Materialize the normalized documents.
    .materialize(
        path=f"{materialize_dir}/docset_normalized",
        source_mode=MaterializeSourceMode.IF_PRESENT,
    )
)
ds_normd.show(limit=6, show_elements=False)


In [None]:
# Show tables and their conversion to properties, using the documents
# returned by take() on the normalized docset.
display_page_and_table_properties(ds_normd.take())

In [None]:
## If this cell is removed, the json import at the top of the notebook can go too.
from IPython.display import display, HTML

# Walk the elements of the first document and, for every table element that
# carries a parsed table, print its type and properties and render it as HTML.
first_doc = ds_normd.take_all()[0]
for element in first_doc.elements:
    if "table" not in element.type or element.table is None:
        continue
    print("Element Type: ", element.type)
    print("Element Properties: ", json.dumps(element.properties, indent=2, default=str))
    table_html = element.table.to_html()
    display(HTML(table_html))

In [None]:
import os

# Pick the OpenSearch host name: inside the Sycamore Jupyter container
# (detected via /.dockerenv) the service is reachable as "opensearch";
# otherwise it is assumed to be on localhost.
if os.path.exists("/.dockerenv"):
    opensearch_host = "opensearch"
    print("Assuming we are in a Sycamore Jupyter container, using opensearch for OpenSearch host")
else:
    opensearch_host = "localhost"
    print("Assuming we are running outside of a container, using localhost for OpenSearch host")

# Name of the index the embedded documents are written to and queried from.
index = "ntsb-demo-all"

# Client connection arguments. The host comes from opensearch_host so the
# container detection above takes effect (it was previously hard-coded to
# "localhost", which ignored that detection).
# NOTE: demo settings only: default admin credentials, certificate verification disabled.
os_client_args = {
    "hosts": [{"host": opensearch_host, "port": 9200}],
    "http_compress": True,
    "http_auth": ("admin", "admin"),
    "use_ssl": True,
    "verify_certs": False,
    "ssl_assert_hostname": False,
    "ssl_show_warn": False,
    "timeout": 120,
}

# Index definition: k-NN enabled, with a 384-dimensional "embedding" vector
# field indexed with HNSW (faiss engine, l2 space).
index_settings = {
    "body": {
        "settings": {
            "index.knn": True,
            "number_of_shards": 5,
            "number_of_replicas": 1,
        },
        "mappings": {
            "properties": {
                "embedding": {
                    "dimension": 384,
                    "method": {
                        "engine": "faiss",
                        "space_type": "l2",
                        "name": "hnsw",
                        "parameters": {},
                    },
                    "type": "knn_vector",
                }
            }
        },
    }
}

In [None]:
from sycamore.transforms.embed import SentenceTransformerEmbedder

# Sentence-transformer embedder used to embed the documents.
embedder = SentenceTransformerEmbedder(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    batch_size=100,
)

# Spread the "entity" and "path" properties, explode the docset, and embed the result.
embedded_docset = (
    ds_normd
    .spread_properties(["entity", "path"])
    .explode()
    .embed(embedder=embedder)
)

In [None]:
# Write the embedded docset to OpenSearch using the client arguments, index
# name, and index settings defined earlier in the notebook.
# Alternative kept for reference (`validate` is not defined in this notebook):
# embedded_docset.map(validate).write.opensearch(
embedded_docset.write.opensearch(
    os_client_args=os_client_args,
    index_name=index,
    index_settings=index_settings,
)

In [None]:
from sycamore.transforms.query import OpenSearchQueryExecutor
from sycamore.data import OpenSearchQuery
from opensearchpy import OpenSearch

# Connection arguments for querying. The host comes from opensearch_host
# (set in the OpenSearch configuration cell) and was previously hard-coded to
# "localhost", which ignored the container detection.
# NOTE: demo settings only: default admin credentials, certificate verification disabled.
os_client_args = {
    "hosts": [{"host": opensearch_host, "port": 9200}],
    "http_compress": True,
    "http_auth": ("admin", "admin"),
    "use_ssl": True,
    "verify_certs": False,
    "ssl_assert_hostname": False,
    "ssl_show_warn": False,
    "timeout": 120,
}

# Raw OpenSearch client built from the same arguments.
client = OpenSearch(**os_client_args)

# Sycamore query executor used by the query cells below.
osq = OpenSearchQueryExecutor(os_client_args)

In [None]:
# Natural-language question used by the keyword and RAG queries below.
question = "Were there any incidents involving Cirrus airplanes"

In [None]:
# Keyword search: match the question against text_representation, leave the
# embedding out of the returned source, and request 20 hits.
keyword_search_body = {
    "_source": {"excludes": ["embedding"]},
    "query": {"match": {"text_representation": question}},
    "size": 20,
}

query = OpenSearchQuery()
query["index"] = index
query["query"] = keyword_search_body

# Run the query and keep only its "result" entry.
result = osq.query(query)["result"]

In [None]:
# Display the keyword-search result as the cell output.
result

In [None]:
# get the embedding model ID 
import requests
def get_model_id(model_name='all-MiniLM-L6-v2', model_type='bert', timeout=30):
    """Return the OpenSearch ML model ID of the embedding model.

    Sends a search request to the ``_plugins/_ml/models/_search`` endpoint on
    ``opensearch_host`` for a model whose ``name`` matches ``model_name`` and
    whose ``model_config.model_type`` equals ``model_type``.

    Args:
        model_name: Model name to match. Defaults to the model used in this notebook.
        model_type: Required value of ``model_config.model_type``.
        timeout: Seconds to wait for the HTTP request, so an unreachable
            server cannot hang the notebook indefinitely.

    Returns:
        The ``_id`` of the first matching hit.

    Raises:
        requests.HTTPError: If OpenSearch answers with an error status.
        IndexError: If no model matches the search.
    """
    query = {
        'query': {
            'bool': {
                'must': [
                    {
                        'match': {'name': model_name},
                    },
                    {
                        'term': {'model_config.model_type': model_type},
                    },
                ],
            },
        },
    }
    url = f'https://{opensearch_host}:9200/_plugins/_ml/models/_search'
    # Certificate verification is disabled here, as it is in os_client_args.
    with requests.get(url, json=query, verify=False, timeout=timeout) as resp:
        # Surface HTTP errors directly instead of failing later with a KeyError on 'hits'.
        resp.raise_for_status()
        hits = resp.json()['hits']['hits']
    if not hits:
        raise IndexError(
            f"No ML model named {model_name!r} with model_type {model_type!r} "
            f"found on OpenSearch host {opensearch_host!r}"
        )
    return hits[0]['_id']

In [None]:
# RAG configuration
embedding_model_id = get_model_id()
search_pipeline = "hybrid_rag_pipeline"
# NOTE: this rebinds `llm` (the OpenAI client created earlier) to a model-name string.
llm = "gpt-4o"

# Hybrid search: a keyword match on text_representation combined with a
# neural query on the embedding field.
keyword_clause = {"match": {"text_representation": question}}
neural_clause = {
    "neural": {
        "embedding": {
            "query_text": question,
            "model_id": embedding_model_id,
            "k": 100,
        }
    }
}

rag_query = OpenSearchQuery()
rag_query["index"] = index
rag_query["query"] = {
    "_source": {"excludes": ["embedding"]},
    "query": {"hybrid": {"queries": [keyword_clause, neural_clause]}},
    "size": 20,
}

# RAG parameters: the search pipeline to use, plus the generative-QA settings.
rag_query["params"] = {"search_pipeline": search_pipeline}
rag_query["query"]["ext"] = {
    "generative_qa_parameters": {
        "llm_question": question,
        "context_size": 10,
        "llm_model": llm,
    }
}

# Run the query and keep only its "result" entry.
rag_result = osq.query(rag_query)["result"]

In [None]:
# Display the generated answer from the RAG section of the result.
rag_result["ext"]["retrieval_augmented_generation"]["answer"]

In [None]:
filtered_rag_query = OpenSearchQuery()
filtered_rag_query["index"] = index

# The question is sent to the keyword query, the neural query, and the LLM.
# It previously contained the typo "airplace".
question = "Were there any airplane incidents in Indiana on January 24, 2023?"

# Restrict hits to documents whose standardized location contains "Indiana".
# Named location_filter rather than `filter` so the builtin is not shadowed
# for the rest of the notebook session.
# NOTE(review): only the location is filtered; the date in the question is not.
location_filter = {
    "bool": {
        "must": [
            {
                "match_phrase": {
                    "properties.entity.location": "Indiana"
                }
            }
        ]
    }
}

# Hybrid search, with the same filter applied to both the keyword and the neural query.
filtered_rag_query["query"] = {
    "_source": {"excludes": ["embedding"]},
    "query": {
        "hybrid": {
            "queries": [
                {
                    "bool": {
                        "must": [
                            {
                                "match": {
                                    "text_representation": question
                                }
                            }
                        ],
                        "filter": location_filter,
                    }
                },
                {
                    "neural": {
                        "embedding": {
                            "query_text": question,
                            "model_id": embedding_model_id,
                            "k": 100,
                            "filter": location_filter,
                        }
                    }
                },
            ]
        }
    },
    "size": 20,
}

# RAG parameters: the search pipeline to use, plus the generative-QA settings.
filtered_rag_query["params"] = {"search_pipeline": search_pipeline}

filtered_rag_query["query"]["ext"] = {
    "generative_qa_parameters": {
        "llm_question": question,
        "context_size": 10,
        "llm_model": llm,
    }
}

In [None]:
# Run the filtered RAG query and keep only its "result" entry.
filtered_rag_result = osq.query(filtered_rag_query)["result"]

In [None]:
# Display the generated answer for the location-filtered question.
filtered_rag_result["ext"]["retrieval_augmented_generation"]["answer"]