# NTSB demo

Sycamore is a **Document Processing System** = **ETL** + **Query Planning/Retrieval**.

https://aryn-public.s3.amazonaws.com/ntsb/59.pdf

**ETL steps**:
- extracts Text + Structure:
    - titles,
    - section headers,
    - text paragraphs,
    - figures,
    - tables and their cells
    - captions
    - page headers and footers
    - footnotes
- it then can merge individual chunks into larger chunks that preserve more of the context

**Query Planning/Retrieval**:
Having these structural elements and information extracted gives consumers of these documents much better control over what is embedded and how, leading to better retrieval in a RAG system.
For example, we can focus the retrieval on certain sections or eliminate them altogether.

**Question answering**:
Sycamore can also extract information from a document. We’ll see how it can extract location and dates from NTSB docs, but also specific elements like aircraft type.  



Once the data is transformed, we can ask questions on it.

In [None]:
import sycamore 

from sycamore.transforms.partition import ArynPartitioner
from sycamore.transforms.summarize_images import SummarizeImages

from sycamore.transforms import (AssignDocProperties, 
                                ExtractTableProperties, 
                                StandardizeProperty, 
                                LocationStandardizer,
                                DateTimeStandardizer)

from sycamore.llms import OpenAI 

from sycamore.utils.aryn_config import ArynConfig, _DEFAULT_PATH

from sycamore.utils.pdf_utils import show_pages, enumerate_images_and_tables, display_page_and_table_properties

import json

In [None]:
from sycamore.utils.aryn_config import ArynConfig, _DEFAULT_PATH
# Fail fast if no Aryn API key is configured; the message reports the
# default config path that was checked.
assert ArynConfig.get_aryn_api_key() != "", f"Unable to find aryn API key.  Looked in {_DEFAULT_PATH}"

### Initialize Sycamore

In [None]:
ctx = sycamore.init()

## Load the data

We're loading NTSB incident reports (pdf documents describing aviation incidents). 
The documents consist of a combination of tables, text, and figures.

We’re loading the data from a public S3 bucket.

Sample document: 
https://aryn-public.s3.amazonaws.com/ntsb/59.pdf

In [None]:
# S3 URI of the sample NTSB report read by the pipeline below.
s3_path = "s3://aryn-public/ntsb/59.pdf"
# LLM handle; passed to ExtractTableProperties later in the notebook.
llm = OpenAI('gpt-4o-mini')

## Define a pipeline using ArynPartitioner

### Chunk and extract document structure
Extracts Text + Structure:
- titles,
- section headers,
- text paragraphs,
- figures,
- tables and their cells
- captions
- page headers and footers
- footnotes

In [None]:
# Build the docset in three steps: read the PDF bytes, partition them into
# elements (with table structure, OCR and image extraction switched on), then
# run SummarizeImages over the result.
pdf_partitioner = ArynPartitioner(
    extract_table_structure=True,
    use_ocr=True,
    extract_images=True,
)

docset = (
    ctx.read.binary(s3_path, binary_format="pdf")
    .partition(pdf_partitioner)
    .transform(SummarizeImages)
)
    

## Visualize partitioned documents

In [None]:
sample_pages = show_pages(docset, limit=4)


In [None]:
# Show text representation of images and tables
enumerate_images_and_tables(docset.take(1))

In [None]:
# Turn table contents into key/value properties with the LLM, then preview
# the first document.
ds_extracted = docset.transform(
    cls=ExtractTableProperties,
    parameters=["llm_response", llm],
)
ds_extracted.show(limit=1)

In [None]:
# AssignDocProperties copies the named property from the first element of the
# given type (here: the first table) onto the document itself.
element_type, property_name = "table", "llm_response"

ds_prop1 = ds_extracted.transform(
    cls=AssignDocProperties,
    parameters=[element_type, property_name],
)
ds_prop1.show(show_elements=False, limit=1)

##  Inspect extracted information

##  Notice that dates and locations have been standardized

In [None]:
# Normalize the extracted location and date/time with the LocationStandardizer
# and DateTimeStandardizer transforms.
loc_standardizer = LocationStandardizer()
date_standardizer = DateTimeStandardizer()

# The two transforms must be chained: the date step runs on the output of the
# location step. Starting both from ds_prop1 would silently discard the
# location standardization.
ds_normd = ds_prop1.transform(
    cls=StandardizeProperty,
    standardizer=loc_standardizer,
    path=[['properties', 'entity', 'location']],
)
ds_normd = ds_normd.transform(
    cls=StandardizeProperty,
    standardizer=date_standardizer,
    path=[['properties', 'entity', 'dateTime']],
)

ds_normd.show(limit=1, show_elements=False)

In [None]:
show_pages(ds_normd, limit=1)

In [None]:
# Show tables and their conversion to properties
display_page_and_table_properties(ds_normd.take(1))

In [None]:
from IPython.display import display, HTML

# For each table element of the first document, print its type and
# properties, then render the table as HTML.
first_doc = ds_normd.take_all()[0]
for e in first_doc.elements:
    if "table" not in e.type:
        continue
    print("Element Type: ", e.type)
    print("Element Properties: ", json.dumps(e.properties, indent=2, default=str))
    display(HTML(e.table.to_html()))

## Indexing the documents for retrieval

Now that we have extracted the text, partitioned it, labeled the partitions, extracted information and standardized it, we're ready to store it for retrieval

In [None]:
import os

# Pick the OpenSearch host: the presence of /.dockerenv is taken to mean we
# are inside the Sycamore Jupyter container, where the service is reachable
# as "opensearch"; otherwise fall back to localhost.
if os.path.exists("/.dockerenv"):
    opensearch_host = "opensearch"
    print("Assuming we are in a Sycamore Jupyter container, using opensearch for OpenSearch host")
else:
    opensearch_host = "localhost"
    print("Assuming we are running outside of a container, using localhost for OpenSearch host")

index = "ntsb-bb-2_demo"

# Connection settings for the OpenSearch client. The host comes from the
# detection above rather than being hard-coded, so the notebook also works
# from inside the container.
os_client_args = {
    "hosts": [{"host": opensearch_host, "port": 9200}],
    "http_compress": True,
    "http_auth": ("admin", "admin"),
    "use_ssl": True,
    "verify_certs": False,
    "ssl_assert_hostname": False,
    "ssl_show_warn": False,
    "timeout": 120,
}

# HNSW (faiss, L2 distance) method used for the vector field.
_hnsw_method = {
    "engine": "faiss",
    "space_type": "l2",
    "name": "hnsw",
    "parameters": {},
}

# Mapping of the "embedding" field as a 384-dimensional knn_vector; the
# dimension has to match the output size of the embedding model used when
# the documents are embedded.
_embedding_field = {
    "dimension": 384,
    "method": _hnsw_method,
    "type": "knn_vector",
}

# Index creation body: k-NN enabled, 5 shards, 1 replica.
index_settings = {
    "body": {
        "settings": {
            "index.knn": True,
            "number_of_shards": 5,
            "number_of_replicas": 1,
        },
        "mappings": {
            "properties": {
                "embedding": _embedding_field,
            },
        },
    },
}

### Creating embeddings

In [None]:
from sycamore.transforms.embed import SentenceTransformerEmbedder

# Embedder backed by the all-MiniLM-L6-v2 sentence-transformers model.
minilm_embedder = SentenceTransformerEmbedder(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    batch_size=100,
)

# Spread the "entity" and "path" properties, explode, sketch, and finally
# embed the standardized docset.
embedded_docset = (
    ds_normd.spread_properties(["entity", "path"])
    .explode()
    .sketch()
    .embed(embedder=minilm_embedder, num_gpus=0.1)
)

### Write the OpenSearch index

In [None]:
# Write the embedded documents to OpenSearch using the client arguments,
# index name and index settings defined earlier in the notebook.
embedded_docset.write.opensearch(
    os_client_args=os_client_args,
    index_name=index,
    index_settings=index_settings,
)

## Answer some questions

### Create a connection to OpenSearch

In [None]:
from sycamore.transforms.query import OpenSearchQueryExecutor
from sycamore.data import OpenSearchQuery
from opensearchpy import OpenSearch

# Connection settings for querying. The host is the one detected earlier in
# the notebook (opensearch_host), so this cell targets the same server as
# get_model_id() and also works from inside the container.
os_client_args = {
    "hosts": [{"host": opensearch_host, "port": 9200}],
    "http_compress": True,
    "http_auth": ('admin', 'admin'),
    "use_ssl": True,
    "verify_certs": False,
    "ssl_assert_hostname": False,
    "ssl_show_warn": False,
    "timeout": 120,
}

# Raw OpenSearch client built from the same settings.
client = OpenSearch(**os_client_args)

# Sycamore query executor used by the query cells below.
osq = OpenSearchQueryExecutor(os_client_args)

### Question

In [None]:
question = "Were there any incidents involving red planes"

### Text query

In [None]:
# Plain keyword search: match the question against text_representation,
# leave the embedding vectors out of the returned documents, and return up
# to 20 hits.
query = OpenSearchQuery()
query["index"] = index
query["query"] = dict(
    _source={"excludes": ["embedding"]},
    query={"match": {"text_representation": question}},
    size=20,
)
result = osq.query(query)["result"]

In [None]:
result

### RAG query

In [None]:
# get the embedding model ID 
import requests
def get_model_id():
    """Return the OpenSearch ML model ID of the all-MiniLM-L6-v2 embedding model.

    Searches the ML plugin's model index on ``opensearch_host`` for a model
    whose name matches ``all-MiniLM-L6-v2`` and whose model type is ``bert``,
    and returns the ``_id`` of the first hit.

    Raises:
        requests.HTTPError: if the search request returns an error status.
        LookupError: if no matching model is registered.
    """
    query = {
        'query': {
            'bool': {
                'must': [
                    {
                        'match': {'name': 'all-MiniLM-L6-v2'},
                    },
                    {
                        'term': {'model_config.model_type': 'bert'},
                    },
                ],
            },
        },
    }
    url = f'https://{opensearch_host}:9200/_plugins/_ml/models/_search'
    # Certificate verification is disabled here, as in the client settings
    # used elsewhere in the notebook (verify_certs=False).
    with requests.get(url, json=query, verify=False) as resp:
        # Surface HTTP failures directly instead of a KeyError on the error body.
        resp.raise_for_status()
        hits = resp.json()['hits']['hits']
    if not hits:
        raise LookupError(
            f"No 'all-MiniLM-L6-v2' embedding model found on {opensearch_host}; "
            "is the model registered in OpenSearch?"
        )
    return hits[0]['_id']

In [None]:
# RAG config
embedding_model_id = get_model_id()
search_pipeline = "hybrid_rag_pipeline"
# NOTE(review): this rebinds `llm`, which earlier in the notebook holds the
# OpenAI client object, to a plain model-name string. Re-running the earlier
# extraction cell after this one would pass the string instead of the client.
llm = "gpt-4o"

rag_query = OpenSearchQuery()
rag_query["index"] = index

# Hybrid search: a keyword match on text_representation combined with a
# neural query on the embedding field.
rag_query["query"] = {
    "_source": {"excludes": ["embedding"]},
    "query": {
        "hybrid": {
            "queries": [
                {"match": {"text_representation": question}},
                {
                    "neural": {
                        "embedding": {
                            "query_text": question,
                            "model_id": embedding_model_id,
                            "k": 100,
                        }
                    }
                },
            ]
        }
    },
    "size": 20
}

# RAG params: run the query through the search pipeline and attach the
# generative QA parameters (question, context size, model name).
rag_query["params"] = {"search_pipeline": search_pipeline}
rag_query["query"]["ext"] = {
    "generative_qa_parameters": {
        "llm_question": question,
        "context_size": 10,
        "llm_model": llm,
    }
}
rag_result = osq.query(rag_query)["result"]

In [None]:
rag_result["ext"]["retrieval_augmented_generation"]["answer"]

In [None]:
rag_result

### Filtered RAG query

In [None]:
filtered_rag_query = OpenSearchQuery()
filtered_rag_query["index"] = index
question = "Were there any airplane incidents in Texas in January 14, 2023?"

# Restrict results to documents whose standardized location mentions Texas.
# Named location_filter (not `filter`) so the builtin is not shadowed for the
# rest of the notebook.
location_filter = {
    "bool": {
        "must": [
            {
                "match_phrase": {
                    "properties.entity.location": "Texas"
                }
            }
        ]
    }
}

# Hybrid search: the same location filter is applied to both the keyword
# query and the neural query.
filtered_rag_query["query"] = {
    "_source": {"excludes": ["embedding"]},
    "query": {
        "hybrid": {
            "queries": [
                {
                    "bool": {
                        "must": [
                            {
                                "match": {
                                    "text_representation": question
                                }
                            }
                        ],
                        "filter": location_filter
                    }
                },
                {
                    "neural": {
                        "embedding": {
                            "query_text": question,
                            "model_id": embedding_model_id,
                            "k": 100,
                            "filter": location_filter
                        }
                    }
                },
            ]
        }
    },
    "size": 20
}

# RAG params: run the query through the search pipeline and attach the
# generative QA parameters (question, context size, model name).
filtered_rag_query["params"] = {"search_pipeline": search_pipeline}

filtered_rag_query["query"]["ext"] = {
    "generative_qa_parameters": {
        "llm_question": question,
        "context_size": 10,
        "llm_model": llm,
    }
}

In [None]:
filtered_rag_result = osq.query(filtered_rag_query)["result"]

In [None]:
filtered_rag_result["ext"]["retrieval_augmented_generation"]["answer"]