In [None]:
import os

import sycamore
from sycamore.data import Document
from sycamore.functions import HuggingFaceTokenizer
from sycamore.llms import OpenAI, OpenAIModels
from sycamore.transforms.extract_entity import OpenAIEntityExtractor
from sycamore.transforms.extract_schema import OpenAISchemaExtractor, OpenAIPropertyExtractor
from sycamore.transforms.merge_elements import GreedyTextElementMerger
from sycamore.transforms.partition import UnstructuredPdfPartitioner

In [None]:
from dateutil import parser

def convert_timestamp(doc: Document) -> Document:
    """Add a ``day`` entry derived from the extracted ``dateAndTime`` text.

    Looks up ``doc.properties['entity']['dateAndTime']``, removes the literal
    substring ``"Local"`` from it, parses the remainder with
    ``dateutil.parser.parse`` and stores the resulting calendar date under
    ``doc.properties['entity']['day']``.

    The value comes from an LLM extraction step, so it is treated as
    best-effort input: if the ``entity`` mapping or the ``dateAndTime`` key is
    missing, if the value is not a string, or if it cannot be parsed as a
    date, the document is returned unchanged rather than failing the whole
    ``map`` over the docset.

    Args:
        doc: Document whose ``properties`` may contain an ``entity`` mapping.

    Returns:
        The same ``doc`` object, with ``day`` added when a date was parsed.
    """
    entity = doc.properties.get('entity')
    if not entity or "dateAndTime" not in entity:
        return doc

    raw_date = entity['dateAndTime']
    if not isinstance(raw_date, str):
        # e.g. None or a number emitted by the extractor; nothing to parse.
        return doc

    # The "Local" marker is stripped before parsing (presumably because the
    # parser does not understand it as a timezone name — confirm).
    cleaned_date = raw_date.replace("Local", "")
    try:
        parsed_date = parser.parse(cleaned_date)
    except (ValueError, OverflowError):
        # dateutil signals unparseable input with ValueError (ParserError is a
        # subclass) and out-of-range components with OverflowError.
        return doc

    entity['day'] = parsed_date.date()
    return doc

### Data loading

In [None]:
# S3 prefix holding the input files; the loading cell reads it as binary PDFs.
s3_path = "s3://aryn-public/ntsb/"
# LLM handle handed to the schema and property extractors in the extraction cell.
llm = OpenAI(OpenAIModels.GPT_3_5_TURBO.value)
# Tokenizer handed to GreedyTextElementMerger in the extraction cell.
tokenizer = HuggingFaceTokenizer("thenlper/gte-small")

# Sycamore context; the loading cell starts its pipeline from ctx.read.
ctx = sycamore.init()

In [None]:
# Build the initial docset in named stages: read the PDFs under s3_path as
# binary documents, cap the number of documents, then partition them.
raw_pdf_docset = ctx.read.binary(s3_path, parallelism=4, binary_format="pdf")
limited_pdf_docset = raw_pdf_docset.limit(1)  # limit this for testing performance
docset = limited_pdf_docset.partition(
    partitioner=UnstructuredPdfPartitioner(min_partition_length=200)
)

### Data extraction

In [None]:
# Extraction stages, spelled out one at a time. `docset` is rebound only once,
# on the last line, exactly as the single chained expression would do.
schema_docset = docset.extract_batch_schema(
    schema_extractor=OpenAISchemaExtractor("FlightAccidentReport", llm=llm, num_of_elements=35)
)
properties_docset = schema_docset.extract_properties(
    property_extractor=OpenAIPropertyExtractor(llm=llm, num_of_elements=35)
)
# Merge elements with the greedy text merger, using `tokenizer` and a limit of 300.
docset = properties_docset.merge(GreedyTextElementMerger(tokenizer, 300))

In [None]:
# Hand convert_timestamp to docset.map so a "day" value can be stored next to
# the extracted "dateAndTime" in each document's 'entity' properties.
docset = docset.map(convert_timestamp)

In [None]:
# Pull documents out of the docset for inspection in the cells below.
# NOTE(review): take() is called without arguments, so the number of documents
# returned is whatever its default is — confirm. Presumably this is also the
# point where the pipeline actually executes.
sample_docs = docset.take()

In [None]:
# Show the '_schema' property of the first sampled document
# (presumably written by extract_batch_schema — confirm).
sample_docs[0].properties['_schema']

In [None]:
# Show the 'entity' property of the first sampled document; this is the
# mapping convert_timestamp reads "dateAndTime" from and writes "day" into.
sample_docs[0].properties['entity']

In [None]:
# Print a short per-document summary: the source path, then a fixed set of
# entity fields (the string 'None' stands in for any field that is absent).
for sample_doc in sample_docs:
    print(f"Path: {sample_doc.properties['path']}")
    for field_name in ("dateAndTime", "day", "aircraft", "location"):
        field_value = sample_doc.properties['entity'].get(field_name, 'None')
        print(f"{field_name: <25} {field_value}")
    print()

## Indexing

In [None]:
from sycamore.transforms.embed import SentenceTransformerEmbedder

# Name of the OpenSearch index the docset is written to.
index = "ntsb"

# OpenSearch client settings. Host, port and credentials can be overridden with
# the OPENSEARCH_HOST / OPENSEARCH_PORT / OPENSEARCH_USER / OPENSEARCH_PASSWORD
# environment variables so real credentials never need to be committed with the
# notebook; the defaults keep the previous localhost:9200 / admin:admin values.
os_client_args = {
    "hosts": [
        {
            "host": os.environ.get("OPENSEARCH_HOST", "localhost"),
            "port": int(os.environ.get("OPENSEARCH_PORT", "9200")),
        }
    ],
    "http_compress": True,
    "http_auth": (
        os.environ.get("OPENSEARCH_USER", "admin"),
        os.environ.get("OPENSEARCH_PASSWORD", "admin"),
    ),
    "use_ssl": True,
    # NOTE(review): certificate and hostname verification are disabled below.
    # Presumably this targets a local instance with a self-signed certificate;
    # do not reuse these settings against a shared cluster.
    "verify_certs": False,
    "ssl_assert_hostname": False,
    "ssl_show_warn": False,
    "timeout": 120,
}

# Settings passed as `index_settings` when the docset is written to OpenSearch.
index_settings = {
    "body": {
        # Index-level settings: k-NN switched on, plus shard/replica counts.
        "settings": {
            "index.knn": True,
            "number_of_shards": 5,
            "number_of_replicas": 1,
        },
        # Field mappings: a single k-NN vector field named "embedding".
        "mappings": {
            "properties": {
                "embedding": {
                    # NOTE(review): must equal the vector size produced by the
                    # embedder used later in the notebook — presumably 384 for
                    # all-MiniLM-L6-v2; confirm.
                    "dimension": 384,
                    "method": {
                        "engine": "faiss",
                        "space_type": "l2",
                        "name": "hnsw",
                        "parameters": {},
                    },
                    "type": "knn_vector",
                },
            },
        },
    },
}

In [None]:
# Prepare the docset for indexing, one named stage per transform. `docset` is
# rebound only on the final line, as in a single chained expression.
spread_docset = docset.spread_properties(["entity"])
exploded_docset = spread_docset.explode()
sketched_docset = exploded_docset.sketch()
# NOTE(review): elements were merged earlier with a limit of 300 measured by
# the "thenlper/gte-small" tokenizer, but are embedded here with
# all-MiniLM-L6-v2 — confirm this embedder accepts inputs of that length
# without truncating them.
docset = sketched_docset.embed(
    embedder=SentenceTransformerEmbedder(
        batch_size=100,
        model_name="sentence-transformers/all-MiniLM-L6-v2",
    )
)

In [None]:
# Write the docset to OpenSearch using the client arguments, index name and
# index settings defined in the indexing configuration cell.
opensearch_write_kwargs = {
    "os_client_args": os_client_args,
    "index_name": index,
    "index_settings": index_settings,
}
docset.write.opensearch(**opensearch_write_kwargs)