The Aryn Partitioner in this job is configured to use the Aryn Partitioning Service to provide fast, GPU-powered performance. Go to [aryn.ai/get-started](https://aryn.ai/get-started) to get a free API key for the service. This is the recommended configuration.

You can also run the Aryn Partitioner locally by setting `use_partitioning_service` to `False`. Although the Aryn Partitioner can run on a CPU, an NVIDIA GPU is recommended for good performance.

In [None]:
# First some imports
import pyarrow.fs
import sycamore
from sycamore.functions.tokenizer import HuggingFaceTokenizer
from sycamore.llms import OpenAIModels, OpenAI
from sycamore.transforms import COALESCE_WHITESPACE
from sycamore.transforms.merge_elements import MarkedMerger
from sycamore.transforms.partition import ArynPartitioner
from sycamore.transforms.extract_schema import OpenAISchemaExtractor, OpenAIPropertyExtractor
from sycamore.transforms.embed import SentenceTransformerEmbedder

In [None]:
# Here's a function to convert llm-generated date/time strings into well-structured datetime strings
# We also extract the exact day, month, and year as integers in order to do range filtering in our
# queries
from sycamore.data.document import Document
from dateutil import parser
def convert_timestamp(doc: Document) -> Document:
    """Normalize the LLM-extracted date/time stored on ``doc``, in place.

    Reads ``doc.properties['entity']['dateAndTime']`` (falling back to
    ``'dateTime'``), parses it with ``dateutil`` in fuzzy mode, and writes back:

    * ``day``, ``month``, ``year`` as integers, for range filtering in queries
    * ``dateTime`` as an ISO-8601 string

    Args:
        doc: Document whose ``properties['entity']`` mapping holds the raw date.

    Returns:
        The same ``doc``. It is returned unchanged when neither key is present
        or when the extracted value is empty/null.

    Raises:
        Whatever ``dateutil.parser.parse`` raises for a string it cannot parse.
    """
    entity = doc.properties['entity']
    raw_date = entity.get('dateAndTime') or entity.get('dateTime')
    if not raw_date:
        # Covers both "key missing" and "key present but null/empty"; the latter
        # used to crash on .replace() or inside the parser.
        return doc

    # Drop the literal "Local" marker before handing the string to the parser.
    parsed_date = parser.parse(raw_date.replace("Local", ""), fuzzy=True)

    entity['day'] = parsed_date.day
    entity['month'] = parsed_date.month
    entity['year'] = parsed_date.year

    timestamp = parsed_date.isoformat()
    # utcoffset() is None only for naive datetimes. Compare against None rather
    # than relying on truthiness: an explicit +00:00 offset is timedelta(0),
    # which is falsy and would otherwise get a second, invalid "Z" suffix.
    if parsed_date.utcoffset() is None:
        timestamp += "Z"
    entity['dateTime'] = timestamp

    return doc

In [None]:
# Fail early, before any pipeline work, if no Aryn API key is configured.
# The assertion message reports _DEFAULT_PATH as the location that was checked.
from sycamore.utils.aryn_config import ArynConfig, _DEFAULT_PATH
assert ArynConfig.get_aryn_api_key() != "", f"Unable to find aryn API key.  Looked in {_DEFAULT_PATH}"

If the above assertion fails, you can either set the environment variable `ARYN_API_KEY` and restart Jupyter,
or create a YAML file at the path shown in the assertion error that looks like:

```
aryn_token: "YOUR-ARYN-API-KEY"
```

It is unsafe, but if neither of those options works, you can set the key in this notebook with
```
import os
os.environ["ARYN_API_KEY"] = "UNSAFE-ARYN-API-KEY-LOCATION" 
```

but beware that it is easy to accidentally commit the notebook file and have it include your key.

In [None]:
# Input: PDFs under this S3 prefix, read with an anonymous (unauthenticated) S3 client.
paths = ["s3://aryn-public/ntsb/"]
fsys = pyarrow.fs.S3FileSystem(region="us-east-1", anonymous=True)

# LLM used for schema and property extraction below.
llm = OpenAI(OpenAIModels.GPT_3_5_TURBO.value)
# Tokenizer used for chunking/splitting and for term frequencies below; `s_vec` later
# in this notebook reuses it to build sparse query vectors.
tokenizer = HuggingFaceTokenizer("thenlper/gte-small")

ctx = sycamore.init()

# Main ingest pipeline. Note the use of `.term_frequency()`, which will enable hybrid search in pinecone
ds = (
    ctx.read.binary(paths, binary_format="pdf", filesystem=fsys)
    # Partition with the Aryn partitioner remotely, pulling out tables and images.
    .partition(partitioner=ArynPartitioner(extract_images=True,  extract_table_structure=True))
    # Get rid of spurious whitespace characters
    .regex_replace(COALESCE_WHITESPACE)
    # Automatically determine a schema of additional metadata to extract from documents
    .extract_batch_schema(schema_extractor=OpenAISchemaExtractor("FlightAccidentReport", llm=llm, num_of_elements=35))
    # Extract the metadata specified by that schema
    .extract_properties(property_extractor=OpenAIPropertyExtractor(llm=llm, num_of_elements=35))
    # Merge elements into larger chunks
    .mark_bbox_preset(tokenizer=tokenizer)
    .merge(merger=MarkedMerger())
    # Convert extracted timestamps to better-structured form using the function above
    .map(convert_timestamp)
    # Copy document properties to each document's sub-elements
    .spread_properties(["path", "entity"])
    # Split elements that are too big to embed
    .split_elements(tokenizer=tokenizer, max_tokens=512)
    # Convert all Elements to Documents
    .explode()
    # Generate a series of hashes to represent each document. For use with near-duplicate detection
    .sketch()
    # Embed each document
    .embed(embedder=SentenceTransformerEmbedder(model_name="sentence-transformers/all-MiniLM-L6-v2", batch_size=100))
    # Count the number of occurrences of every token for each document
    .term_frequency(tokenizer=tokenizer, with_token_ids=True)
)

In [None]:
# Write to pinecone
# - into an index named 'ntsb'
# - into a serverless index in aws/us-east-1
# - into a 384-dimensional index. The embedding model we used (all-MiniLM-L6-v2) generates 384-dimensional vectors
# - into an index using the dotproduct distance metric. This is necessary to do sparse vector search
# NOTE(review): presumably the Pinecone credentials are read from the environment — confirm.
import pinecone
# NOTE(review): `os` is not referenced anywhere in this cell.
import os

ds.write.pinecone(
    index_name="ntsb",
    index_spec=pinecone.ServerlessSpec(cloud="aws", region="us-east-1"),
    dimensions=384,
    distance_metric="dotproduct",
)

In [None]:
# Tell me about my index: create a Pinecone client and ask for the description of 'ntsb'.
import pinecone
# No arguments are passed to the client; presumably it picks up its API key from the
# environment — confirm.
pc = pinecone.Pinecone()
pc.describe_index("ntsb")

In [None]:
# How many things are in my index?
# `ntsb` is the index handle that the query helpers below call `.query()` on.
ntsb = pc.Index(name="ntsb")
ntsb.describe_index_stats()

In [None]:
# Pinecone needs us to generate our own query vectors, so we define a function to simplify this
# The model name matches the one given to SentenceTransformerEmbedder in the ingest
# pipeline, so query vectors and stored vectors come from the same model.
from sentence_transformers import SentenceTransformer
minilm = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

def q_vec(question):
    """Return the dense query vector for ``question``.

    The text is encoded with the module-level ``minilm`` model and the
    encoded result is converted with ``tolist()`` before being returned.
    """
    return minilm.encode(question).tolist()

In [None]:
# Function to query the ntsb index with only dense semantic search (using q_vec to generate the vector)
def pure_semantic_query(question):
    """Query the ``ntsb`` index with only the dense vector for ``question``.

    Returns the index's response for the top 5 matches, with metadata included.
    """
    dense_vector = q_vec(question)
    return ntsb.query(
        vector=dense_vector,
        top_k=5,
        include_metadata=True,
    )

In [None]:
# Run a dense-only query and keep the response in `r` for display.
r = pure_semantic_query("incidents in california")

In [None]:
# Pretty printing
def print_results_pretty(results):
    """Print a human-readable summary of every match in a query response.

    Args:
        results: Mapping-like query response. Its ``"matches"`` entry is read
            as a sequence of hits; each hit's ``"metadata"`` mapping is read
            for the flattened ``properties.entity.*`` keys and
            ``text_representation``.

    A response with no matches prints nothing. A hit without metadata, or with
    an incomplete date, is printed with ``UNKNOWN`` in the affected fields
    instead of raising or showing ``-1`` placeholders.
    """
    for hit in results.get("matches") or []:
        m = hit.get("metadata") or {}

        date_parts = [m.get(f"properties.entity.{part}") for part in ("year", "month", "day")]
        if any(part is None for part in date_parts):
            # Same placeholder the other fields use when a value is missing.
            date = "UNKNOWN"
        else:
            # int() drops a fractional form such as 2023.0; presumably the store
            # returns numeric metadata as floats — confirm.
            date = "-".join(str(int(part)) for part in date_parts)

        print(f"{m.get('properties.entity.accidentNumber', 'UNKNOWN')} {'='*80}")
        print(f"Aircraft: {m.get('properties.entity.aircraft', 'UNKNOWN')}")
        print(f"Location: {m.get('properties.entity.location', 'UNKNOWN')}")
        print(f"Date:     {date}")
        print(f"Damage:   {m.get('properties.entity.aircraftDamage', 'UNKNOWN')}")
        print(f"Text:     {m.get('text_representation')}")

In [None]:
# Display the hits held in `r`, the response from the dense-only query.
print_results_pretty(r)

In [None]:
# We also have to generate our own sparse vectors. Note that the pinecone implementation of
# sparse vector search makes it difficult to perform BM-25 (TF/IDF) search. Instead we use 
# pure term frequency, which will cause some common words to be inordinately 'meaningful'.
from collections import Counter

def s_vec(question):
    """Build a term-frequency sparse vector for ``question``.

    The question is tokenized to integer ids with the module-level
    ``tokenizer``. The result maps ``"indices"`` to the distinct token ids
    (in first-seen order) and ``"values"`` to their counts as floats.
    """
    counts = Counter(tokenizer.tokenize(question, as_ints=True))
    return {
        "indices": list(counts),
        "values": [float(count) for count in counts.values()],
    }

In [None]:
# For example... ("little" appears twice, so its token id should get a value of 2.0)
s_vec("Mary had a little little lamb")

In [None]:
# Function to combine sparse (TF) and dense (embedding) vectors into a 
# query. Alpha is the weight to give to the dense vector as opposed to
# the sparse vector... alpha=1 means only care about the dense vector 
# and alpha=0 means only care about the sparse vector.
def hybrid_query(question, alpha=0.8):
    """Query the ``ntsb`` index with a weighted dense + sparse vector pair.

    Every dense component is multiplied by ``alpha`` and every sparse value by
    ``1 - alpha`` before the query is issued. Returns the index's response for
    the top 5 matches, with metadata included.
    """
    dense = [component * alpha for component in q_vec(question)]

    sparse = s_vec(question)
    sparse['values'] = [count * (1 - alpha) for count in sparse['values']]

    return ntsb.query(
        top_k=5,
        vector=dense,
        sparse_vector=sparse,
        include_metadata=True,
    )


In [None]:
# Same question as the dense-only query, now with the sparse vector weighted in (alpha=0.8).
rh = hybrid_query("incidents in california", alpha=0.8)

In [None]:
# Display the hits held in `rh`, the response from the hybrid query.
print_results_pretty(rh)

In [None]:
# When we're looking for specific pieces of information, 'fuzzy' hybrid/semantic search
# won't quite cut it. In this example, we get a bunch of results outside of the specified
# time window, because the date constraint is only expressed in the question text and
# no metadata filter is applied.
print_results_pretty(hybrid_query("incidents in the last 2 weeks of january 2023"))

In [None]:
# Instead we can add filters.
def hybrid_query_filtered(question, filter, alpha=0.8):
    """Run a weighted dense + sparse query on ``ntsb`` restricted by ``filter``.

    Every dense component is multiplied by ``alpha`` and every sparse value by
    ``1 - alpha``; ``filter`` is forwarded unchanged to the index query.
    Returns the index's response for the top 5 matches, with metadata included.
    """
    dense = [component * alpha for component in q_vec(question)]

    sparse = s_vec(question)
    sparse['values'] = [count * (1 - alpha) for count in sparse['values']]

    return ntsb.query(
        top_k=5,
        vector=dense,
        sparse_vector=sparse,
        include_metadata=True,
        filter=filter,
    )

In [None]:
# This filter says year=2023, month=1, and day>=17 (31 - 14 = 17)
# When we include it, all of our search results are from the correct time window
# NOTE(review): day >= 17 covers Jan 17-31, which is 15 days; use 18 for exactly 14 days.
# NOTE(review): the name `filter` shadows the Python builtin for the rest of the notebook.
filter = {
    "$and": [
        {"properties.entity.year": {"$eq": 2023}},
        {"properties.entity.month": {"$eq": 1}},
        {"properties.entity.day": {"$gte": 17}}
    ]
}
print_results_pretty(hybrid_query_filtered("incidents in the last two weeks of january", filter=filter))