In [None]:
# First some imports
import pyarrow.fs
from ray.data import ActorPoolStrategy
import sycamore
from sycamore.functions.tokenizer import HuggingFaceTokenizer
from sycamore.llms import OpenAIModels, OpenAI
from sycamore.transforms import COALESCE_WHITESPACE
from sycamore.transforms.merge_elements import MarkedMerger
from sycamore.transforms.partition import SycamorePartitioner
from sycamore.transforms.extract_schema import OpenAISchemaExtractor, OpenAIPropertyExtractor
from sycamore.transforms.embed import SentenceTransformerEmbedder

In [None]:
# Here's a function to convert llm-generated date/time strings into well-structured datetime strings
from sycamore.data.document import Document
from dateutil import parser
def convert_timestamp(doc: Document) -> Document:
    """Normalize the LLM-extracted date/time stored on ``doc``, in place.

    Reads ``doc.properties['entity']['dateAndTime']`` (falling back to
    ``['dateTime']``), strips the literal word "Local", and parses what is left
    with ``dateutil`` in fuzzy mode. On success it writes two entity fields:

    * ``day``      - the calendar date in ISO-8601 form.
    * ``dateTime`` - the full timestamp in ISO-8601 form. A timestamp with no
      timezone information gets a trailing "Z"; a timezone-aware one keeps the
      offset that ``isoformat()`` already emits.

    Args:
        doc: Document whose ``properties['entity']`` mapping may hold the raw
            date string.

    Returns:
        The same ``doc`` object. It is returned unmodified when there is no
        ``entity`` mapping, no usable date string, or the string cannot be
        parsed.
    """
    entity = doc.properties.get('entity')
    if not entity:
        return doc

    raw_date = entity.get('dateAndTime') or entity.get('dateTime')
    # The key can be present but hold None / "" / a non-string value; none of
    # those can be parsed, so leave the document as it is.
    if not isinstance(raw_date, str) or not raw_date.strip():
        return doc

    try:
        parsed_date = parser.parse(raw_date.replace("Local", ""), fuzzy=True)
    except (ValueError, OverflowError):
        # dateutil signals an unparseable string with ParserError (a ValueError
        # subclass) or OverflowError. The input is free-form LLM output, so one
        # bad value should not abort the whole ingest.
        return doc

    entity['day'] = parsed_date.date().isoformat()

    timestamp = parsed_date.isoformat()
    # Compare against None rather than relying on truthiness: a UTC-aware
    # datetime has utcoffset() == timedelta(0), which is falsy, and would
    # otherwise end up as "...+00:00Z".
    if parsed_date.utcoffset() is None:
        timestamp += "Z"
    entity['dateTime'] = timestamp

    return doc

In [None]:
# Input location: the s3://aryn-public/ntsb/ prefix, accessed anonymously.
fsys = pyarrow.fs.S3FileSystem(anonymous=True, region="us-east-1")
paths = ["s3://aryn-public/ntsb/"]

llm = OpenAI(OpenAIModels.GPT_3_5_TURBO.value)
# The same tokenizer instance drives bbox marking, element splitting and term frequency.
tokenizer = HuggingFaceTokenizer("thenlper/gte-small")

ctx = sycamore.init()

# Configured stage objects, named up front so the chain below reads as a list of steps.
pdf_partitioner = SycamorePartitioner(extract_table_structure=True, extract_images=True)
report_schema_extractor = OpenAISchemaExtractor("FlightAccidentReport", llm=llm, num_of_elements=35)
report_property_extractor = OpenAIPropertyExtractor(llm=llm, num_of_elements=35)
chunk_embedder = SentenceTransformerEmbedder(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    batch_size=100,
)

# Main ingest pipeline. The final `.term_frequency()` step is what enables hybrid search in pinecone.
ds = (
    ctx.read.binary(paths, binary_format="pdf", filesystem=fsys)
    .partition(partitioner=pdf_partitioner, compute=ActorPoolStrategy(size=4))
    .regex_replace(COALESCE_WHITESPACE)
    .extract_batch_schema(schema_extractor=report_schema_extractor)
    .extract_properties(property_extractor=report_property_extractor)
    .mark_bbox_preset(tokenizer=tokenizer)
    .merge(merger=MarkedMerger())
    .map(convert_timestamp)
    .spread_properties(["path", "entity"])
    .split_elements(tokenizer=tokenizer, max_tokens=512)
    .explode()
    .sketch()
    .embed(embedder=chunk_embedder)
    .term_frequency(tokenizer=tokenizer, with_token_ids=True)
)

In [None]:
# Write to pinecone
import pinecone
# Serverless index placement for the "ntsb" index.
serverless_spec = pinecone.ServerlessSpec(cloud="aws", region="us-east-1")

# NOTE(review): dimensions=384 presumably matches the output width of the
# embedding model used in the ingest pipeline — confirm before changing either.
ds.write.pinecone(
    index_name="ntsb",
    dimensions=384,
    distance_metric="dotproduct",
    index_spec=serverless_spec,
)

In [None]:
# The client is built with no arguments, so credentials must come from the
# environment (presumably PINECONE_API_KEY — confirm against the client docs).
pc = pinecone.Pinecone()
# Last expression of the cell: shows the description of the "ntsb" index.
pc.describe_index("ntsb")

In [None]:
# Handle to the "ntsb" index; the query helpers in this notebook call ntsb.query().
ntsb = pc.Index(name="ntsb")
# Last expression of the cell: shows the index statistics.
ntsb.describe_index_stats()

In [None]:
# Pinecone needs us to generate our own query vectors
from sentence_transformers import SentenceTransformer
# Same model name as the embedder in the ingest pipeline, so query vectors and
# stored document vectors come from one embedding model.
minilm = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

def q_vec(question):
    """Return the dense query vector for ``question``.

    The text is encoded with the module-level ``minilm`` model and the result
    is converted with ``.tolist()`` so it can be passed to the index query.
    """
    embedding = minilm.encode(question)
    return embedding.tolist()

In [None]:
# Query the ntsb index with only dense semantic search
def pure_semantic_query(question, top_k=5):
    """Query the ``ntsb`` index using only the dense embedding of ``question``.

    Args:
        question: Query text; embedded with ``q_vec``.
        top_k: Number of matches to request. Defaults to 5, the value that was
            previously hard-coded, so existing calls behave the same.

    Returns:
        The response from ``ntsb.query``, with metadata included on each match.
    """
    return ntsb.query(
        top_k=top_k,
        vector=q_vec(question),
        include_metadata=True,
    )

In [None]:
# Keep the raw response in `r` so a later cell can pretty-print it.
r = pure_semantic_query("plane crashes in california")

In [None]:
# Pretty printing
def print_results_pretty(results):
    """Print a short, human-readable summary of each match in ``results``.

    Args:
        results: Mapping-like query response. Matches are read from its
            ``"matches"`` entry and each match's fields from its ``"metadata"``
            entry.

    For every match this prints the accident number followed by a rule of 80
    ``=`` characters, then the aircraft, location, date, damage and text
    representation. Missing entity fields print as ``UNKNOWN``. A response with
    no matches prints nothing, and a match without metadata is printed with
    every field missing instead of raising.
    """
    # "matches" may be absent or None; treat both as "no hits".
    for hit in results.get("matches") or []:
        # Metadata is absent when the query did not request it.
        m = hit.get("metadata") or {}
        print(f"{m.get('properties.entity.accidentNumber', 'UNKNOWN')} {'='*80}")
        print(f"Aircraft: {m.get('properties.entity.aircraft', 'UNKNOWN')}")
        print(f"Location: {m.get('properties.entity.location', 'UNKNOWN')}")
        print(f"Date:     {m.get('properties.entity.day', 'UNKNOWN')}")
        print(f"Damage:   {m.get('properties.entity.aircraftDamage', 'UNKNOWN')}")
        print(f"Text:     {m.get('text_representation')}")

In [None]:
# Show the dense-only matches stored in `r`.
print_results_pretty(r)

In [None]:
# We also have to generate our own sparse vectors. Note that the pinecone implementation of
# sparse vector search makes it difficult to perform BM-25 (TF/IDF) search. Instead we use 
# pure term frequency, which will cause some common words to be inordinately 'meaningful'.
from collections import Counter

def s_vec(question):
    """Build a term-frequency sparse vector for ``question``.

    The text is tokenized to integer ids with the module-level ``tokenizer``.
    The result has the distinct ids, in first-seen order, under ``"indices"``
    and their occurrence counts as floats under ``"values"``.
    """
    counts = Counter(tokenizer.tokenize(question, as_ints=True))
    return {
        "indices": list(counts),
        "values": [float(n) for n in counts.values()],
    }

In [None]:
# Sanity check: "little" appears twice, so one entry in "values" should be 2.0
# (assuming the tokenizer maps "little" to a single token — TODO confirm).
s_vec("Mary had a little little lamb")

In [None]:
# Function to combine sparse (TF) and dense (embedding) vectors into a 
# query. Alpha is the weight to give to the dense vector as opposed to
# the sparse vector... alpha=1 means only care about the dense vector 
# and alpha=0 means only care about the sparse vector.
def hybrid_query(question, alpha=0.8, top_k=5):
    """Query the ``ntsb`` index with a weighted mix of dense and sparse vectors.

    Args:
        question: Query text; embedded with ``q_vec`` and counted with ``s_vec``.
        alpha: Weight of the dense vector, between 0 and 1 inclusive. Dense
            values are scaled by ``alpha`` and sparse values by ``1 - alpha``.
        top_k: Number of matches to request. Defaults to 5, the value that was
            previously hard-coded.

    Returns:
        The response from ``ntsb.query``, with metadata included on each match.

    Raises:
        ValueError: If ``alpha`` is outside ``[0, 1]``. A weight outside that
            range would make one component count negatively.
    """
    # Validate first so a bad weight fails before any model or network work.
    if not 0 <= alpha <= 1:
        raise ValueError(f"alpha must be between 0 and 1, got {alpha!r}")

    query_args = {
        "top_k": top_k,
        "vector": [v * alpha for v in q_vec(question)],
        "include_metadata": True,
    }

    sparse = s_vec(question)
    # An empty sparse vector, or one weighted by zero (alpha == 1), cannot
    # affect the ranking, so it is left out of the request entirely.
    if alpha < 1 and sparse["indices"]:
        query_args["sparse_vector"] = {
            "indices": sparse["indices"],
            "values": [v * (1 - alpha) for v in sparse["values"]],
        }

    return ntsb.query(**query_args)


In [None]:
# Same question as the dense-only query, with dense values scaled by 0.8 and sparse values by 0.2.
rh = hybrid_query("plane crashes in california", alpha=0.8)

In [None]:
# Show the hybrid matches stored in `rh`.
print_results_pretty(rh)

In [None]:
# When we're looking for specific pieces of information, 'fuzzy' hybrid/semantic search
# won't quite cut it: this query passes no metadata filter, so nothing restricts the
# matches to reports whose aircraftDamage is actually "Destroyed".
print_results_pretty(hybrid_query("plane crashes where the plane was destroyed"))

In [None]:
# Instead we can add filters.
def hybrid_query_filtered(question, filter, alpha=0.8, top_k=5):
    """Query the ``ntsb`` index with dense and sparse vectors plus a metadata filter.

    Args:
        question: Query text; embedded with ``q_vec`` and counted with ``s_vec``.
        filter: Metadata filter passed through unchanged to ``ntsb.query``.
            The name shadows the builtin but is kept because callers pass it
            by keyword.
        alpha: Weight of the dense vector, between 0 and 1 inclusive. Dense
            values are scaled by ``alpha`` and sparse values by ``1 - alpha``.
        top_k: Number of matches to request. Defaults to 5, the value that was
            previously hard-coded.

    Returns:
        The response from ``ntsb.query``, with metadata included on each match.

    Raises:
        ValueError: If ``alpha`` is outside ``[0, 1]``. A weight outside that
            range would make one component count negatively.
    """
    # Validate first so a bad weight fails before any model or network work.
    if not 0 <= alpha <= 1:
        raise ValueError(f"alpha must be between 0 and 1, got {alpha!r}")

    query_args = {
        "top_k": top_k,
        "vector": [v * alpha for v in q_vec(question)],
        "include_metadata": True,
        "filter": filter,
    }

    sparse = s_vec(question)
    # An empty sparse vector, or one weighted by zero (alpha == 1), cannot
    # affect the ranking, so it is left out of the request entirely.
    if alpha < 1 and sparse["indices"]:
        query_args["sparse_vector"] = {
            "indices": sparse["indices"],
            "values": [v * (1 - alpha) for v in sparse["values"]],
        }

    return ntsb.query(**query_args)

In [None]:
# Only consider records whose extracted aircraftDamage equals "Destroyed".
destroyed_only = {"properties.entity.aircraftDamage": {"$eq": "Destroyed"}}
destroyed_hits = hybrid_query_filtered(
    "plane crashes where the plane was destroyed",
    filter=destroyed_only,
)
print_results_pretty(destroyed_hits)