In [11]:
import pyarrow.fs
from ray.data import ActorPoolStrategy
import sycamore
from sycamore.functions.tokenizer import HuggingFaceTokenizer
from sycamore.llms import OpenAIModels, OpenAI
from sycamore.transforms import COALESCE_WHITESPACE
from sycamore.transforms.merge_elements import MarkedMerger
from sycamore.transforms.partition import SycamorePartitioner
from sycamore.transforms.extract_schema import OpenAISchemaExtractor, OpenAIPropertyExtractor
from sycamore.transforms.embed import SentenceTransformerEmbedder

In [12]:
from sycamore.data.document import Document
from dateutil import parser
def convert_timestamp(doc: Document) -> Document:
    """Normalize the extracted timestamp stored in ``doc.properties['entity']``.

    Reads the raw date string from the ``dateAndTime`` key (falling back to
    ``dateTime``), parses it with ``dateutil`` in fuzzy mode, and writes back:

    * ``day``, ``month``, ``year`` -- components of the parsed date
    * ``dateTime`` -- ISO-8601 string; a trailing ``Z`` is appended only when
      the parsed value carries no UTC offset at all

    The document is returned unchanged when there is no ``entity`` mapping,
    when neither key holds a non-empty string, or when the string cannot be
    parsed as a date.

    Args:
        doc: Document whose ``properties['entity']`` may hold a date string.

    Returns:
        The same ``doc`` object, updated in place when a date was parsed.
    """
    entity = doc.properties.get("entity")
    if not entity:
        return doc

    raw_date = entity.get("dateAndTime") or entity.get("dateTime")
    if not isinstance(raw_date, str) or not raw_date.strip():
        # Key absent, or present with a None/empty/non-string value: nothing to convert.
        return doc

    # Drop the literal "Local" marker before parsing
    # (presumably a "local time" annotation in the reports -- confirm).
    cleaned = raw_date.replace("Local", "")
    try:
        parsed_date = parser.parse(cleaned, fuzzy=True)
    except (ValueError, OverflowError):
        # dateutil reports unparseable input via ValueError (ParserError) or
        # OverflowError. Leave the document as extracted rather than failing
        # the whole pipeline on one bad value.
        return doc

    entity["day"] = parsed_date.day
    entity["month"] = parsed_date.month
    entity["year"] = parsed_date.year

    # Compare against None explicitly: a zero offset (UTC) is a falsy timedelta,
    # and treating it as "no offset" would yield "...+00:00Z".
    if parsed_date.utcoffset() is not None:
        entity["dateTime"] = parsed_date.isoformat()
    else:
        entity["dateTime"] = parsed_date.isoformat() + "Z"

    return doc

In [13]:
# Source PDFs live under a public S3 prefix; the filesystem is opened anonymously (no credentials).
paths = ["s3://aryn-public/ntsb/"]
fsys = pyarrow.fs.S3FileSystem(region="us-east-1", anonymous=True)

# LLM used for schema/property extraction, and tokenizer used for chunk sizing.
llm = OpenAI(OpenAIModels.GPT_3_5_TURBO.value)
tokenizer = HuggingFaceTokenizer("thenlper/gte-small")

ctx = sycamore.init()

ds = (
    ctx.read.binary(paths, binary_format="pdf", filesystem=fsys)
    # Partition with the Sycamore partitioner, pulling out tables and images.
    # ActorPoolStrategy(size=3) worked best on the author's hardware; tune it to your available RAM.
    .partition(
        partitioner=SycamorePartitioner(extract_table_structure=True, extract_images=True),
        compute=ActorPoolStrategy(size=3),
    )
    # Collapse spurious whitespace characters
    .regex_replace(COALESCE_WHITESPACE)
    # Let the LLM propose a schema of additional metadata to extract from the documents
    .extract_batch_schema(
        schema_extractor=OpenAISchemaExtractor("FlightAccidentReport", llm=llm, num_of_elements=35)
    )
    # Extract the metadata described by that schema
    .extract_properties(
        property_extractor=OpenAIPropertyExtractor(llm=llm, num_of_elements=35)
    )
    # Mark and merge elements into larger chunks
    .mark_bbox_preset(tokenizer=tokenizer)
    .merge(merger=MarkedMerger())
    # Rewrite extracted timestamps into a structured form (see convert_timestamp)
    .map(convert_timestamp)
    # Copy document-level properties onto each of the document's sub-elements
    .spread_properties(["path", "entity"])
    # Split elements that are too large to embed
    .split_elements(tokenizer=tokenizer, max_tokens=512)
    # Turn every Element into its own Document
    .explode()
    # Compute a set of hashes per document, for near-duplicate detection
    .sketch()
    # Embed each document
    .embed(
        embedder=SentenceTransformerEmbedder(
            model_name="sentence-transformers/all-MiniLM-L6-v2",
            batch_size=100,
        )
    )
)

In [14]:
# Persist the processed documents to a DuckDB database:
# - `persistent_db` names the database (passed as db_url)
# - `persistent_table` names the table the documents are written to
persistent_db = "demo.db"
persistent_table = "demo_table"
ds.write.duckdb(db_url=persistent_db, table_name=persistent_table)

2024-06-27 15:39:00,438	INFO streaming_executor.py:112 -- Starting execution of Dataset. Full logs are in /tmp/ray/session_2024-06-27_12-48-51_807856_34612/logs/ray-data
2024-06-27 15:39:00,438	INFO streaming_executor.py:113 -- Execution plan of Dataset: InputDataBuffer[Input] -> ActorPoolMapOperator[ReadBinary->Map(BinaryScan._to_document)->MapBatches(BaseMapTransformCallable___wrap)] -> ActorPoolMapOperator[MapBatches(regex_replace)->MapBatches(BaseMapTransformCustom__Extract)] -> ActorPoolMapOperator[MapBatches(extract_properties)->MapBatches(sort_by_page_bbox)->MapBatches(mark_drop_tiny)->MapBatches(mark_drop_header_and_footer)->MapBatches(mark_break_page)->MapBatches(mark_break_by_column)->MapBatches(mark_break_by_tokens)->MapBatches(merge_elements)->MapBatches(convert_timestamp)->MapBatches(spread_properties)->MapBatches(split_doc)->MapBatches(explode)->MapBatches(sketcher)->MapBatches(SentenceTransformerEmbedder)->MapBatches(compute_term_frequency)->MapBatches(BaseMapTransformCa

- ReadBinary->Map(BinaryScan._to_document)->MapBatches(BaseMapTransformCallable___wrap) 1:   0%|          | 0/…

- MapBatches(regex_replace)->MapBatches(BaseMapTransformCustom__Extract) 2:   0%|          | 0/65 [00:00<?, ?i…

- MapBatches(extract_properties)->MapBatches(sort_by_page_bbox)->MapBatches(mark_drop_tiny)->MapBatches(mark_d…

Running 0:   0%|          | 0/65 [00:00<?, ?it/s]

In [26]:
# Open a connection to the DuckDB database so we can run queries against it
import duckdb

data_conn = duckdb.connect(database="demo.db")

In [56]:
# DuckDB is not production ready yet for efficient Vector Similarity Search on persistent databases
# (https://duckdb.org/docs/extensions/vss.html), so we load our data into an in-memory database for this demo.
# In future versions, once DuckDB productionizes this feature, this cell will no longer be needed.

# Load from disk into a pandas DataFrame as an intermediate step, dropping rows that have no embedding.
df = data_conn.execute("SELECT * FROM demo_table").fetchdf().dropna(subset=["embeddings"])

# Column types for the in-memory table. They are declared up front because the embedding column
# must be a fixed-size FLOAT[N] array for Vector Similarity Search with HNSW to work.
schema = {
    "doc_id": "VARCHAR",
    "embeddings": "FLOAT[384]",
    "properties": "MAP(VARCHAR, VARCHAR)",
    "text_representation": "VARCHAR",
    "bbox": "DOUBLE[]",
    "shingles": "BIGINT[]",
    "type": "VARCHAR",
}
column_defs = ", ".join(f"{name} {sql_type}" for name, sql_type in schema.items())

in_memory_db = duckdb.connect(":default:")
in_memory_db.install_extension("vss")
in_memory_db.load_extension("vss")

# CREATE OR REPLACE keeps this cell re-runnable: the ":default:" connection outlives a single cell
# execution, so a plain CREATE TABLE fails on the second run. Replacing the table also discards
# any indexes previously built on it, so the CREATE INDEX below does not collide either.
in_memory_db.execute(f"CREATE OR REPLACE TABLE in_memory_table ({column_defs})")

# `df` is resolved by DuckDB from the Python variable of the same name; SELECT * matches columns by
# position, so the DataFrame's column order must match the schema above.
in_memory_db.execute("INSERT INTO in_memory_table SELECT * FROM df")
in_memory_db.execute("CREATE INDEX in_memory_table_index ON in_memory_table USING HNSW(embeddings)")

<duckdb.duckdb.DuckDBPyConnection at 0x3444a1c30>

In [39]:
# Query-side embedding helper: questions are encoded with the same model name that was used to embed
# the documents, so query vectors and stored vectors can be compared directly.
from sentence_transformers import SentenceTransformer

minilm = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")


def embed(question):
    """Encode ``question`` with the MiniLM model and return the result converted via ``.tolist()``."""
    return minilm.encode(question).tolist()



In [65]:
# Run a nearest-neighbour query: rank rows by Euclidean distance between the stored embedding
# and the embedded question, and show the ten closest.
q = "What automobile type is the most accident prone?"
nearest_sql = f"SELECT doc_id, text_representation, properties FROM in_memory_table ORDER BY array_distance(embeddings, {embed(q)}::FLOAT[384]) LIMIT 10;"
in_memory_db.sql(nearest_sql)

┌──────────────────────┬──────────────────────┬────────────────────────────────────────────────────────────────────────┐
│        doc_id        │ text_representation  │                               properties                               │
│       varchar        │       varchar        │                         map(varchar, varchar)                          │
├──────────────────────┼──────────────────────┼────────────────────────────────────────────────────────────────────────┤
│ 7ddbb68b-c69a-4b24…  │ The National Trans…  │ {score=0.8915606737136841, page_numbers=[1], page_number=1, path=s3:…  │
│ 7e8e91e8-cf2a-468b…  │ The National Trans…  │ {score=0.8774372339248657, page_numbers=[1], page_number=1, path=s3:…  │
│ 7ddbb68b-c69a-4b24…  │ The National Trans…  │ {score=0.8915606737136841, page_numbers=[1], page_number=1, path=s3:…  │
│ 7e8e91e8-cf2a-468b…  │ The National Trans…  │ {score=0.8774372339248657, page_numbers=[1], page_number=1, path=s3:…  │
│ 7ddbb68b-c69a-4b24…  │ The Nat

In [66]:
# Same nearest-neighbour query as above, with a different search phrase.
q = "Traffic Statistics"
nearest_sql = f"SELECT doc_id, text_representation, properties FROM in_memory_table ORDER BY array_distance(embeddings, {embed(q)}::FLOAT[384]) LIMIT 10;"
in_memory_db.sql(nearest_sql)

┌──────────────────────┬──────────────────────┬────────────────────────────────────────────────────────────────────────┐
│        doc_id        │ text_representation  │                               properties                               │
│       varchar        │       varchar        │                         map(varchar, varchar)                          │
├──────────────────────┼──────────────────────┼────────────────────────────────────────────────────────────────────────┤
│ 3c82fabc-9f99-4dfe…  │ Airport Informatio…  │ {score=0.7310920357704163, page_numbers=[5], page_number=5, path=s3:…  │
│ 8f3d0214-9de3-48c3…  │ Airport Informatio…  │ {score=0.702387809753418, page_numbers=[5], page_number=5, path=s3:/…  │
│ 1c6083df-00a8-484e…  │ Airport Informatio…  │ {score=0.6993858218193054, page_numbers=[5], page_number=5, path=s3:…  │
│ b10638a0-2ea2-4d23…  │ Airport Informatio…  │ {score=0.7263772487640381, page_numbers=[5], page_number=5, path=s3:…  │
│ c0594fed-b67a-44b9…  │ Airport

In [67]:
# Prefix the query with EXPLAIN to inspect DuckDB's physical plan and check whether the index is used.
q = "What automobile type is the most accident prone?"
explain_sql = f"EXPLAIN SELECT doc_id, text_representation, properties FROM in_memory_table ORDER BY array_distance(embeddings, {embed(q)}::FLOAT[384]) LIMIT 10"
in_memory_db.sql(explain_sql)

┌───────────────┬──────────────────────────────────────────────────────────────────────────────────────────────────────┐
│  explain_key  │                                            explain_value                                             │
│    varchar    │                                               varchar                                                │
├───────────────┼──────────────────────────────────────────────────────────────────────────────────────────────────────┤
│ physical_plan │ ┌───────────────────────────┐\n│         PROJECTION        │\n│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │\n│    …  │
└───────────────┴──────────────────────────────────────────────────────────────────────────────────────────────────────┘

In [69]:
# Finally, we can build an index with another distance metric. The default is Euclidean distance ('l2sq');
# cosine similarity ('cosine', 'array_cosine_similarity') and inner product ('ip', 'array_inner_product')
# are also supported.
# IF NOT EXISTS keeps this cell re-runnable: a plain CREATE INDEX fails once the index already exists.
# NOTE(review): ascending ORDER BY on an inner product ranks the smallest product first when no index is
# used; newer DuckDB vss releases appear to provide array_negative_inner_product for this -- confirm
# against the installed version before changing the query.
# This cell reuses `q` from the previous cell.
in_memory_db.execute("CREATE INDEX IF NOT EXISTS in_memory_ip_index ON in_memory_table USING HNSW(embeddings) WITH (metric = 'ip')")
in_memory_db.sql(f"SELECT doc_id, text_representation, properties FROM in_memory_table ORDER BY array_inner_product(embeddings, {embed(q)}::FLOAT[384]) LIMIT 10;")

┌──────────────────────┬──────────────────────┬────────────────────────────────────────────────────────────────────────┐
│        doc_id        │ text_representation  │                               properties                               │
│       varchar        │       varchar        │                         map(varchar, varchar)                          │
├──────────────────────┼──────────────────────┼────────────────────────────────────────────────────────────────────────┤
│ 7ddbb68b-c69a-4b24…  │ The National Trans…  │ {score=0.8915606737136841, page_numbers=[1], page_number=1, path=s3:…  │
│ 7e8e91e8-cf2a-468b…  │ The National Trans…  │ {score=0.8774372339248657, page_numbers=[1], page_number=1, path=s3:…  │
│ 7ddbb68b-c69a-4b24…  │ The National Trans…  │ {score=0.8915606737136841, page_numbers=[1], page_number=1, path=s3:…  │
│ 7e8e91e8-cf2a-468b…  │ The National Trans…  │ {score=0.8774372339248657, page_numbers=[1], page_number=1, path=s3:…  │
│ 7ddbb68b-c69a-4b24…  │ The Nat

In [None]:
# OPTIONAL: If you would like to remove your database after running the code above
import os

# Close the connection to the database file before deleting it; removing a file that is still
# held open can fail on some platforms (e.g. Windows).
data_conn.close()
try:
    os.unlink(persistent_db)
except OSError as e:
    # Only filesystem errors (missing file, permissions, ...) are expected here; anything else should surface.
    print(f"Error deleting {persistent_db}: {e}")