In [1]:
import pyarrow.fs
from ray.data import ActorPoolStrategy
import sycamore
from sycamore.functions.tokenizer import HuggingFaceTokenizer
from sycamore.llms import OpenAIModels, OpenAI
from sycamore.transforms import COALESCE_WHITESPACE
from sycamore.transforms.merge_elements import MarkedMerger
from sycamore.transforms.partition import SycamorePartitioner
from sycamore.transforms.extract_schema import OpenAISchemaExtractor, OpenAIPropertyExtractor
from sycamore.transforms.embed import SentenceTransformerEmbedder

In [2]:
from sycamore.data.document import Document
from dateutil import parser
def convert_timestamp(doc: Document) -> Document:
    if "dateAndTime" not in doc.properties['entity'] and "dateTime" not in doc.properties['entity']:
        return doc
    raw_date: str = doc.properties['entity'].get('dateAndTime') or doc.properties['entity'].get('dateTime')
    raw_date = raw_date.replace("Local", "")
    parsed_date = parser.parse(raw_date, fuzzy=True)
    extracted_date = parsed_date.date()
    doc.properties['entity']['day'] = extracted_date.day
    doc.properties['entity']['month'] = extracted_date.month
    doc.properties['entity']['year'] = extracted_date.year
    if parsed_date.utcoffset():
        doc.properties['entity']['dateTime'] = parsed_date.isoformat()
    else:
        doc.properties['entity']['dateTime'] = parsed_date.isoformat() + "Z"
    
    return doc

In [3]:
paths = ["s3://aryn-public/ntsb/"]
fsys = pyarrow.fs.S3FileSystem(region="us-east-1", anonymous=True)

llm = OpenAI(OpenAIModels.GPT_3_5_TURBO.value)
tokenizer = HuggingFaceTokenizer("thenlper/gte-small")

ctx = sycamore.init()

ds = (
    ctx.read.binary(paths, binary_format="pdf", filesystem=fsys)
    # Parition with the sycamore partitioner, pulling out tables and images. ActorPoolStrategy(size=3) works best on my particular hardware
    # but your mileage may vary depending on your RAM.
    .partition(partitioner=SycamorePartitioner(extract_table_structure=True, extract_images=True), compute=ActorPoolStrategy(size=3))
    # Get rid of spurious whitespace charaters
    .regex_replace(COALESCE_WHITESPACE)
    # Automatically determine a schema of additional metadata to extract from documents
    .extract_batch_schema(schema_extractor=OpenAISchemaExtractor("FlightAccidentReport", llm=llm, num_of_elements=35))
    # Extract the metadata specified by that schema
    .extract_properties(property_extractor=OpenAIPropertyExtractor(llm=llm, num_of_elements=35))
    # Merge elements into larger chunks
    .mark_bbox_preset(tokenizer=tokenizer)
    .merge(merger=MarkedMerger())
    # Convert extracted timestamps to better-structured form using the function above
    .map(convert_timestamp)
    # Copy document properties to each document's sub-elements
    .spread_properties(["path", "entity"])
    # Split elements that are too big to embed
    .split_elements(tokenizer=tokenizer, max_tokens=512)
    # Convert all Elements to Documents
    .explode()
    # Generate a series of hashes to represent each document. For use with near-duplicate detection
    .sketch()
    # Embed each document
    .embed(embedder=SentenceTransformerEmbedder(model_name="sentence-transformers/all-MiniLM-L6-v2", batch_size=100))
)

In [5]:
# Write to a persistent duckdb database
# - into a specific database (as specified by url) 
# - into a specific table (as specified by table name)
persistent_db = "demo.db"
persistent_table = "demo_table"
ds.write.duckdb(
    db_url=persistent_db,
    table_name=persistent_table,
    dimensions=384
)

2024-07-03 12:18:10,034	INFO streaming_executor.py:112 -- Starting execution of Dataset. Full logs are in /tmp/ray/session_2024-07-03_11-59-37_726179_83869/logs/ray-data
2024-07-03 12:18:10,035	INFO streaming_executor.py:113 -- Execution plan of Dataset: InputDataBuffer[Input] -> ActorPoolMapOperator[ReadBinary->Map(BinaryScan._to_document)->MapBatches(BaseMapTransformCallable___wrap)] -> ActorPoolMapOperator[MapBatches(regex_replace)->MapBatches(BaseMapTransformCustom__Extract)] -> ActorPoolMapOperator[MapBatches(extract_properties)->MapBatches(sort_by_page_bbox)->MapBatches(mark_drop_tiny)->MapBatches(mark_drop_header_and_footer)->MapBatches(mark_break_page)->MapBatches(mark_break_by_column)->MapBatches(mark_break_by_tokens)->MapBatches(merge_elements)->MapBatches(convert_timestamp)->MapBatches(spread_properties)->MapBatches(split_doc)->MapBatches(explode)->MapBatches(sketcher)->MapBatches(SentenceTransformerEmbedder)->MapBatches(BaseMapTransformCallable__duckdb_write_documents)]


- ReadBinary->Map(BinaryScan._to_document)->MapBatches(BaseMapTransformCallable___wrap) 1:   0%|          | 0/…

- MapBatches(regex_replace)->MapBatches(BaseMapTransformCustom__Extract) 2:   0%|          | 0/65 [00:00<?, ?i…

- MapBatches(extract_properties)->MapBatches(sort_by_page_bbox)->MapBatches(mark_drop_tiny)->MapBatches(mark_d…

Running 0:   0%|          | 0/65 [00:00<?, ?it/s]

2024-07-03 12:19:02,925	ERROR streaming_executor_state.py:449 -- An exception was raised from a task of operator "ReadBinary->Map(BinaryScan._to_document)->MapBatches(BaseMapTransformCallable___wrap)". Dataset execution will now abort. To ignore this exception and continue, set DataContext.max_errored_blocks.


RayTaskError(UserCodeException): [36mray::ReadBinary->Map(BinaryScan._to_document)->MapBatches(BaseMapTransformCallable___wrap)()[39m (pid=84061, ip=127.0.0.1, actor_id=28a9b823448a0bf0e9635e7e01000000, repr=MapWorker(ReadBinary->Map(BinaryScan._to_document)->MapBatches(BaseMapTransformCallable___wrap)))
  File "/Users/karansampath/Desktop/Repos/sycamore/lib/sycamore/sycamore/transforms/detr_partitioner.py", line 120, in partition_pdf
    return self._partition_pdf_sequenced(
  File "/Users/karansampath/Desktop/Repos/sycamore/lib/sycamore/sycamore/transforms/detr_partitioner.py", line 196, in _partition_pdf_sequenced
    assert len(pdfminer_layout) == len(deformable_layout)
AssertionError

The above exception was the direct cause of the following exception:

[36mray::ReadBinary->Map(BinaryScan._to_document)->MapBatches(BaseMapTransformCallable___wrap)()[39m (pid=84061, ip=127.0.0.1, actor_id=28a9b823448a0bf0e9635e7e01000000, repr=MapWorker(ReadBinary->Map(BinaryScan._to_document)->MapBatches(BaseMapTransformCallable___wrap)))
  File "/Users/karansampath/Library/Caches/pypoetry/virtualenvs/sycamore-ai--zTjaFUY-py3.10/lib/python3.10/site-packages/ray/data/_internal/planner/plan_udf_map_op.py", line 110, in fn
    return ray.data._cached_fn(item, *fn_args, **fn_kwargs)
  File "/Users/karansampath/Library/Caches/pypoetry/virtualenvs/sycamore-ai--zTjaFUY-py3.10/lib/python3.10/site-packages/ray/data/_internal/execution/util.py", line 78, in __call__
    return future.result()
  File "/Users/karansampath/miniforge3/lib/python3.10/concurrent/futures/_base.py", line 458, in result
    return self.__get_result()
  File "/Users/karansampath/miniforge3/lib/python3.10/concurrent/futures/_base.py", line 403, in __get_result
    raise self._exception
  File "/Users/karansampath/miniforge3/lib/python3.10/concurrent/futures/thread.py", line 58, in run
    result = self.fn(*self.args, **self.kwargs)
  File "/Users/karansampath/Desktop/Repos/sycamore/lib/sycamore/sycamore/transforms/base.py", line 193, in ray_callable
    return BaseMapTransform._process_ray(ray_input, name, lambda d: f(d, *args, **kwargs), enable_auto_metadata)
  File "/Users/karansampath/Desktop/Repos/sycamore/lib/sycamore/sycamore/transforms/base.py", line 230, in _process_ray
    outputs = f(docs)
  File "/Users/karansampath/Desktop/Repos/sycamore/lib/sycamore/sycamore/transforms/base.py", line 193, in <lambda>
    return BaseMapTransform._process_ray(ray_input, name, lambda d: f(d, *args, **kwargs), enable_auto_metadata)
  File "/Users/karansampath/Desktop/Repos/sycamore/lib/sycamore/sycamore/transforms/map.py", line 49, in _wrap
    return [f(d, *args, **kwargs) for d in docs]
  File "/Users/karansampath/Desktop/Repos/sycamore/lib/sycamore/sycamore/transforms/map.py", line 49, in <listcomp>
    return [f(d, *args, **kwargs) for d in docs]
  File "/Users/karansampath/Desktop/Repos/sycamore/lib/sycamore/sycamore/utils/time_trace.py", line 136, in wrapper
    return f(*args, **kwargs)
  File "/Users/karansampath/Desktop/Repos/sycamore/lib/sycamore/sycamore/transforms/partition.py", line 483, in partition
    raise RuntimeError(f"SycamorePartitioner Error processing {path}") from e
RuntimeError: SycamorePartitioner Error processing s3://aryn-public/ntsb/1.pdf

The above exception was the direct cause of the following exception:

[36mray::ReadBinary->Map(BinaryScan._to_document)->MapBatches(BaseMapTransformCallable___wrap)()[39m (pid=84061, ip=127.0.0.1, actor_id=28a9b823448a0bf0e9635e7e01000000, repr=MapWorker(ReadBinary->Map(BinaryScan._to_document)->MapBatches(BaseMapTransformCallable___wrap)))
  File "/Users/karansampath/Library/Caches/pypoetry/virtualenvs/sycamore-ai--zTjaFUY-py3.10/lib/python3.10/site-packages/ray/data/_internal/execution/operators/actor_pool_map_operator.py", line 403, in submit
    yield from _map_task(
  File "/Users/karansampath/Library/Caches/pypoetry/virtualenvs/sycamore-ai--zTjaFUY-py3.10/lib/python3.10/site-packages/ray/data/_internal/execution/operators/map_operator.py", line 419, in _map_task
    for b_out in map_transformer.apply_transform(iter(blocks), ctx):
  File "/Users/karansampath/Library/Caches/pypoetry/virtualenvs/sycamore-ai--zTjaFUY-py3.10/lib/python3.10/site-packages/ray/data/_internal/execution/operators/map_transformer.py", line 392, in __call__
    for data in iter:
  File "/Users/karansampath/Library/Caches/pypoetry/virtualenvs/sycamore-ai--zTjaFUY-py3.10/lib/python3.10/site-packages/ray/data/_internal/execution/operators/map_transformer.py", line 134, in _udf_timed_iter
    output = next(input)
  File "/Users/karansampath/Library/Caches/pypoetry/virtualenvs/sycamore-ai--zTjaFUY-py3.10/lib/python3.10/site-packages/ray/data/_internal/execution/operators/map_transformer.py", line 236, in __call__
    yield from self._batch_fn(input, ctx)
  File "/Users/karansampath/Library/Caches/pypoetry/virtualenvs/sycamore-ai--zTjaFUY-py3.10/lib/python3.10/site-packages/ray/data/_internal/planner/plan_udf_map_op.py", line 208, in transform_fn
    res = fn(batch)
  File "/Users/karansampath/Library/Caches/pypoetry/virtualenvs/sycamore-ai--zTjaFUY-py3.10/lib/python3.10/site-packages/ray/data/_internal/planner/plan_udf_map_op.py", line 112, in fn
    _handle_debugger_exception(e)
  File "/Users/karansampath/Library/Caches/pypoetry/virtualenvs/sycamore-ai--zTjaFUY-py3.10/lib/python3.10/site-packages/ray/data/_internal/planner/plan_udf_map_op.py", line 143, in _handle_debugger_exception
    raise UserCodeException() from e
ray.exceptions.UserCodeException

In [5]:
# We connect to the DuckDB to perform operations
import duckdb
data_conn = duckdb.connect("demo.db")

In [38]:
# DuckDB is not production ready yet for efficient Vector Similarity Search on persistent databases (https://duckdb.org/docs/extensions/vss.html), 
# so we load our data into an in-memory database for this demo. In future versions, once DuckDB productionizes this feature, there will be no more need for this code cell

# Load from disk into Pandas Dataframe as an intermediate step
df = data_conn.execute("SELECT * FROM demo_table").fetchdf()
df.dropna(subset=["embeddings"],inplace=True)

# Now, we load the data into an in-memory database. Notice that we specify beforehand since we need the embedding column (must be specified as a FLOAT[N] where N is specified) 
# for Vector Similarity Search using HNSW to work
schema = {
            "doc_id": "VARCHAR",
            "embeddings": "FLOAT[384]",
            "properties": "MAP(VARCHAR, VARCHAR)",
            "text_representation": "VARCHAR",
            "bbox": "DOUBLE[]",
            "shingles": "BIGINT[]",
            "type": "VARCHAR",
        }
in_memory_db = duckdb.connect(":default:")
# in_memory_db.install_extension("vss")
# in_memory_db.load_extension("vss")
in_memory_db.execute(f"""CREATE TABLE in_memory_table (doc_id {schema.get('doc_id')},
                      embeddings {schema.get('embeddings')}, properties {schema.get('properties')}, 
                      text_representation {schema.get('text_representation')}, bbox {schema.get('bbox')}, 
                      shingles {schema.get('shingles')}, type {schema.get('type')})"""
    )
in_memory_db.execute("""INSERT INTO in_memory_table SELECT * FROM df; 
                      CREATE INDEX in_memory_table_index ON in_memory_table USING HNSW(embeddings)""")

ConnectionException: Connection Error: Connection has already been closed

In [18]:
# For queries, let's define an embedding function for the question that helps us easily compare the two vectors and generate optimal results // Run NN search
from sentence_transformers import SentenceTransformer
minilm = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

class Embedder():
    def __init__(self, llm):
        self.llm = llm
    
    def embed_query(self, question):
        v = self.llm.encode(question).tolist()
        return v

embedder = Embedder(minilm)



In [9]:
# Let us now see the performance of the ANN HNSW search using DuckDB // array_distance is the nn lookup
q = "What automobile type is the most accident prone?"
in_memory_db.sql(f"SELECT doc_id, text_representation, properties FROM in_memory_table ORDER BY array_distance(embeddings, {embedder.embed_query(q)}::FLOAT[384]) LIMIT 10;")
# Pretty print tables
# text representation --> print one cell out (of the top result). 

┌──────────────────────┬──────────────────────┬────────────────────────────────────────────────────────────────────────┐
│        doc_id        │ text_representation  │                               properties                               │
│       varchar        │       varchar        │                         map(varchar, varchar)                          │
├──────────────────────┼──────────────────────┼────────────────────────────────────────────────────────────────────────┤
│ 73f87ba9-ff4e-4d75…  │ The National Trans…  │ {score=0.8915606737136841, page_numbers=[1], page_number=1, path=s3:…  │
│ 06b31736-b0ed-4c57…  │ The National Trans…  │ {score=0.8774372339248657, page_numbers=[1], page_number=1, path=s3:…  │
│ d701d8a7-b5fa-4a8f…  │ The National Trans…  │ {score=0.7743222713470459, page_numbers=[1], page_number=1, path=s3:…  │
│ 36e85e87-85b7-4261…  │ The National Trans…  │ {score=0.7248827219009399, page_numbers=[1], page_number=1, path=s3:…  │
│ 0918cf95-4cbb-452c…  │ The Nat

In [66]:
# Let us now see the performance of the ANN HNSW search using DuckDB 
q = "Traffic Statistics"
in_memory_db.sql(f"SELECT doc_id, text_representation, properties FROM in_memory_table ORDER BY array_distance(embeddings, {embedder.embed_query(q)}::FLOAT[384]) LIMIT 10;")

┌──────────────────────┬──────────────────────┬────────────────────────────────────────────────────────────────────────┐
│        doc_id        │ text_representation  │                               properties                               │
│       varchar        │       varchar        │                         map(varchar, varchar)                          │
├──────────────────────┼──────────────────────┼────────────────────────────────────────────────────────────────────────┤
│ 3c82fabc-9f99-4dfe…  │ Airport Informatio…  │ {score=0.7310920357704163, page_numbers=[5], page_number=5, path=s3:…  │
│ 8f3d0214-9de3-48c3…  │ Airport Informatio…  │ {score=0.702387809753418, page_numbers=[5], page_number=5, path=s3:/…  │
│ 1c6083df-00a8-484e…  │ Airport Informatio…  │ {score=0.6993858218193054, page_numbers=[5], page_number=5, path=s3:…  │
│ b10638a0-2ea2-4d23…  │ Airport Informatio…  │ {score=0.7263772487640381, page_numbers=[5], page_number=5, path=s3:…  │
│ c0594fed-b67a-44b9…  │ Airport

In [67]:
# We can also see the performance of the DuckDB query and check the index is being used by using the in-built visualization below 
q = "What automobile type is the most accident prone?"
in_memory_db.sql(f"EXPLAIN SELECT doc_id, text_representation, properties FROM in_memory_table ORDER BY array_distance(embeddings, {embedder.embed_query(q)}::FLOAT[384]) LIMIT 10")

┌───────────────┬──────────────────────────────────────────────────────────────────────────────────────────────────────┐
│  explain_key  │                                            explain_value                                             │
│    varchar    │                                               varchar                                                │
├───────────────┼──────────────────────────────────────────────────────────────────────────────────────────────────────┤
│ physical_plan │ ┌───────────────────────────┐\n│         PROJECTION        │\n│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │\n│    …  │
└───────────────┴──────────────────────────────────────────────────────────────────────────────────────────────────────┘

In [10]:
# Finally we can build an index with other distance metrics, the default is Euclidean Distance ('l2sq'), with Cosine similarity ('cosine', 'array_cosine_simarlity') 
# and Inner Product ('ip', 'array_inner_product') also supported
in_memory_db.execute("CREATE INDEX in_memory_ip_index ON in_memory_table USING HNSW(embeddings) WITH (metric = 'ip')")
in_memory_db.sql(f"SELECT doc_id, text_representation, properties FROM in_memory_table ORDER BY array_inner_product(embeddings, {embedder.embed_query(q)}::FLOAT[384]) LIMIT 10;")

┌──────────────────────┬──────────────────────┬────────────────────────────────────────────────────────────────────────┐
│        doc_id        │ text_representation  │                               properties                               │
│       varchar        │       varchar        │                         map(varchar, varchar)                          │
├──────────────────────┼──────────────────────┼────────────────────────────────────────────────────────────────────────┤
│ 73f87ba9-ff4e-4d75…  │ The National Trans…  │ {score=0.8915606737136841, page_numbers=[1], page_number=1, path=s3:…  │
│ 06b31736-b0ed-4c57…  │ The National Trans…  │ {score=0.8774372339248657, page_numbers=[1], page_number=1, path=s3:…  │
│ d701d8a7-b5fa-4a8f…  │ The National Trans…  │ {score=0.7743222713470459, page_numbers=[1], page_number=1, path=s3:…  │
│ 36e85e87-85b7-4261…  │ The National Trans…  │ {score=0.7248827219009399, page_numbers=[1], page_number=1, path=s3:…  │
│ 0918cf95-4cbb-452c…  │ The Nat

In [27]:
in_memory_db.sql(f"ALTER TABLE in_memory_table ADD COLUMN metadata VARCHAR[]")

CatalogException: Catalog Error: Column with name metadata already exists!

In [28]:
# conn = duckdb.connect(database=':memory:',
#     config={
#         # Sample configuration to restrict some DuckDB capabilities
#         # List is not exhaustive. Please review DuckDB documentation.
#             "enable_external_access": "false",
#             "autoinstall_known_extensions": "false",
#             "autoload_known_extensions": "false"
#         }
# )
from langchain_community.vectorstores.duckdb import DuckDB
from langchain_openai import OpenAIEmbeddings
vector_store = DuckDB(connection=in_memory_db, embedding=embedder, id_key= "doc_id", text_key="text_representation", vector_key="embeddings", table_name="in_memory_table")
# vector_store.add_texts(['text1', 'text2'])
result = vector_store.similarity_search('traffic')

TypeError: the JSON object must be str, bytes or bytearray, not float

In [39]:
from langchain_openai import ChatOpenAI  
from langchain.chains import RetrievalQAWithSourcesChain  
import os
# completion llm  
llm = ChatOpenAI(  
    openai_api_key=os.environ.get("OPENAI_API_KEY"),  
    model_name='gpt-3.5-turbo',  
    temperature=0.0  
)  
qa = RetrievalQAWithSourcesChain.from_chain_type(  
    llm=llm,  
    chain_type="stuff",  
    retriever=vector_store.as_retriever()  
)  
qa.run(q)

  warn_deprecated(


ValueError: `run` not supported when there is not exactly one output key. Got ['answer', 'sources'].

In [None]:
# OPTIONAL: If you would like to remove your database after running the code above
try:
    os.unlink(persistent_db)
except Exception as e:
    print(f"Error deleting {persistent_db}: {e}")