The Aryn Partitioner in this job is configured to use the Aryn Partitioning Service to provide fast, GPU-powered performance. Go to [aryn.ai/sign-up](https://aryn.ai/sign-up) to get a free API key for the service. This is the recommended configuration.

You can also run the Aryn Partitioner locally by setting `use_partitioning_service` to `False`. Though you can use CPU to run the Aryn Partitioner, it is recommended to use an NVIDIA GPU for good performance.

In [None]:
import pyarrow.fs
import sycamore
import json
from sycamore.functions.tokenizer import HuggingFaceTokenizer
from sycamore.llms import OpenAIModels, OpenAI
from sycamore.transforms import COALESCE_WHITESPACE
from sycamore.transforms.merge_elements import MarkedMerger
from sycamore.transforms.partition import ArynPartitioner
from sycamore.transforms.extract_schema import OpenAISchemaExtractor, OpenAIPropertyExtractor
from sycamore.transforms.embed import SentenceTransformerEmbedder

In [None]:
# Define a function used for cleaning and formatting later on in the notebook

from sycamore.data.document import Document
from dateutil import parser
def convert_timestamp(doc: Document) -> Document:
    """Normalize the extracted timestamp stored in ``doc.properties['entity']``.

    The raw value is read from the ``dateAndTime`` key, falling back to
    ``dateTime``. The literal text ``"Local"`` is stripped, the remainder is
    parsed fuzzily, and the entity mapping is updated in place with ``day``,
    ``month``, ``year`` and an ISO 8601 ``dateTime`` string.

    Args:
        doc: Document whose ``properties['entity']`` mapping may hold a raw
            timestamp string.

    Returns:
        The same document. It is returned unchanged when neither key is
        present or the stored value is empty/None.

    Raises:
        ValueError: propagated from ``dateutil.parser.parse`` when the raw
            value cannot be interpreted as a date.
    """
    entity = doc.properties['entity']
    raw_date = entity.get('dateAndTime') or entity.get('dateTime')
    # Covers both "key missing" and "key present but empty/None"; the latter
    # used to fail on None.replace(...) or on parsing an empty string.
    if not raw_date:
        return doc

    # Drop the "Local" marker before handing the text to the fuzzy parser.
    parsed_date = parser.parse(raw_date.replace("Local", ""), fuzzy=True)

    entity['day'] = parsed_date.day
    entity['month'] = parsed_date.month
    entity['year'] = parsed_date.year

    iso_timestamp = parsed_date.isoformat()
    # Compare against None rather than relying on truthiness: a zero offset
    # (UTC) is a falsy timedelta, and isoformat() already renders it as
    # "+00:00", so appending "Z" there would yield an invalid timestamp.
    if parsed_date.utcoffset() is None:
        # Naive timestamps are labelled as UTC, matching the previous behavior.
        iso_timestamp += "Z"
    entity['dateTime'] = iso_timestamp

    return doc

In [None]:
from sycamore.utils.aryn_config import ArynConfig, _DEFAULT_PATH
# Stop early if no Aryn API key is configured; the message reports the config path that was searched.
aryn_api_key_found = ArynConfig.get_aryn_api_key() != ""
assert aryn_api_key_found, f"Unable to find aryn API key.  Looked in {_DEFAULT_PATH}"

If the above assertion fails, you can either set the environment variable `ARYN_API_KEY` and restart Jupyter,
or create a YAML file at the path shown in the assertion error that looks like:

```
aryn_token: "YOUR-ARYN-API-KEY"
```

It is unsafe, but if neither of those options works, you can set the key directly in this notebook with
```
import os
os.environ["ARYN_API_KEY"] = "UNSAFE-ARYN-API-KEY-LOCATION" 
```

but beware that it is easy to accidentally commit the notebook file and have it include your key.

In [None]:
# Input location: PDFs under the public aryn-public/ntsb S3 prefix, accessed anonymously
paths = ["s3://aryn-public/ntsb/"]
fsys = pyarrow.fs.S3FileSystem(region="us-east-1", anonymous=True)

# LLM passed to the schema and property extractors below
llm = OpenAI(OpenAIModels.GPT_3_5_TURBO.value)
# Tokenizer passed to the bbox-marking and element-splitting steps below
tokenizer = HuggingFaceTokenizer("thenlper/gte-small")

ctx = sycamore.init()

# Build the ingestion pipeline; the resulting DocSet is written to DuckDB in the next cell
ds = (
    ctx.read.binary(paths, binary_format="pdf", filesystem=fsys)
    # Partition with the Aryn partitioner remotely, pulling out tables and images.
    .partition(partitioner=ArynPartitioner(extract_images=True,  extract_table_structure=True))
    # Get rid of spurious whitespace characters
    .regex_replace(COALESCE_WHITESPACE)
    # Automatically determine a schema of additional metadata to extract from Documents
    .extract_batch_schema(schema_extractor=OpenAISchemaExtractor("FlightAccidentReport", llm=llm, num_of_elements=35))
    # Extract the metadata specified by that schema
    .extract_properties(property_extractor=OpenAIPropertyExtractor(llm=llm, num_of_elements=35))
    # Merge elements into larger chunks
    .mark_bbox_preset(tokenizer=tokenizer)
    .merge(merger=MarkedMerger())
    # Convert extracted timestamps to better-structured form using the function above
    .map(convert_timestamp)
    # Copy document properties to each Document's sub-elements
    .spread_properties(["path", "entity"])
    # Split elements that are too big to embed
    .split_elements(tokenizer=tokenizer, max_tokens=512)
    # Convert all Elements to Documents
    .explode()
    # Generate a series of hashes to represent each Document. For use with near-duplicate detection
    .sketch()
    # Embed each Document; the same model is loaded again later in the notebook to embed queries
    .embed(embedder=SentenceTransformerEmbedder(model_name="sentence-transformers/all-MiniLM-L6-v2", batch_size=100))
)

In [None]:
# Persist the processed DocSet to DuckDB:
# - db_url selects the database file
# - table_name selects the table inside that database
# - dimensions is the embedding width; the in-memory table created later uses FLOAT[384] to match
persistent_db = "demo.db"
persistent_table = "demo_table"
ds.write.duckdb(db_url=persistent_db, table_name=persistent_table, dimensions=384)

In [None]:
# We connect to the persistent DuckDB file ("demo.db", the same path as persistent_db above)
# so the stored rows can be read back in the next cell
import duckdb
data_conn = duckdb.connect("demo.db")

In [None]:
# DuckDB's efficient Vector Similarity Search on persistent databases is still an experimental feature (https://duckdb.org/docs/extensions/vss.html),
# so we load our data into an in-memory database. Once this feature is more stable, we can query the persistent database directly using VSS.

# Load from disk into a Pandas DataFrame as an intermediate step, dropping rows that have no embedding
df = data_conn.execute("SELECT * FROM demo_table").fetchdf()
df.dropna(subset=["embeddings"], inplace=True)

# Now, we load the data into an in-memory database, to help with faster computation
in_memory_db = duckdb.connect(":default:")
in_memory_db.install_extension("vss")
in_memory_db.load_extension("vss")

# To allow RAG to work properly on this database using Langchain (shown later), we must rename the properties column to metadata,
# add a 'source' key within it, and convert its datatype to string. Note that the value 'default' can be modified to other options if needed
new_key = 'source'
new_value = 'default'
# Each properties cell is read as parallel 'key' and 'value' sequences and rebuilt into a plain dict
df['properties'] = df['properties'].apply(lambda x: dict(zip(x['key'], x['value'])))
# Add the 'source' entry and serialize the dict to a JSON string in one pass
df['properties'] = df['properties'].apply(lambda x: json.dumps({**x, new_key: new_value}))
df = df.rename(columns={'properties': 'metadata'})

# Notice that we specify the schema beforehand, since the 'embeddings' column must be declared as FLOAT[N]
# (with N given explicitly) for Vector Similarity Search in DuckDB to work.
schema = {
    "doc_id": "VARCHAR",
    "embeddings": "FLOAT[384]",
    "metadata": "VARCHAR",
    "text_representation": "VARCHAR",
    "bbox": "DOUBLE[]",
    "shingles": "BIGINT[]",
    "type": "VARCHAR",
}

# Derive both the column definitions and the insert column list from `schema`, so the table
# definition and the data load cannot drift apart.
column_defs = ", ".join(f"{name} {sql_type}" for name, sql_type in schema.items())
column_names = ", ".join(schema)

in_memory_db.execute(f"CREATE TABLE in_memory_table ({column_defs})")
# Select columns by name instead of `SELECT *`, so the load does not depend on the DataFrame's column order
in_memory_db.execute(f"INSERT INTO in_memory_table ({column_names}) SELECT {column_names} FROM df")
in_memory_db.execute("CREATE INDEX in_memory_table_index ON in_memory_table USING HNSW(embeddings)")

In [None]:
# For queries, we embed the question with the same model used in the pipeline's embed step
# (sentence-transformers/all-MiniLM-L6-v2), so that the question vector and the stored document
# vectors can be compared in the nearest-neighbor search
from sentence_transformers import SentenceTransformer
minilm = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

class Embedder:
    """Thin adapter that exposes a sentence-embedding model through ``embed_query``."""

    def __init__(self, llm):
        # Keep a reference to the wrapped model; only its encode() method is used.
        self.llm = llm

    def embed_query(self, question):
        """Encode ``question`` with the wrapped model and return the vector as a plain list."""
        return self.llm.encode(question).tolist()

# Single embedder instance reused by the SQL query cells and by the LangChain vector store below
embedder = Embedder(minilm)

In [None]:
# Retrieve the 10 nearest chunks with DuckDB's ANN HNSW semantic search, ranking by
# Array Euclidean Distance to the embedded question, then pretty print the rows
from tabulate import tabulate

q = "What automobile type is the most accident prone?"
question_vector = embedder.embed_query(q)
result = in_memory_db.sql(
    "SELECT doc_id, text_representation, metadata FROM in_memory_table "
    f"ORDER BY array_distance(embeddings, {question_vector}::FLOAT[384]) LIMIT 10;"
)

# Materialize the result rows into a Python list
table_data = result.fetchall()

# Column names are the first field of each description entry
headers = [column[0] for column in result.description]

# Render rows and headers as a grid
print(tabulate(table_data, headers=headers, tablefmt="grid"))

In [None]:
# View the first two rows of the text output to better understand similarity search results.
# table_data holds only the fetched data rows (column names are kept separately in `headers`),
# so the first two results sit at indices 0 and 1.

first_two = table_data[:2]

print("Text representation of the first two values (not including headers):")
for i, row in enumerate(first_two, 1):
    # row[0] is doc_id and row[1] is text_representation, following the SELECT column order
    print(f"\n{i}. doc_id: {row[0]}")
    print(f"   Text representation:\n   {row[1]}")
    print("-" * 50)

In [None]:
# Wrap the in-memory table in a vector store using the LangChain DuckDB integration,
# then run a similarity search through that integration
from langchain_community.vectorstores.duckdb import DuckDB

vector_store = DuckDB(
    connection=in_memory_db,
    embedding=embedder,
    id_key="doc_id",
    text_key="text_representation",
    vector_key="embeddings",
    table_name="in_memory_table",
)
result = vector_store.similarity_search('traffic')
print(result)

In [None]:
from langchain_openai import ChatOpenAI  
from langchain.chains import RetrievalQAWithSourcesChain  
import os
# Finally, we initialize a RAG agent backed by the vector store and ask the model a question about the data.
# Note that this rebinds `llm`, which earlier in the notebook held the Sycamore OpenAI client.
llm = ChatOpenAI(
    openai_api_key=os.environ.get("OPENAI_API_KEY"),
    model_name='gpt-3.5-turbo',
    temperature=0.8,
)
qa = RetrievalQAWithSourcesChain.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vector_store.as_retriever(),
    verbose=True,
)
qa.invoke({"question": "How many accidents happened?"})

In [None]:
# Ask DuckDB to EXPLAIN the same ANN query, to inspect its plan and check that the index is being used
q = "What automobile type is the most accident prone?"
question_vector = embedder.embed_query(q)
explain_result = in_memory_db.sql(
    "EXPLAIN SELECT doc_id, text_representation, metadata FROM in_memory_table "
    f"ORDER BY array_distance(embeddings, {question_vector}::FLOAT[384]) LIMIT 10"
)

# Materialize the plan rows into a Python list
explain_data = explain_result.fetchall()

# Column names are the first field of each description entry
headers = [column[0] for column in explain_result.description]

# Pretty print the EXPLAIN output
print("Expanded EXPLAIN output:")
print(tabulate(explain_data, headers=headers, tablefmt="grid"))

In [None]:
# To experiment with other distance metrics, one can set the 'metric' value as done below
# The default is Euclidean Distance ('l2sq'), with Cosine similarity ('cosine', 'array_cosine_similarity')
# and Inner Product ('ip', 'array_inner_product') also supported
# This cell reuses the question `q` defined in an earlier cell.
# NOTE(review): ORDER BY array_inner_product(...) sorts ascending, i.e. smallest inner product first; newer
# DuckDB VSS documentation pairs the 'ip' metric with array_negative_inner_product. Confirm which function
# the installed DuckDB version expects for this index to be used.
in_memory_db.execute("CREATE INDEX in_memory_ip_index ON in_memory_table USING HNSW(embeddings) WITH (metric = 'ip')")
in_memory_db.sql(f"SELECT doc_id, text_representation, metadata FROM in_memory_table ORDER BY array_inner_product(embeddings, {embedder.embed_query(q)}::FLOAT[384]) LIMIT 10;")

In [None]:
# OPTIONAL: If you would like to remove your database after running the code above, please run this code cell
# `os` is imported here so this cell works even if the RAG cell (which also imports it) was skipped.
import os

# NOTE(review): a connection to this file (data_conn) may still be open; on some platforms the
# file cannot be removed until that connection is closed. Confirm for your environment.
try:
    os.unlink(persistent_db)
except OSError as e:
    # Only filesystem failures (missing file, permissions, file in use) are reported here;
    # anything else indicates a notebook-state problem and should surface as-is.
    print(f"Error deleting {persistent_db}: {e}")