## MultiVectorDocumentDB


In [None]:
import os
import sys
# Add the parent directory (relative to the working directory) to the import
# path so the project modules imported below can be found.
sys.path.append('../')

from dotenv import find_dotenv, load_dotenv
# Load variables from the .env file located by find_dotenv(); override=True
# lets them replace values already set in the environment.
load_dotenv(find_dotenv(), override=True)

from load_document import load_document
from multi_embedding_document_db import MultiEmbeddingDocumentDB

#### Load files and split them into chunks. These chunks will be our parent documents.

In [None]:
# Load the text file and split it into chunks (chunk_size=4000); each chunk is
# used below as one parent document.
docs = load_document("./files/state_of_the_union.txt", chunk_it=True, chunk_size=4000)

In [None]:
# Number of chunks (parent documents) returned by the loader.
len(docs)

In [None]:
# Length of the first parent document's text.
len(docs[0].page_content)

In [None]:
from langchain.docstore.document import Document

# Snapshot the loaded documents: build new Document objects with the same text
# and a shallow copy of each metadata dict, so adding or removing metadata keys
# on `docs` later does not affect `docs2`.
docs2 = [Document(doc.page_content, metadata=doc.metadata.copy()) for doc in docs]

# Display whether the copies compare equal to the originals
# (expected True — TODO confirm Document compares by value).
docs == docs2

#### Initialize the document database

In [None]:
# Absolute path of the folder handed to the database
# (presumably where it persists its data — confirm against MultiEmbeddingDocumentDB).
data_folder = os.path.abspath("../data/multi_embedding_document_db")

# NOTE(review): each `functor` entry is either a name or a (name, kwargs) pair;
# presumably each one selects a way of deriving child documents from a parent
# (smaller chunks, a summary, generated questions) — confirm against the class.
db = MultiEmbeddingDocumentDB(
        data_folder,
        functor=[("chunk", {"chunk_size":400}), "summary", ("question", {"q":2})]
    )

**`upsert_documents`** inserts documents into the database, skipping documents that already exist and deleting outdated versions.

In [None]:
# Insert the parent documents into the database.
db.upsert_documents(docs)

The parent documents are stored in the `docstore` associated with the `vectorstore` used by the database. The generator **`yield_keys`** returns the ids of the parent documents in the docstore.

In [None]:
# Materialize the generator of parent-document ids from the docstore.
ids = list(db.vectorstore.docstore.yield_keys())

# Number of parent documents stored.
len(ids)

And the method **`get_by_ids`** returns the list of parent documents associated with a list of ids

In [None]:
# Fetch the parent document for the first id; get_by_ids is given a list of
# ids and the result is indexed as a list below.
doc = db.vectorstore.get_by_ids([ids[0]])

# Length of that parent document's text.
len(doc[0].page_content)

In [None]:
# Preview the first 200 characters of the parent document.
print(doc[0].page_content[:200])

The method **`get_child_ids`** returns the ids of the child documents of a parent document.

In [None]:
# Ids of the child documents derived from the first parent document.
child_ids = db.vectorstore.get_child_ids(ids[0])

# Number of child documents for that parent.
len(child_ids)

In [None]:
# Fetch up to 100 child documents of the first parent by searching with an
# empty query and a metadata filter on the parent id.
# NOTE(review): presumably child documents store the parent id under the
# "id" metadata key — confirm against the vectorstore implementation.
child_docs = db.vectorstore.similarity_search("", k=100, filter={"id": ids[0]})

# Number of child documents returned.
len(child_docs)

In [None]:
# Length of the first child document's text.
len(child_docs[0].page_content)

In [None]:
# Preview the first 200 characters of the first child document.
print(child_docs[0].page_content[:200])

Note that upserting updated documents only inserts the modified content and deletes the outdated content

In [None]:
# Check that the upsert left the documents unchanged apart from an "id"
# metadata entry: the text must match the snapshot in `docs2`, and so must the
# metadata once "id" has been removed.
for doc1, doc2 in zip(docs, docs2):
    assert doc1.page_content == doc2.page_content
    # Pop with a default so that re-running this cell (when "id" has already
    # been removed) does not raise KeyError.
    doc1.metadata.pop("id", None)
    assert doc1.metadata == doc2.metadata

In [None]:
# Simulate an updated document by upper-casing the first document's text.
docs[0].page_content = docs[0].page_content.upper()

In [None]:
# Upsert again, now that the first document's content has changed.
db.upsert_documents(docs)

In [None]:
# Inspect the first document's metadata after the second upsert.
docs[0].metadata

#### Retrieval

**`as_retriever`** returns a retriever that can be used to query the database for documents

In [None]:
# Query the vector store directly, requesting the 5 most similar entries
# (child documents, per the variable name — not the parent documents).
child_docs = db.vectorstore.similarity_search("justice breyer", k=5)

In [None]:
# Number of child documents returned by the search.
len(child_docs)

In [None]:
# Length of the top-ranked child document's text.
len(child_docs[0].page_content)

In [None]:
# Preview the first 250 characters of the top-ranked child document.
print(child_docs[0].page_content[:250])

In [None]:
# k is the number of child docs to retrieve; those child docs are then used to
# identify the parent docs to return.
retriever = db.as_retriever(k=5)

In [None]:
# Run the same query through the retriever.
related_docs = retriever.invoke("justice breyer")

In [None]:
# Number of documents returned by the retriever.
len(related_docs)

In [None]:
# Length of the first returned document's text.
len(related_docs[0].page_content)

In [None]:
# Print characters 2250-2749 of the first returned document.
print(related_docs[0].page_content[2250:2750])

In [None]:
# Clean up at the end of the demo by deleting the database index
# (presumably removes what was stored under data_folder — TODO confirm).
db.delete_index()