In [1]:
# Put the parent directory on the import path. Presumably this is where the
# project-local `load_document` and `document_db` modules imported below
# live — TODO confirm the repository layout.
import sys
sys.path.append('../')

# Locate a .env file and load it into the process environment before any
# client objects are built. `override=True` lets values from the file replace
# variables that are already set.
from dotenv import find_dotenv, load_dotenv
load_dotenv(find_dotenv(), override=True)

# NOTE(review): newer LangChain releases expose Chroma from
# `langchain_community.vectorstores` / `langchain_chroma`; confirm this import
# path against the pinned LangChain version.
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores.chroma import Chroma

# Project-local helpers exercised by the rest of this notebook.
from load_document import load_document
from document_db import DocumentDB

In [2]:
# On-disk locations used by this notebook, relative to the notebook's folder.
DOCUMENT_DB_DIR = "../data/document_db"
CHROMA_PERSIST_DIR = "../data/document_db/state_of_the_union"

# Embedding client built with the library defaults.
embedding = OpenAIEmbeddings()

# Chroma vector store persisted under CHROMA_PERSIST_DIR and embedding with
# the client created above.
vectorstore = Chroma(
    persist_directory=CHROMA_PERSIST_DIR,
    embedding_function=embedding,
)

# DocumentDB is given its own directory plus the vector store; the retriever
# it returns is what the query cell further down calls.
db = DocumentDB(DOCUMENT_DB_DIR, vectorstore=vectorstore)
retriever = db.as_retriever()

In [3]:
# Load the speech text through the project's `load_document` helper with
# chunking switched on. The size/overlap values are passed straight through;
# their unit (characters vs. tokens) is defined by that helper — TODO confirm.
docs = load_document(
    "./files/state_of_the_union.txt",
    chunk_it=True,
    chunk_size=1000,
    chunk_overlap=100,
)

In [4]:
# Start from a clean slate. In the recorded run this call reported
# `num_deleted: 43` and zero for every other counter. Presumably it removes
# everything currently stored — verify against DocumentDB.clean.
db.clean()

{'num_added': 0, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 43}

In [5]:
# First upsert of the freshly loaded chunks. The recorded run reported
# `num_added: 43` with nothing updated, skipped or deleted.
db.upsert_documents(docs)

{'num_added': 43, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 0}

In [None]:
# Run a sample query through the retriever and display the text of the
# top-ranked hit (the bare last expression is what the cell renders).
query = "Judge Ketanji Brown Jackson"
results = retriever.invoke(query)
results[0].page_content

In [None]:
# Overwrite the text of one chunk in place so the next upsert sees a changed
# document. Note this mutates `docs`; re-run the load cell to restore it.
docs[10].page_content = "modified content"

In [None]:
# Upsert the same list again after one chunk was edited. No output is
# recorded for this cell; presumably the edited chunk is counted as changed
# and the untouched ones as skipped — TODO confirm by executing.
db.upsert_documents(docs)

In [6]:
# Delete by source name, mixing one source that was ingested with one that
# was never stored. In the recorded run this printed a single placeholder
# Document ('Deleted DO NOT USE') for the known source and returned
# `num_added: 1`, `num_deleted: 43`; the unknown name produced no visible
# entry.
db.delete_documents(
    [
        './files/state_of_the_union.txt',
        'non existing document',
    ]
)

[Document(page_content='Deleted DO NOT USE', metadata={'source': './files/state_of_the_union.txt'})]


{'num_added': 1, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 43}

In [7]:
# Pass a bare string where the previous cell passed a list. The recorded run
# printed `[]` and returned all-zero counters, i.e. nothing was added or
# deleted and no exception was shown.
db.delete_documents('this is not a list')

[]


{'num_added': 0, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 0}