In [1]:
import sys
# Add the parent directory to the module search path. Presumably this is what
# makes the `load_document` and `document_db` imports below resolvable — confirm.
sys.path.append('../')

from dotenv import find_dotenv, load_dotenv
# Load variables from the .env file located by find_dotenv(); override=True lets
# those values replace variables that are already set in the environment.
load_dotenv(find_dotenv(), override=True)

from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores.chroma import Chroma

# NOTE(review): these look like project-local modules reached through the
# sys.path entry added above — confirm their location.
from load_document import load_document
from document_db import DocumentDB

In [11]:
# Embedding function handed to the Chroma store below.
embedding = OpenAIEmbeddings()

# Chroma store configured with "document_db" as its persist directory.
vectorstore = Chroma(embedding_function=embedding, persist_directory="document_db")

# DocumentDB receives the same "document_db" string plus the store itself.
# NOTE(review): the meaning of the first positional argument is not visible
# here (directory vs. namespace) — confirm in document_db.py.
db = DocumentDB("document_db", vectorstore=vectorstore)

# Retriever used by the query cell at the end of the notebook.
retriever = db.as_retriever()

In [3]:
# Load both source files with identical chunking arguments:
# docs1 <- State of the Union text, docs2 <- US Constitution PDF.
docs1, docs2 = (
    load_document(path, chunk_it=True, chunk_size=1000, chunk_overlap=0)
    for path in (
        "./files/state_of_the_union.txt",
        "./files/us_constitution.pdf",
    )
)

In [4]:
# Run DocumentDB.clean() before indexing anything. The call returns a dict of
# counters (num_added / num_updated / num_skipped / num_deleted).
# NOTE(review): presumably this purges previously indexed entries — confirm
# against document_db.py.
db.clean()

{'num_added': 0, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 0}

In [5]:
# Index the chunks loaded from state_of_the_union.txt; the returned dict
# reports how many records were added, updated, skipped and deleted.
db.upsert_documents(docs1)

{'num_added': 42, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 0}

In [6]:
# Index the chunks loaded from us_constitution.pdf; same counter dict as the
# previous upsert.
db.upsert_documents(docs2)

{'num_added': 68, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 0}

In [7]:
# Display one chunk (index 10) of the State of the Union documents; this is the
# chunk whose content is overwritten in the following cell.
docs1[10]

Document(page_content='Vice President Harris and I ran for office with a new economic vision for America. \n\nInvest in America. Educate Americans. Grow the workforce. Build the economy from the bottom up  \nand the middle out, not from the top down.  \n\nBecause we know that when the middle class grows, the poor have a ladder up and the wealthy do very well. \n\nAmerica used to have the best roads, bridges, and airports on Earth. \n\nNow our infrastructure is ranked 13th in the world. \n\nWe won’t be able to compete for the jobs of the 21st Century if we don’t fix that. \n\nThat’s why it was so important to pass the Bipartisan Infrastructure Law—the most sweeping investment to rebuild America in history. \n\nThis was a bipartisan effort, and I want to thank the members of both parties who worked to make it happen. \n\nWe’re done talking about infrastructure weeks. \n\nWe’re going to have an infrastructure decade.', metadata={'source': './files/state_of_the_union.txt'})

In [8]:
# Overwrite the text of chunk 10 so the next upsert sees a changed document.
# NOTE: this mutates docs1 in place; re-run the load cell to restore the
# original chunk before repeating the demo.
docs1[10].page_content = "modified content"

In [9]:
# Upsert the same list again after editing one chunk. Per the recorded output,
# the 41 untouched chunks are skipped and the edited chunk shows up as one add
# plus one delete (num_updated stays 0).
db.upsert_documents(docs1)

{'num_added': 1, 'num_updated': 0, 'num_skipped': 41, 'num_deleted': 1}

In [10]:
# Delete by source path; the string matches the 'source' metadata value of the
# State of the Union chunks, and the recorded output reports 42 deletions.
# NOTE(review): the recorded result also shows num_added: 1 for a delete call,
# which looks like a counter carried over from the previous upsert — confirm
# in DocumentDB.
db.delete_documents(['./files/state_of_the_union.txt'])

{'num_added': 1, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 42}

In [13]:
# Query the store after the State of the Union file was deleted; the results
# visible in the recorded output come from us_constitution.pdf.
# NOTE(review): if `retriever` is a LangChain BaseRetriever,
# get_relevant_documents() is deprecated in favor of invoke() — confirm the
# type returned by DocumentDB.as_retriever() before switching.
retriever.get_relevant_documents("state of the union")

[Document(page_content='government\nof\nthe\nUnited\nStates,\ndirected\nto\nthe\nPresident\nof\nthe\nSenate;\n--\nThe\nPresident\nof\nthe\nSenate\nshall,\nin\nthe\npresence\nof\nthe\nSenate\nand\nHouse', metadata={'page': 25, 'source': './files/us_constitution.pdf'}),
 Document(page_content='States,\nshall\nbe\nsuf ficient\nfor\nthe\nEstablishment\nof\nthis\nConstitution\nbetween\nthe\nStates\nso\nratifying\nthe\nSame.', metadata={'page': 21, 'source': './files/us_constitution.pdf'}),
 Document(page_content='latter\nwritten\ndeclaration,\nor ,\nif\nCongress\nis\nnot\nin\nsession,\nwithin\ntwenty-one\ndays\nafter\nCongress\nis\nrequired\nto\nassemble,\ndetermines\nby\ntwo-thirds\nvote\nof\nboth\nHouses\nthat\nthe\nPresident\nis\nunable\nto\ndischarge\nthe\npowers\nand\nduties\nof\nhis\nof fice,\nthe\nV ice\nPresident\nshall\ncontinue\nto\ndischarge\nthe\nsame\nas\nActing\nPresident;\notherwise,\nthe\nPresident\nshall\nresume\nthe\npowers\nand\nduties\nof\nhis\nof fice.', metadata={'page