In [50]:
#imports 
from langchain_community.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.chroma import Chroma
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings

import os
import shutil
import argparse

In [45]:

DATA_PATH = "data"
CHROMA_PATH = "chroma"
OPENAI_API = "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"

In [46]:
def split_text(documents : list[Document]):
    textsplitter = RecursiveCharacterTextSplitter(
        chunk_size = 1000,
        chunk_overlap = 500,
        length_function = len,
        add_start_index = True
    )   
    chunks = textsplitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(chunks)} chunks")
    document = chunks[10]
    print(document.page_content)
    print(document.metadata)
    return chunks

In [47]:

def save_to_chroma(chunks:list[Document]):
    if os.path.exists(CHROMA_PATH):
        shutil.rmtree(CHROMA_PATH)
    
    db = Chroma.from_documents(
        chunks, OpenAIEmbeddings(openai_api_key=OPENAI_API),persist_directory=CHROMA_PATH
    )
    db.persist()
    print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}")



def load_documents():
    loader = DirectoryLoader(DATA_PATH,glob="*.txt")
    documents = loader.load()
    return documents

In [48]:
def generate_data_store():
    documents = load_documents()
    chunks = split_text(documents)
    save_to_chroma(chunks)

In [49]:
generate_data_store()

Split 1 documents into 688 chunks
Mrs. Anthon was fast unwinding her philosophy of life, in the sympathetic manner of Western Americans, that takes for granted a neighbour’s interest in one’s affairs and does not comprehend reticence. Wilbur was apparently interested. But Miss Anthon, who had practised the power of watching ever for her mother’s garrulous tongue, while she attended to other matters, interfered.

“Mr. Erard will show us his den, mamma. Isn’t the apartment delightful and interesting? It’s an old swell’s house. _Louis seize_ complete, just as it was, without any change. Mr. Erard found it quite by accident, he says, one day when he was wandering about in this quarter among the convents. He came down a side lane that runs into the rue Vaugirard. Just as he was leaving it, his eye happened to fall upon that old cypress in the court. He prowled about and found this nest.”
{'source': 'data\\book.txt', 'start_index': 5736}
Saved 688 chunks to chroma


In [51]:
parser = argparse.ArgumentParser()
parser.add_argument("query_text",type=str,help="The Query Text")
args = parser.parse_args()
query_text = args.query_text

embedding_function = OpenAIEmbeddings(openai_api_key=OPENAI_API)
db = Chroma(persist_directory=CHROMA_PATH,embedding_function=embedding_function)

results = db.similarity_search_with_relevance_scores(query_text,k=3)
if len(results) == 0 or results[0][1] < 0.7:
    print(f"Unable to find matching results")

context_text = "\n\n---\n\n".join([doc.page_content for doc,_score in results])
print(context_text)

usage: ipykernel_launcher.py [-h] query_text
ipykernel_launcher.py: error: the following arguments are required: query_text


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
