In [1]:
import os 
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_chroma import Chroma
from langchain_ollama import ChatOllama
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.schema import Document

In [2]:
current_dir = os.path.dirname(os.path.abspath("__file__"))
file_path = os.path.join(current_dir, "documents", "lord_of_the_rings.txt")
persistent_directory = os.path.join(current_dir, "db", "chroma_db")

In [3]:
if not os.path.exists(persistent_directory):
    print("Persistent directory does not exist. Initializing vector store...")

    if not os.path.exists(file_path):
        raise FileNotFoundError(
            f"The file {file_path} does not exist. Please check the path."
        )
    
    loader = TextLoader(file_path)
    documents = loader.load()

    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=50)
    docs = text_splitter.split_documents(documents)

    print("\n ---- Document Chunks Information ----")
    print(f"Number of documents chunks: {len(docs)}")
    print(f"Sample chunk: \n {docs[0].page_content}\n")
    
    # create embeddings
    model_name = "sentence-transformers/all-mpnet-base-v2"
    model_kwargs = {'device': 'cpu'}
    encode_kwargs = {'normalize_embeddings': False}
    hf_embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

    # Create the vector store and persist it automatically

else:
    print("Vector store already initialized")

Created a chunk of size 1619, which is longer than the specified 1000
Created a chunk of size 1315, which is longer than the specified 1000
Created a chunk of size 1058, which is longer than the specified 1000
Created a chunk of size 1343, which is longer than the specified 1000
Created a chunk of size 1329, which is longer than the specified 1000
Created a chunk of size 1991, which is longer than the specified 1000
Created a chunk of size 1414, which is longer than the specified 1000
Created a chunk of size 1103, which is longer than the specified 1000
Created a chunk of size 1198, which is longer than the specified 1000
Created a chunk of size 1232, which is longer than the specified 1000
Created a chunk of size 1195, which is longer than the specified 1000
Created a chunk of size 1045, which is longer than the specified 1000
Created a chunk of size 1503, which is longer than the specified 1000
Created a chunk of size 1349, which is longer than the specified 1000
Created a chunk of s

Persistent directory does not exist. Initializing vector store...

 ---- Document Chunks Information ----
Number of documents chunks: 43
Sample chunk: 
 This book is largely concerned with Hobbits, and from its pages a
reader may discover much of their character and a little of their
history. Further information will also be found in the selection from
the Red Book of Westmarch that has already been published, under
the title of The Hobbit. That story was derived from the earlier chapters of the Red Book, composed by Bilbo himself, the first Hobbit
to become famous in the world at large, and called by him There and
Back Again, since they told of his journey into the East and his return:
an adventure which later involved all the Hobbits in the great events
of that Age that are here related.

The story of the Ring is, of course, known to many, but what most do not realize is that the power of the Ring is subtle and its evil grows with time. That is why the burden falls upon Frodo Baggins

In [9]:
docs

[Document(metadata={'source': '/home/abhishek/Documents/ML/huggingface_basics/medical_chatbot/documents/lord_of_the_rings.txt'}, page_content='This book is largely concerned with Hobbits, and from its pages a\nreader may discover much of their character and a little of their\nhistory. Further information will also be found in the selection from\nthe Red Book of Westmarch that has already been published, under\nthe title of The Hobbit. That story was derived from the earlier chapters of the Red Book, composed by Bilbo himself, the first Hobbit\nto become famous in the world at large, and called by him There and\nBack Again, since they told of his journey into the East and his return:\nan adventure which later involved all the Hobbits in the great events\nof that Age that are here related.\n\nThe story of the Ring is, of course, known to many, but what most do not realize is that the power of the Ring is subtle and its evil grows with time. That is why the burden falls upon Frodo Baggins

In [5]:
embedded_docs = [
    {
        "metadata": doc.metadata,
        "embedding": hf_embeddings.embed_query(doc.page_content)
    }
    for doc in docs
]

In [12]:
for embedded_doc in embedded_docs:
    print("Metadata: ", embedded_doc["metadata"])
    print("Embeddings: ", embedded_doc["embedding"])

Metadata:  {'source': '/home/abhishek/Documents/ML/huggingface_basics/medical_chatbot/documents/lord_of_the_rings.txt'}
Embeddings:  [0.03666343167424202, -0.04299798235297203, -0.01390075497329235, -0.012846742756664753, -0.04110097512602806, 0.037890009582042694, -0.020233366638422012, -0.018063412979245186, 0.05828366428613663, -0.019310101866722107, 0.04552534967660904, 0.04836831986904144, -0.023791352286934853, -0.03437338024377823, 0.013329814188182354, -0.07061606645584106, -0.03437747806310654, -0.042863037437200546, -0.01582980528473854, 0.010573741048574448, 0.016649533063173294, 0.031004054471850395, -0.012759371660649776, 0.014673160389065742, -0.01861305721104145, -0.030547956004738808, -0.025599787011742592, 0.046552874147892, -0.03287836164236069, -0.06762333959341049, 0.013369446620345116, -0.03768245503306389, 0.03246753662824631, -0.02129591815173626, 2.328670461793081e-06, -0.021577361971139908, 0.023137707263231277, 0.02646579034626484, -0.020076381042599678, -0.06

In [30]:
# Storing the data in Vector Database ie chroma
vector_store = Chroma.from_documents(docs, embedding=hf_embeddings)
query = "where did fordo meet gandalf?"
# results = vector_store.similarity_search(query, k=2)

# for result in results:
#     print("Source: ", result.metadata["source"])
#     print("Content: ", result.page_content)

retriever = vector_store.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"k":3, "score_threshold":0.},
)

relavant_docs = retriever.invoke(query)

for i, doc in enumerate(relavant_docs,1):
    print(f"Document {i}:\n{doc.page_content}")
    if doc.metadata:
        print(f"Source: {doc.metadata.get("source", "unknown")}")

No relevant docs were retrieved using the relevance score threshold 0.5
