In [13]:
import os
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_chroma import Chroma
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from dotenv import load_dotenv
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.messages import SystemMessage , AIMessage , HumanMessage

In [7]:
# Read settings from a local .env file before any API client is created.
# The call is the cell's last expression, so its boolean result is shown as the output.
# NOTE(review): presumably this supplies the Google API key used by the embedding
# client in later cells - confirm against the .env file.
load_dotenv()

True

In [3]:
# Every location is resolved relative to the directory the kernel was started in.
current_dir = os.getcwd()
db_dir = os.path.join(current_dir, "db")
books_dir, persistent_directory = (
    os.path.join(current_dir, "documents"),  # folder holding the source .txt files
    os.path.join(db_dir, "chroma_db_with_metadata"),  # on-disk Chroma store
)

In [4]:
# Echo both resolved locations so a wrong working directory is obvious immediately.
print(
    f"Books folder: {books_dir}",
    f"Chroma DB folder: {persistent_directory}",
    sep="\n",
)

Books folder: c:\Users\ahmad\OneDrive\Desktop\Langchian AI Agents\4_Rag\documents
Chroma DB folder: c:\Users\ahmad\OneDrive\Desktop\Langchian AI Agents\4_Rag\db\chroma_db_with_metadata


In [8]:
# Build the persistent Chroma vector store once; later runs reuse the copy on disk.
if not os.path.exists(persistent_directory):
    print("Vector store not found. Initializing")

    # The folder holding the source .txt files must exist before anything is loaded.
    if not os.path.exists(books_dir):
        raise FileNotFoundError(f"Missing folder: {books_dir}")

    # Sort the listing: os.listdir() returns entries in arbitrary order, and a fixed
    # order keeps the ingestion sequence reproducible between runs.
    book_files = sorted(f for f in os.listdir(books_dir) if f.endswith(".txt"))

    # Fail before any store is created. Otherwise an empty folder could leave an empty
    # store directory behind, and every later run would take the "already exists" branch.
    if not book_files:
        raise FileNotFoundError(f"No .txt files found in: {books_dir}")

    # Load each file and tag its documents with the bare filename, so retrieval
    # results can report which book they came from.
    documents = []
    for book_file in book_files:
        file_path = os.path.join(books_dir, book_file)
        loader = TextLoader(file_path, encoding="utf-8")
        for doc in loader.load():
            doc.metadata = {"source": book_file}  # replaces the loader's own metadata
            documents.append(doc)

    # Split the text into smaller chunks.
    # NOTE(review): chunk_size is not a hard limit with this splitter - the captured
    # output shows chunks well above 1000 characters. Presumably it only cuts at its
    # separator; confirm, and consider a recursive splitter if a strict cap is needed.
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    docs = text_splitter.split_documents(documents)
    print(f"\nTotal text chunks created: {len(docs)}")

    # Nothing to index (e.g. every file was empty): stop before creating the store.
    if not docs:
        raise ValueError(f"No text chunks were produced from the files in: {books_dir}")

    # Create the embedding client.
    # NOTE(review): the embedding requests themselves presumably happen inside
    # Chroma.from_documents below, not at construction time - confirm.
    print("\nGenerating embeddings...")
    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
    print("\nEmbeddings ready.")

    # Embed the chunks and persist them, with their metadata, in the Chroma store.
    print("Saving documents to vector store...")
    db = Chroma.from_documents(docs, embeddings, persist_directory=persistent_directory)
    print("\nVector store saved successfully!")

else:
    print("\nVector store already exists. Skipping initialization.")

Created a chunk of size 1184, which is longer than the specified 1000
Created a chunk of size 1045, which is longer than the specified 1000
Created a chunk of size 1132, which is longer than the specified 1000
Created a chunk of size 1674, which is longer than the specified 1000
Created a chunk of size 1610, which is longer than the specified 1000
Created a chunk of size 1562, which is longer than the specified 1000
Created a chunk of size 1063, which is longer than the specified 1000
Created a chunk of size 1543, which is longer than the specified 1000
Created a chunk of size 2597, which is longer than the specified 1000
Created a chunk of size 2613, which is longer than the specified 1000
Created a chunk of size 1079, which is longer than the specified 1000
Created a chunk of size 1251, which is longer than the specified 1000
Created a chunk of size 1534, which is longer than the specified 1000
Created a chunk of size 1323, which is longer than the specified 1000
Created a chunk of s

Vector store not found. Initializing


Created a chunk of size 1120, which is longer than the specified 1000
Created a chunk of size 1338, which is longer than the specified 1000
Created a chunk of size 1202, which is longer than the specified 1000
Created a chunk of size 2962, which is longer than the specified 1000
Created a chunk of size 1176, which is longer than the specified 1000
Created a chunk of size 1005, which is longer than the specified 1000
Created a chunk of size 2145, which is longer than the specified 1000
Created a chunk of size 1656, which is longer than the specified 1000
Created a chunk of size 1149, which is longer than the specified 1000
Created a chunk of size 2411, which is longer than the specified 1000
Created a chunk of size 1697, which is longer than the specified 1000
Created a chunk of size 1560, which is longer than the specified 1000
Created a chunk of size 1606, which is longer than the specified 1000
Created a chunk of size 1210, which is longer than the specified 1000
Created a chunk of s


Total text chunks created: 1724

Generating embeddings...

Embeddings ready.
Saving documents to vector store...

Vector store saved successfully!


# **Part 2: Querying the Vector Store**

In [9]:
# Recompute the store location here so this part does not rely on the path
# variables defined by the initialization cells.
current_dir = os.getcwd()  # the kernel's working directory; no __file__ needed
db_path = os.path.join(os.path.join(current_dir, "db"), "chroma_db_with_metadata")

In [10]:
# Refuse to open a store that was never built.
# NOTE(review): opening a missing path presumably creates an empty store there (confirm).
# Queries would then silently return nothing, and the initialization cell's existence
# check would skip the build on later runs.
if not os.path.exists(db_path):
    raise FileNotFoundError(
        f"Vector store not found at: {db_path}. Run the initialization cell first."
    )

# Use the same embedding model the store was built with, so query vectors are
# comparable to the stored ones.
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

# Open the existing on-disk store; nothing is re-embedded here.
db = Chroma(persist_directory=db_path, embedding_function=embeddings)

In [11]:
query = "Where is Dracula's castle located?"

# Build a retriever that only returns chunks whose relevance score clears a threshold.
retriever = db.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"k": 3, "score_threshold": 0.2},  # at most 3 hits, each scoring at least 0.2
)

# Run the search for the query.
results = retriever.invoke(query)

# Show each hit with the book it came from. When nothing clears the threshold, say so
# explicitly instead of printing a "found" header over an empty list.
if not results:
    print("\nNo relevant documents found.")
else:
    print("\nRelevant Documents Found")
    for i, doc in enumerate(results, 1):
        print(f"\nDocument {i}:")
        print(doc.page_content)
        print("Source:", doc.metadata.get("source", "Unknown"))


Relevant Documents Found

Document 1:
Having had some time at my disposal when in London, I had visited the
British Museum, and made search among the books and maps in the library
regarding Transylvania; it had struck me that some foreknowledge of the
country could hardly fail to have some importance in dealing with a
nobleman of that country. I find that the district he named is in the
extreme east of the country, just on the borders of three states,
Transylvania, Moldavia and Bukovina, in the midst of the Carpathian
mountains; one of the wildest and least known portions of Europe. I was
not able to light on any map or work giving the exact locality of the
Castle Dracula, as there are no maps of this country as yet to compare
with our own Ordnance Survey maps; but I found that Bistritz, the post
town named by Count Dracula, is a fairly well-known place. I shall enter
here some of my notes, as they may refresh my memory when I talk over my
travels with Mina.
Source: Dracula.txt

Docum