In [1]:
import os

from dotenv import load_dotenv
from langchain.text_splitter import (
    CharacterTextSplitter,
    RecursiveCharacterTextSplitter,
    SentenceTransformersTokenTextSplitter,
    TextSplitter,
    TokenTextSplitter,
)
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import Chroma
from langchain_google_genai import (
    ChatGoogleGenerativeAI,
    GoogleGenerativeAIEmbeddings,
)

In [2]:
# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this supplies the Google API key used by the
# embedding client created further down — confirm.
load_dotenv()

True

In [3]:
# Locations used by the cells below: the working folder, the source text,
# and the parent folder under which each vector store gets its own sub-directory.
current_dir = "./"
file_path = os.path.join(current_dir, "frankenstein.txt")
db_dir = os.path.join(current_dir, "db")

In [4]:
# Stop early with a clear error if the source text is not on disk.
if not os.path.exists(file_path):
    raise FileNotFoundError(f"The file {file_path} does not exist.")

In [6]:
# Load the text file (decoded as UTF-8) into the `documents` list that
# every splitter below consumes.
loader = TextLoader(
    file_path,
    encoding="utf8",
)
documents = loader.load()

In [7]:
# Embedding model shared by every vector store that is built or queried below.
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")


In [8]:
# Function to create and persist a vector store
def create_vector_store(docs, store_name):
    """Create a persisted Chroma store for ``docs`` under ``db_dir/store_name``.

    The documents are embedded with the module-level ``embeddings`` object.
    If the target directory already exists, nothing is built and a notice
    is printed instead.

    Args:
        docs: Documents to embed and store.
        store_name: Name of the sub-directory of ``db_dir`` that holds the store.
    """
    store_path = os.path.join(db_dir, store_name)

    # Guard clause: an existing directory is treated as an existing store.
    if os.path.exists(store_path):
        print(
            f" Vector store {store_name} already exists."
        )
        return

    print(f"\n --- Creating vector store {store_name} ---")
    Chroma.from_documents(docs, embeddings, persist_directory=store_path)
    print(f" --- Finished creating vector store {store_name} ---")

 1. Character based Splitting

In [9]:
# 1. Character-based splitting
# CharacterTextSplitter is configured for chunks of about 1000 characters with
# 100 characters of overlap between neighbouring chunks.
# NOTE(review): the recorded output warns about chunks longer than 1000
# characters, so chunk_size is not a hard cap here — presumably because the
# text is only cut at the splitter's separator; confirm.

print("\n -- Using Chracter-based Spliiting --")
char_splitter = CharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
)
char_docs = char_splitter.split_documents(documents)
create_vector_store(docs=char_docs, store_name="chroma_db_char")

Created a chunk of size 2200, which is longer than the specified 1000
Created a chunk of size 1782, which is longer than the specified 1000
Created a chunk of size 2255, which is longer than the specified 1000
Created a chunk of size 1060, which is longer than the specified 1000
Created a chunk of size 1497, which is longer than the specified 1000
Created a chunk of size 1243, which is longer than the specified 1000
Created a chunk of size 1493, which is longer than the specified 1000
Created a chunk of size 1112, which is longer than the specified 1000
Created a chunk of size 1614, which is longer than the specified 1000
Created a chunk of size 1201, which is longer than the specified 1000
Created a chunk of size 1201, which is longer than the specified 1000
Created a chunk of size 1105, which is longer than the specified 1000
Created a chunk of size 1216, which is longer than the specified 1000
Created a chunk of size 1339, which is longer than the specified 1000
Created a chunk of s


 -- Using Chracter-based Spliiting --

 --- Creating vector store chroma_db_char ---
 --- Finished creating vector store chroma_db_char ---


2. Sentence-based Splitting

In [10]:
# Despite its name, SentenceTransformersTokenTextSplitter does not cut at
# sentence boundaries: it tokenizes the text with a sentence-transformers
# model's tokenizer and emits fixed-size windows of tokens.
# Chunk length is set by `tokens_per_chunk` (not `chunk_size`); left unset
# here, it defaults to the model's maximum sequence length.
print("\n -- Using sentence based Splitting -- ")
sent_splitter = SentenceTransformersTokenTextSplitter()
sent_docs = sent_splitter.split_documents(documents)
create_vector_store(sent_docs, "chroma_db_sent")


 -- Using sentence based Splitting -- 


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]


 --- Creating vector store chroma_db_sent ---
 --- Finished creating vector store chroma_db_sent ---


3. Token-based Splitting

In [12]:
# Chunk sizes are measured in tokens (words/sub-words) rather than characters:
# chunk_size=512 tokens, with 56 tokens of overlap between consecutive chunks.
# Handy when the downstream transformer model enforces a strict token limit.
print("\n -- Token based Splitting -- ")
token_splitter = TokenTextSplitter(
    chunk_size=512,
    chunk_overlap=56,
)
token_docs = token_splitter.split_documents(documents)
create_vector_store(docs=token_docs, store_name="chroma_db_token")


 -- Token based Splitting -- 

 --- Creating vector store chroma_db_token ---
 --- Finished creating vector store chroma_db_token ---


 4. Recursive Character-based Splitting

In [13]:
# Tries to cut at natural boundaries (paragraphs, sentences) while staying
# within the configured character limit (1000, with 100 characters of overlap).
# A compromise between keeping passages coherent and respecting the size limit.
print("\n -- Recursive Character based Splitting --")
rec_char_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
)
rec_char_docs = rec_char_splitter.split_documents(documents)
create_vector_store(docs=rec_char_docs, store_name="chroma_db_rec_char")


 -- Recursive Character based Splitting --

 --- Creating vector store chroma_db_rec_char ---
 --- Finished creating vector store chroma_db_rec_char ---


 5. Custom Splitting

In [15]:
# Subclassing TextSplitter lets the notebook supply its own splitting rule.
# Useful for documents with a unique structure that standard splitters can't handle.
print("\n -- Using Custom Splitting -- ")


class CustomTextSplitter(TextSplitter):
    """Paragraph splitter: one chunk per blank-line-separated block of text."""

    def split_text(self, text):
        """Split ``text`` on blank lines and return the non-empty pieces.

        Args:
            text: The raw text to split.

        Returns:
            A list of chunks in document order. Pieces that are empty or
            contain only whitespace are skipped.
        """
        # Runs of several blank lines make str.split("\n\n") produce empty or
        # whitespace-only pieces; without this filter they would be turned
        # into blank documents and embedded like any other chunk.
        return [piece for piece in text.split("\n\n") if piece.strip()]


custom_splitter = CustomTextSplitter()
custome_docs = custom_splitter.split_documents(documents)
create_vector_store(custome_docs, "chroma_db_custom")


 -- Using Custom Splitting -- 

 --- Creating vector store chroma_db_custom ---
 --- Finished creating vector store chroma_db_custom ---


In [22]:
# Function to query a vector store
def query_vector_store(store_name, query, search_type="similarity", search_kwargs=None):
    """Retrieve and print the documents most relevant to ``query`` from one store.

    Args:
        store_name: Sub-directory of ``db_dir`` holding the persisted Chroma store.
        query: Text to search for.
        search_type: Search strategy passed to ``as_retriever``; defaults to
            ``"similarity"``.
        search_kwargs: Options passed to ``as_retriever``; defaults to
            ``{"k": 5}``.

    Returns:
        None. Matches are printed; if the store directory is missing, a
        notice is printed instead.
    """
    persistent_directory = os.path.join(db_dir, store_name)

    # Guard clause: nothing to query when the store was never created.
    if not os.path.exists(persistent_directory):
        print(f"Vector store {store_name} does  not exists")
        return

    # Build the default per call so no dict is shared between invocations.
    if search_kwargs is None:
        search_kwargs = {"k": 5}

    print(f"\n -- Querying the Vector Store {store_name} -- ")
    db = Chroma(
        persist_directory=persistent_directory,
        embedding_function=embeddings,
    )
    retriever = db.as_retriever(
        search_type=search_type,
        search_kwargs=search_kwargs,
    )
    relevant_docs = retriever.invoke(query)

    # Results
    print(f"\n -- Relevant Documents for {store_name} -- ")
    for i, doc in enumerate(relevant_docs, 1):
        print(f"Document {i}:\n{doc.page_content}\n")
        # Only show a source line when the document carries metadata.
        if doc.metadata:
            print(f"Source : {doc.metadata.get('source','Unknown')}\n")

In [23]:
# Question sent to each vector store below.
# NOTE(review): the corpus loaded above is frankenstein.txt and the passages
# retrieved for this query are about Justine — "Juliet" looks like a leftover
# from a different text; confirm the intended name.
query = "How did Juliet die?"


In [24]:
# Query the store built from the CharacterTextSplitter chunks.
query_vector_store("chroma_db_char", query)


 -- Querying the Vector Store chroma_db_char -- 

 -- Relevant Documents for chroma_db_char -- 
Document 1:
Thus the poor sufferer tried to comfort others and herself. She indeed
gained the resignation she desired. But I, the true murderer, felt the
never-dying worm alive in my bosom, which allowed of no hope or
consolation. Elizabeth also wept and was unhappy, but hers also was
the misery of innocence, which, like a cloud that passes over the fair
moon, for a while hides but cannot tarnish its brightness. Anguish and
despair had penetrated into the core of my heart; I bore a hell within
me which nothing could extinguish. We stayed several hours with
Justine, and it was with great difficulty that Elizabeth could tear
herself away. “I wish,” cried she, “that I were to die with you; I
cannot live in this world of misery.”

Source : ./frankenstein.txt

Document 2:
Justine assumed an air of cheerfulness, while she with difficulty
repressed her bitter tears. She embraced Elizabeth and said

In [25]:
# Query the store built from the CustomTextSplitter chunks.
query_vector_store("chroma_db_custom", query)


 -- Querying the Vector Store chroma_db_custom -- 

 -- Relevant Documents for chroma_db_custom -- 
Document 1:
Thus the poor sufferer tried to comfort others and herself. She indeed
gained the resignation she desired. But I, the true murderer, felt the
never-dying worm alive in my bosom, which allowed of no hope or
consolation. Elizabeth also wept and was unhappy, but hers also was
the misery of innocence, which, like a cloud that passes over the fair
moon, for a while hides but cannot tarnish its brightness. Anguish and
despair had penetrated into the core of my heart; I bore a hell within
me which nothing could extinguish. We stayed several hours with
Justine, and it was with great difficulty that Elizabeth could tear
herself away. “I wish,” cried she, “that I were to die with you; I
cannot live in this world of misery.”

Source : ./frankenstein.txt

Document 2:
Justine shook her head mournfully. “I do not fear to die,” she said;
“that pang is past. God raises my weakness and gives