In [None]:
#!pip install langchain-huggingface

In [2]:
import os
import shutil
from dotenv import load_dotenv
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_mistralai import ChatMistralAI
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from transformers import AutoTokenizer,AutoModelForCausalLM
from mistralai import Mistral
import chromadb
import gc
import time
from langchain.chat_models import ChatOpenAI
from langchain_huggingface import HuggingFacePipeline
import gradio as gr

In [3]:
load_dotenv()

True

In [4]:
api_key=os.environ["mistral_api_key"]
client=Mistral(api_key=api_key)

In [5]:
SCRIPT_DIR = os.path.dirname(os.path.abspath("Finance_Bill_259(RAG).pdf"))
DATA_DIR=os.path.join(SCRIPT_DIR, "Finance_Bill_259(RAG).pdf")
VECTOR_DB_DIR=os.path.join(SCRIPT_DIR,"chroma_db")
#avoiding hardcoding absolute paths that might break on other machines if you move your script by use of script_dir and os.path.join for code portability

if not os.path.exists(DATA_DIR):
    raise FileNotFoundError(f"PDF file not found at: {DATA_DIR}")
if not os.path.exists(VECTOR_DB_DIR):
    os.makedirs(VECTOR_DB_DIR)

print(f"DATA_DIR:",{DATA_DIR})
print(f"VECTOR_DB_DIR:",{VECTOR_DB_DIR})



DATA_DIR: {'C:\\Users\\USER\\Documents\\Lux_assign01\\RAG2_ss\\Finance_Bill_259(RAG).pdf'}
VECTOR_DB_DIR: {'C:\\Users\\USER\\Documents\\Lux_assign01\\RAG2_ss\\chroma_db'}


In [6]:
EMBEDDING_MAX_TOKENS = 256
CHUNK_SIZE_TOKENS = 200 # A bit less than 256 to give buffer
CHUNK_OVERLAP_TOKENS = 50 

In [7]:

def clear_vector_store(directory, max_attempts=3, delay=2):
    for attempt in range(max_attempts):
        try:
            # Ensure no open connections
            gc.collect()
            if os.path.exists(directory):
                shutil.rmtree(directory)
                print(f"Cleared directory: {directory}")
            return
        except PermissionError as e:
            print(f"Attempt {attempt + 1}/{max_attempts} failed: {e}")
            time.sleep(delay)
    raise PermissionError(f"Failed to clear directory {directory} after {max_attempts} attempts")

In [8]:
def ingest_data():
    print(f"Loading documents from {DATA_DIR}...")
    if not os.path.exists(DATA_DIR):
        print(f"PDF file not found at {DATA_DIR}")
        raise FileNotFoundError(f"PDF not found at {DATA_DIR}")
    
    try:
        loader=PyPDFLoader(DATA_DIR)
        documents=loader.load()
        print(f"loaded {len(documents)} documents successfuly...")
    except Exception as e:
        print(f"Failed to load PDF: {e}")
        raise

    print("Splitting documents into chunks...")#splitting into chunks
    #token-based splitting to respct model limits

    model_name="mistralai/Mistral-7B-Instruct-v0.2"
    tokenizer= AutoTokenizer.from_pretrained(model_name,token=os.getenv("hf_token_key"))    
    
    def token_length(text):
        return len(tokenizer.encode(text,add_special_tokens=True, truncation=False))#defining token length based on the embedding tokenizer
        
    text_splitter=RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE_TOKENS, #max size of each chunk
        chunk_overlap=CHUNK_OVERLAP_TOKENS, #overlap between chunks to maintain context
        length_function=token_length, #custom fxn
        is_separator_regex=False, #standard separators
    )

    chunks=text_splitter.split_documents(documents)
    print(f"Split into {len(chunks)} chunks sucessfully.")
    
    #creating embeddings and vectordb storage.
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2",
                                       model_kwargs={"device":"cpu"},#were GPU available, "cuda" would be well-suited
                                       encode_kwargs={"normalize_embeddings":True,
                                                        "truncate": True, #ensure truncation if a chunk somehow still exceeds
                                                        "max_length":EMBEDDING_MAX_TOKENS,
                                                        "batch_size":32})#truncate to 512 texts
    
    print("Creating embeddings and instoring to ChromaDB...")

    #creating ChromaDB vector store to load the database in the specified directory
    os.makedirs(VECTOR_DB_DIR,exist_ok=True)

    try:
        valid_chunks=[]
        for chunk in chunks:
            tokens=token_length(chunk.page_content)
            if tokens>EMBEDDING_MAX_TOKENS:
                print(f"Truncating chunk with {tokens} tokens")
                chunk.page_content=tokenizer.decode(tokenizer.encode(chunk.page_content,max_length=256,truncation=True))
            valid_chunks.append(chunk)

        print(f"Number of valid chunks: {len(valid_chunks)}")

        chroma_client = chromadb.PersistentClient(
            path=VECTOR_DB_DIR
        )

        vectorstore=Chroma.from_documents(
            documents=chunks,
            embedding=embeddings,
            persist_directory=VECTOR_DB_DIR,
            client=chroma_client
        )

        print(f"Embeddings stored in {VECTOR_DB_DIR}")
        print(f"Number of documents in vector store is:",vectorstore._collection.count())

        if vectorstore._collection.count()==0:
            print("No documents stored in ChromaDB")
            raise ValueError("Ingestion failed:No documents stored!")        
        
    except Exception as e:
        print(f"Failed to store embeddings:{e}")
        raise

    if vectorstore._collection.count()>0:
        print(f"Ingestion step successful to {VECTOR_DB_DIR} directory successfully...")
    else:
        print("Ingestion step failed to store embeddings.ERROR!")

In [9]:


VECTOR_DB_DIR="chroma_db/" #directory where the ChromaDB is stored.

def setup_rag_chain():
    print(f"Loading vector db from {VECTOR_DB_DIR}...")

    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2",
                                       model_kwargs={"device":"cpu"},
                                       encode_kwargs={"normalize_embeddings":True,
                                                        "truncate": True, 
                                                        "max_length":EMBEDDING_MAX_TOKENS,
                                                        "batch_size":32})
    
    vectorstore=Chroma(
        embedding_function=embeddings,
        persist_directory=VECTOR_DB_DIR
    )#to load the existing chromaDB,note the chunks and their embeddings from the chroma

    count=vectorstore._collection.count()
    print(f"Vector store contains {count} documents.")

    if count==0:
        print("Vector store is empty! Ingest data first.")
        raise ValueError("Chroma database is empty!!!.")
    
    retriever=vectorstore.as_retriever(search_kwargs={"k":3})#retrieves the top n relevant chunks.

    print("Initializing LLM...")#now initializing the llm
    llm=ChatMistralAI(model="mistral-small-latest",api_key=os.getenv("mistral_api_key"),
        max_tokens=200)
    
    # This is the formatting function
    format_docs = (lambda docs: "\n\n".join(doc.page_content for doc in docs))

    #prompt engineering
    template=("""I am an AI assistant that answers your questions based off of the The Proposed Financ Bill 25/26.
        context:{context}
        question:{question}
        Answer:
        """)
    prompt=ChatPromptTemplate.from_template(template)#this' the prompt dor context and question

    rag_chain=(
        {"context":retriever|format_docs,
         "question":RunnablePassthrough()}
         |prompt
         |llm
         |StrOutputParser()
    )
    print("RAG Chain setup complete!")
    return rag_chain

rag_chain=None#Global variable to store the RAG chain

def gradio_chat(user_input,history):
    try:
        response=rag_chain.invoke(user_input)
        return response
    except Exception as e:
        error_msg=(f"Error:{str(e)}")
        return error_msg
    

In [None]:


if __name__=="__main__":  
    #clear vector storage
    try:
        print("Clearing vector db storage...")
        clear_vector_store(directory="C:/Users/USER/Documents/Lux_assign01/RAG2_ss/chroma_db")
    except Exception as e:
        print(f"Error clearind vector db storage: {e}.")
        raise
    
    #run ingestion
    try:
        print("Running ingestion...")
        ingest_data()
    except Exception as e:
        print(f"Error ingesting data: {e}")
        raise

    #run RAG chain
    try:
        rag_chain=setup_rag_chain()
    except Exception as e:
        print(f"Setup Error: {e}.!!!")
        raise

    ## Step 4: Launching Gradio Chat
    print("Launching Gradio chat UI...")
    gr.ChatInterface(fn=gradio_chat,
                     title="📘 Finance Bill Chatbot",
                     description="Ask questions about the Finance Bill 2025/2026.",
                     chatbot=gr.Chatbot(height=400),
                     theme="default").launch(share=True)


Clearing vector db storage...
Cleared directory: C:/Users/USER/Documents/Lux_assign01/RAG2_ss/chroma_db
Running ingestion...
Loading documents from C:\Users\USER\Documents\Lux_assign01\RAG2_ss\Finance_Bill_259(RAG).pdf...
loaded 135 documents successfuly...
Splitting documents into chunks...
Split into 616 chunks sucessfully.
Creating embeddings and instoring to ChromaDB...
Number of valid chunks: 616


Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given


Embeddings stored in chroma_db/
Number of documents in vector store is: 616
Ingestion step successful to chroma_db/ directory successfully...
Loading vector db from chroma_db/...


Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given
  chatbot=gr.Chatbot(height=400),


Vector store contains 616 documents.
Initializing LLM...
RAG Chain setup complete!
Launching Gradio chat UI...
* Running on local URL:  http://127.0.0.1:7860
* Running on public URL: https://43c302a8c6f5d823ca.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Failed to send telemetry event CollectionQueryEvent: capture() takes 1 positional argument but 3 were given
