In [1]:
from langchain.embeddings import HuggingFaceBgeEmbeddings

def get_embedding_function():
     
    model_name = "BAAI/bge-base-en"
    encode_kwargs = {'normalize_embeddings': True}   
    
    embeddings = HuggingFaceBgeEmbeddings(
        model_name=model_name,
        encode_kwargs=encode_kwargs
    )
    return embeddings

In [2]:
import dotenv
dotenv.load_dotenv()
import argparse
import os
import shutil
from pathlib import Path
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema.document import Document
from langchain_chroma import Chroma
from unstructured.partition.auto import partition


CHROMA_PATH = "./chroma1"
DATA_PATH = r"C:\Users\Aryan\Documents\Litmus\project\learning_system_data\uploadedfiles"


def populate_db():
     
    print(f"Script working directory: {os.getcwd()}")
    
    
    documents = load_documents()
    print(f"Loaded {len(documents)} document chunks")
    
  
    chunks = split_documents(documents)
    print(f"Split into {len(chunks)} chunks")
    
     
    add_to_chroma(chunks)


def load_documents():
    """Load documents from PDF files using Unstructured.io OCR."""
    documents = []

    for filename in os.listdir(DATA_PATH):
        if filename.endswith(".pdf"):
            print(f"Processing file: {filename}")
            file_path = Path(DATA_PATH) / filename
            
            try:
               
                text_content = extract_text_from_pdf(file_path)
                
                if text_content.strip():  # Only add if there's actual content
                    documents.append(
                        Document(
                            page_content=text_content,
                            metadata={
                                "source": str(file_path),
                                "filename": filename,
                                "extraction_method": "unstructured_ocr"
                            }
                        )
                    )
                else:
                    print(f"No text content extracted from {filename}")
                    
            except Exception as e:
                print(f"Failed to process {file_path}: {e}")

    return documents


def extract_text_from_pdf(file_path):
    
    file_path = Path(file_path)
    
    if not file_path.exists() or not file_path.is_file():
        raise ValueError(f"File {file_path} does not exist or is not a valid file.")
    
    try:
        print(f"Extracting text from: {file_path}")
        
       
        elements = partition(
            filename=str(file_path),
            ocr_languages=["eng"],  
            strategy="hi_res", 
            infer_table_structure=True,
        )
        
       
        text_content = ""
        for element in elements:
          
            element_text = str(element).strip()
            if element_text:
                text_content += element_text + "\n\n"
        
        print(f"Successfully extracted {len(text_content)} characters from {file_path.name}")
        return text_content.strip()
        
    except Exception as e:
        print(f"Error extracting text from {file_path}: {str(e)}")
        raise


def split_documents(documents: list[Document]):
    """Split documents into smaller chunks for better processing."""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=800,
        chunk_overlap=80,
        length_function=len,
        is_separator_regex=False,
    )
    return text_splitter.split_documents(documents)


def add_to_chroma(chunks: list[Document]):
    """Add document chunks to ChromaDB."""
 
    db = Chroma(
        persist_directory=CHROMA_PATH, 
        embedding_function=get_embedding_function()
    )


    chunks_with_ids = calculate_chunk_ids(chunks)

   
    existing_items = db.get(include=[]) 
    existing_ids = set(existing_items["ids"])
    print(f"Existing IDs in DB: {len(existing_ids)}")
    print(f"Number of existing documents in DB: {len(existing_ids)}")

   
    new_chunks = []
    print("Processing chunks for addition:")
    for chunk in chunks_with_ids:
        if chunk.metadata["id"] not in existing_ids:
            new_chunks.append(chunk)
            print(f"New chunk ID: {chunk.metadata['id']}")

    if len(new_chunks):
        print(f" Adding new documents: {len(new_chunks)}")
        new_chunk_ids = [chunk.metadata["id"] for chunk in new_chunks]
        db.add_documents(new_chunks, ids=new_chunk_ids)
        print("Documents added successfully")
    else:
        print(" No new documents to add")


def calculate_chunk_ids(chunks):
    """
    Calculate unique IDs for each chunk.
    Format: filename:chunk_index
    """
    chunk_counter = {}
    
    for chunk in chunks:
        source = chunk.metadata.get("source")
        filename = chunk.metadata.get("filename", "unknown")
        
        
        if filename not in chunk_counter:
            chunk_counter[filename] = 0
        else:
            chunk_counter[filename] += 1
        
      
        chunk_id = f"{filename}:{chunk_counter[filename]}"
        chunk.metadata["id"] = chunk_id

    return chunks


def clear_database():
    """Clear the ChromaDB database."""
    if os.path.exists(CHROMA_PATH):
        shutil.rmtree(CHROMA_PATH)
        print("Database cleared successfully")

populate_db()




The ocr_languages kwarg will be deprecated in a future version of unstructured. Please use languages instead.
Only one of languages and ocr_languages should be specified. languages is preferred. ocr_languages is marked for deprecation.


Script working directory: c:\Users\Aryan\Documents\Litmus\project\testing
Processing file: POP UP TOASTER  Model PT 3720_207075_User Manual.pdf
Extracting text from: C:\Users\Aryan\Documents\Litmus\project\learning_system_data\uploadedfiles\POP UP TOASTER  Model PT 3720_207075_User Manual.pdf


  from .autonotebook import tqdm as notebook_tqdm
The `max_size` parameter is deprecated and will be removed in v4.26. Please specify in `size['longest_edge'] instead`.


Successfully extracted 54407 characters from POP UP TOASTER  Model PT 3720_207075_User Manual.pdf
Loaded 1 document chunks
Split into 75 chunks


  embeddings = HuggingFaceBgeEmbeddings(


Existing IDs in DB: 0
Number of existing documents in DB: 0
Processing chunks for addition:
New chunk ID: POP UP TOASTER  Model PT 3720_207075_User Manual.pdf:0
New chunk ID: POP UP TOASTER  Model PT 3720_207075_User Manual.pdf:1
New chunk ID: POP UP TOASTER  Model PT 3720_207075_User Manual.pdf:2
New chunk ID: POP UP TOASTER  Model PT 3720_207075_User Manual.pdf:3
New chunk ID: POP UP TOASTER  Model PT 3720_207075_User Manual.pdf:4
New chunk ID: POP UP TOASTER  Model PT 3720_207075_User Manual.pdf:5
New chunk ID: POP UP TOASTER  Model PT 3720_207075_User Manual.pdf:6
New chunk ID: POP UP TOASTER  Model PT 3720_207075_User Manual.pdf:7
New chunk ID: POP UP TOASTER  Model PT 3720_207075_User Manual.pdf:8
New chunk ID: POP UP TOASTER  Model PT 3720_207075_User Manual.pdf:9
New chunk ID: POP UP TOASTER  Model PT 3720_207075_User Manual.pdf:10
New chunk ID: POP UP TOASTER  Model PT 3720_207075_User Manual.pdf:11
New chunk ID: POP UP TOASTER  Model PT 3720_207075_User Manual.pdf:12
New chun

In [20]:
import argparse
from langchain_chroma import Chroma
from langchain.prompts import ChatPromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI

CHROMA_PATH = "chroma1"
PROMPT_TEMPLATE = """
Answer the question based only on the following context:
{context}
---
Answer the question based on the above context: {question}
"""


def query_rag(query_text: str):
    
    embedding_function = get_embedding_function()
    db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)
    
     
    results = db.similarity_search_with_score(query_text, k=5)
    print(results)
    context_text = "\n\n---\n\n".join([doc.page_content for doc, _score in results])
    
    prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
    prompt = prompt_template.format(context=context_text, question=query_text)
    
  
    model = ChatGoogleGenerativeAI(
        model="gemini-2.5-flash",  
        temperature=0.9,
        convert_system_message_to_human=True
    )
    
    response_text = model.invoke(prompt)
    
    sources = [doc.metadata.get("id", None) for doc, _score in results]
    formatted_response = f"Response: {response_text}\nSources: {sources}"
    print(formatted_response)
    return response_text

response  = query_rag("how to operate usha pop up toaster?")

[(Document(id='POP UP TOASTER  Model PT 3720_207075_User Manual.pdf:10', metadata={'filename': 'POP UP TOASTER  Model PT 3720_207075_User Manual.pdf', 'id': 'POP UP TOASTER  Model PT 3720_207075_User Manual.pdf:10', 'source': 'C:\\Users\\Aryan\\Documents\\Litmus\\project\\learning_system_data\\uploadedfiles\\POP UP TOASTER  Model PT 3720_207075_User Manual.pdf', 'extraction_method': 'unstructured_ocr'}, page_content='are older than 8 and supervised.\n\nYou are now the proud owner of USHA Pop Up Toaster.\n\n4.\n\nCAUTION: During operation the appliance heats up/becomes hot and it is therefore\n\nrecommend avoid touching the metallic parts of the appliance with bare hands. Use handles\n\nInnovatively designed, using only the highest quality material and components, your USHA Pop Up Toaster is designed for years of trouble free performance.\n\nor knobs only.\n\n5.\n\nBefore storing the appliance let it cool down. Then pull the plug from the socket.\n\n6.\n\nTo protect against electrical s



Response: content='Based on the provided context, the manual describes the following operational aspects:\n\n*   **Variable browning control:** The color controller knob allows the user to get the desired crispiness of toast, with up to 7 levels.\n*   **Mid cycle cancel function:** This function allows the user to withdraw the bread being toasted from the slots at any point during the toasting process.\n*   **Bread type:** The bread slot is only applicable for toasting regular bread slices; irregular or round bread cannot be toasted.\n\nThe context does not provide explicit step-by-step instructions on the basic operation, such as how to start the toaster after placing bread or how the toast is retrieved when finished (besides the mid-cycle cancel).' additional_kwargs={} response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings': []}, 'finish_reason': 'STOP', 'model_name': 'gemini-2.5-flash', 'safety_ratings': []} id='run--3883cd75-46a8-4dbb-a7d4-a785dfb87a1a-0' usage_m

In [25]:
print("Response Content:")

print(response.content)



Response Content:
Based on the provided context, the manual describes the following operational aspects:

*   **Variable browning control:** The color controller knob allows the user to get the desired crispiness of toast, with up to 7 levels.
*   **Mid cycle cancel function:** This function allows the user to withdraw the bread being toasted from the slots at any point during the toasting process.
*   **Bread type:** The bread slot is only applicable for toasting regular bread slices; irregular or round bread cannot be toasted.

The context does not provide explicit step-by-step instructions on the basic operation, such as how to start the toaster after placing bread or how the toast is retrieved when finished (besides the mid-cycle cancel).
