## ADD new document into VectoreStore

In [6]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [15]:
os.environ["GROQ_API_KEY"]=os.getenv("GROQ_API_KEY")
os.environ["OPENAI_API_KEY"]=os.getenv("OPENAI_API_KEY")

In [17]:
from langchain_core.documents import Document
from typing import List
class SmartPDFProcessor:
    """Advanced PDF processing with error handling"""
    def __init__(self,chunk_size=1000,chunk_overlap=100):
        self.chunk_size=chunk_size,
        self.chunk_overlap=chunk_overlap,
        self.text_splitter=RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separators=[" "],

        )

    def process_pdf(self,pdf_path:str)->List[Document]:
        """Process PDF with smart chunking and metadata enhancement"""

        # Laod PDF

        loader=PyPDFLoader(pdf_path)
        pages=loader.load()

        ## Process each page

        processed_chunks=[]

        for page_num,page in enumerate(pages):
            ## clean text
            cleaned_text=self._clean_text(page.page_content)

            # Skip nearly empty pages
            if len(cleaned_text.strip()) < 50:
                continue

            # Create chunks with enhanced metadata
            chunks = self.text_splitter.create_documents(
                texts=[cleaned_text],
                metadatas=[{
                    **page.metadata,
                    "page": page_num + 1,
                    "total_pages": len(pages),
                    "chunk_method": "smart_pdf_processor",
                    "char_count": len(cleaned_text)
                }]
            )
            
            processed_chunks.extend(chunks)

        return processed_chunks

    def _clean_text(self, text: str) -> str:
        """Clean extracted text"""
        # Remove excessive whitespace
        text = " ".join(text.split())
        
        # Fix common PDF extraction issues
        text = text.replace("ﬁ", "fi")
        text = text.replace("ﬂ", "fl")
        
        return text

    
            


In [18]:
preprocessor=SmartPDFProcessor()
preprocessor

<__main__.SmartPDFProcessor at 0x1bea7962b50>

In [19]:
## Process a PDF if available
try:
    smart_chunks=preprocessor.process_pdf("../data/procedure.pdf")
    print(f"Processed into {len(smart_chunks)} smart chunks")

    # Show enhanced metadata
    if smart_chunks:
        print("\nSample chunk metadata:")
        for key, value in smart_chunks[0].metadata.items():
            print(f"  {key}: {value}")

except Exception as e:
    print(f"Processing error: {e}")

Processed into 12 smart chunks

Sample chunk metadata:
  producer: pdfTeX-1.40.27
  creator: LaTeX with hyperref
  creationdate: 2026-02-19T10:44:37+00:00
  author: 
  keywords: 
  moddate: 2026-02-19T10:44:37+00:00
  ptex.fullbanner: This is pdfTeX, Version 3.141592653-2.6-1.40.27 (TeX Live 2025) kpathsea version 6.4.1
  subject: 
  title: 
  trapped: /False
  source: ../data/procedure.pdf
  total_pages: 6
  page: 1
  page_label: 1
  chunk_method: smart_pdf_processor
  char_count: 1185


In [20]:
import os
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document

persist_directory = ".././outpot_db"

# 1️⃣ Embeddings
embeddings = OpenAIEmbeddings()

# 2️⃣ Load or create vectorstore
try:
    vectorstore = Chroma(
        persist_directory=persist_directory,
        embedding_function=embeddings
    )
    print("✅ Vector store loaded.")
except Exception as e:
    print("⚠️ Creating new vector store...")
    vectorstore = Chroma(
        persist_directory=persist_directory,
        embedding_function=embeddings
    )
    vectorstore.persist()

# 3️⃣ add to vectore store
vectorstore.add_documents(smart_chunks)

# 6️⃣ Persist changes
vectorstore.persist()

print("✅ Chunks added successfully.")


✅ Vector store loaded.
✅ Chunks added successfully.


In [26]:
count = vectorstore._collection.count()
print(f"Number of vectors in database: {count}")


Number of vectors in database: 24


#### Start From scratch

In [27]:
import os
from langchain.chat_models import init_chat_model
from langchain.prompts import PromptTemplate
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.output_parsers import StrOutputParser
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.runnables import RunnableSequence
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from dotenv import load_dotenv
load_dotenv()

True

In [28]:
from langchain_community.document_loaders import DirectoryLoader,TextLoader

# Load documents from directory
dir_loader=DirectoryLoader(
    "../data",
    glob="**/*.txt", ## Pattern to match files  
    loader_cls= TextLoader, ##loader class to use
    loader_kwargs={'encoding': 'utf-8'},
    show_progress=True

)
documents = dir_loader.load()

print(f"Loaded {len(documents)} documents")
print(f"\nFirst document preview:")
print(documents[0].page_content[:200] + "...")

100%|██████████| 11/11 [00:00<00:00, 74.42it/s]

Loaded 11 documents

First document preview:
ISO 13485:2016
Clause: 4.1.1
Title: General Quality Management System Requirements

SUMMARY:
Clause 4.1.1 requires the organization to establish, document, implement, and maintain a Quality Management...





In [29]:
# Initialize text splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,  # Maximum size of each chunk
    chunk_overlap=150,  # Overlap between chunks to maintain context
    length_function=len,
    separators=[" "]  # Hierarchy of separators
)
chunks=text_splitter.split_documents(documents)

print(f"Created {len(chunks)} chunks from {len(documents)} documents")
print(f"\nChunk example:")
print(f"Content: {chunks[0].page_content[:150]}...")
print(f"Metadata: {chunks[0].metadata}")

Created 11 chunks from 11 documents

Chunk example:
Content: ISO 13485:2016
Clause: 4.1.1
Title: General Quality Management System Requirements

SUMMARY:
Clause 4.1.1 requires the organization to establish, docu...
Metadata: {'source': '..\\data\\iso_13485\\4.1.1_general_qms_requirement.txt'}


In [30]:
sample_text="MAchine LEarning is fascinating"
embeddings=OpenAIEmbeddings()
embeddings

OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x000001BEA57F62E0>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x000001BEA57F6370>, model='text-embedding-ada-002', dimensions=None, deployment='text-embedding-ada-002', openai_api_version=None, openai_api_base=None, openai_api_type=None, openai_proxy=None, embedding_ctx_length=8191, openai_api_key=SecretStr('**********'), openai_organization=None, allowed_special=None, disallowed_special=None, chunk_size=1000, max_retries=2, request_timeout=None, headers=None, tiktoken_enabled=True, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False, default_headers=None, default_query=None, retry_min_seconds=4, retry_max_seconds=20, http_client=None, http_async_client=None, check_embedding_ctx_length=True)

In [32]:
## Create a Chromdb vector store
persist_directory=".././myDb"

## Initialize Chromadb with Open AI embeddings
vectorstore=Chroma.from_documents(
    documents=chunks,
    embedding=OpenAIEmbeddings(),
    persist_directory=persist_directory,
    collection_name="rag_collection"

)

print(f"Vector store created with {vectorstore._collection.count()} vectors")
print(f"Persisted to: {persist_directory}")

Vector store created with 11 vectors
Persisted to: .././myDb


In [33]:
from langchain_core.documents import Document
from typing import List
class SmartPDFProcessor:
    """Advanced PDF processing with error handling"""
    def __init__(self,chunk_size=1500,chunk_overlap=150):
        self.chunk_size=chunk_size,
        self.chunk_overlap=chunk_overlap,
        self.text_splitter=RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separators=[" "],

        )

    def process_pdf(self,pdf_path:str)->List[Document]:
        """Process PDF with smart chunking and metadata enhancement"""

        # Laod PDF

        loader=PyPDFLoader(pdf_path)
        pages=loader.load()

        ## Process each page

        processed_chunks=[]

        for page_num,page in enumerate(pages):
            ## clean text
            cleaned_text=self._clean_text(page.page_content)

            # Skip nearly empty pages
            if len(cleaned_text.strip()) < 50:
                continue

            # Create chunks with enhanced metadata
            chunks = self.text_splitter.create_documents(
                texts=[cleaned_text],
                metadatas=[{
                    **page.metadata,
                    "page": page_num + 1,
                    "total_pages": len(pages),
                    "chunk_method": "smart_pdf_processor",
                    "char_count": len(cleaned_text)
                }]
            )
            
            processed_chunks.extend(chunks)

        return processed_chunks

    def _clean_text(self, text: str) -> str:
        """Clean extracted text"""
        # Remove excessive whitespace
        text = " ".join(text.split())
        
        # Fix common PDF extraction issues
        text = text.replace("ﬁ", "fi")
        text = text.replace("ﬂ", "fl")
        
        return text

    
            


In [34]:
preprocessor=SmartPDFProcessor()
preprocessor

<__main__.SmartPDFProcessor at 0x1beac334970>

In [35]:
## Process a PDF if available
try:
    smart_chunks=preprocessor.process_pdf("../data/procedure.pdf")
    print(f"Processed into {len(smart_chunks)} smart chunks")

    # Show enhanced metadata
    if smart_chunks:
        print("\nSample chunk metadata:")
        for key, value in smart_chunks[0].metadata.items():
            print(f"  {key}: {value}")

except Exception as e:
    print(f"Processing error: {e}")

Processed into 8 smart chunks

Sample chunk metadata:
  producer: pdfTeX-1.40.27
  creator: LaTeX with hyperref
  creationdate: 2026-02-19T10:44:37+00:00
  author: 
  keywords: 
  moddate: 2026-02-19T10:44:37+00:00
  ptex.fullbanner: This is pdfTeX, Version 3.141592653-2.6-1.40.27 (TeX Live 2025) kpathsea version 6.4.1
  subject: 
  title: 
  trapped: /False
  source: ../data/procedure.pdf
  total_pages: 6
  page: 1
  page_label: 1
  chunk_method: smart_pdf_processor
  char_count: 1185


In [39]:
smart_chunks

[Document(metadata={'producer': 'pdfTeX-1.40.27', 'creator': 'LaTeX with hyperref', 'creationdate': '2026-02-19T10:44:37+00:00', 'author': '', 'keywords': '', 'moddate': '2026-02-19T10:44:37+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.27 (TeX Live 2025) kpathsea version 6.4.1', 'subject': '', 'title': '', 'trapped': '/False', 'source': '../data/procedure.pdf', 'total_pages': 6, 'page': 1, 'page_label': '1', 'chunk_method': 'smart_pdf_processor', 'char_count': 1185}, page_content='Procedure for Control of Documents and Records Document Control Information The first page of every controlled document shall include the mandatory document control information to ensure identification, traceability, and accountability. This document is a controlled document of the Quality Management System (QMS). It shall be used only in its current approved version. Each controlled document governed by this procedure shall clearly identify: • A unique Document Identification Code

In [40]:
chunks

[Document(metadata={'source': '..\\data\\iso_13485\\4.1.1_general_qms_requirement.txt'}, page_content='ISO 13485:2016\nClause: 4.1.1\nTitle: General Quality Management System Requirements\n\nSUMMARY:\nClause 4.1.1 requires the organization to establish, document, implement, and maintain a Quality Management System (QMS). The organization must define the QMS scope and clearly identify its regulatory role.\n\nREQUIREMENTS:\n- Define the scope and perimeter of the QMS\n- Identify and document the regulatory role of the organization (manufacturer, importer, distributor, etc.)\n- Establish and maintain a documented QMS compliant with ISO 13485 and applicable regulatory requirements\n\nMANDATORY PROCESS:\nProcess Name: QMS Governance and Scope Definition\nProcess Description:\nThis process ensures that the organization formally defines its QMS scope, regulatory responsibilities, and establishes a documented system aligned with ISO 13485 requirements.\n\nREQUIRED DOCUMENTS:\n- Quality Manual\

In [None]:
## Create a Chromdb vector store
persist_directory=".././outpot_db"

## Initialize Chromadb with Open AI embeddings
vectorstore=Chroma.from_documents(
    documents=[chunks,smart_chunks],
    embedding=OpenAIEmbeddings(),
    persist_directory=persist_directory,
    collection_name="rag_collection"

)

print(f"Vector store created with {vectorstore._collection.count()} vectors")
print(f"Persisted to: {persist_directory}")

Vector store created with 22 vectors
Persisted to: .././outpot_db


In [44]:
## Create a Chromdb vector store
persist_directory=".././chroma_db"

## Initialize Chromadb with Open AI embeddings
vectorstore=Chroma.from_documents(
    documents=chunks,
    embedding=OpenAIEmbeddings(),
    persist_directory=persist_directory,
    collection_name="rag_collection"

)

print(f"Vector store created with {vectorstore._collection.count()} vectors")
print(f"Persisted to: {persist_directory}")

Vector store created with 11 vectors
Persisted to: .././chroma_db


In [47]:
count = vectorstore._collection.count()
print(f"Number of vectors in database: {count}")


Number of vectors in database: 19


In [60]:
smart_chunks[0].page_content

'Procedure for Control of Documents and Records Document Control Information The first page of every controlled document shall include the mandatory document control information to ensure identification, traceability, and accountability. This document is a controlled document of the Quality Management System (QMS). It shall be used only in its current approved version. Each controlled document governed by this procedure shall clearly identify: • A unique Document Identification Code (ID) with an established prefix according to the organization’s document coding system. •Document title. •Version or revision number. •Effective date. •Status (e.g., Draft, Approved, Obsolete). •Author. •Reviewer. •Approver. Revision Traceability Each document shall include a Revision History section indicating: •Version number •Date of revision •Description of changes introduced The Author, Reviewer, and Approver shall be clearly identified to ensure accountability throughout the document lifecycle. Only t

In [46]:
### Add new documents to vectorstore
vectorstore.add_documents(smart_chunks)



['026983ca-4912-4cb8-b1dc-f450bcd17b9e',
 '1636079e-b2ee-49ec-b975-9cef5a33664d',
 '579592c4-41ab-4ce6-a448-fb6cec12ea52',
 '38be7d73-6a4b-4ccf-ad2e-cee760e7c301',
 'f0f15914-4a13-44da-98bd-8998cfb69712',
 '4923f3f9-03c4-4fb1-952c-392acbf04ea8',
 '82f165eb-f1ad-413b-abc7-79c08862ae4e',
 '189073af-5714-4873-8c81-a44c81010f61']

### Build RAG

In [48]:
from langchain.chains import create_retrieval_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain

In [49]:
## Convert vector store to retriever
retriever=vectorstore.as_retriever(
    search_kwarg={"k":3} ## Retrieve top 3 relevant chunks
)
retriever

VectorStoreRetriever(tags=['Chroma', 'OpenAIEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x000001BEA546CA30>, search_kwargs={})

In [54]:
from langchain.chat_models.base import init_chat_model

llm=init_chat_model("openai:gpt-3.5-turbo")
#llm=init_chat_model("groq:")
llm

ChatOpenAI(client=<openai.resources.chat.completions.completions.Completions object at 0x000001BEAFE66E20>, async_client=<openai.resources.chat.completions.completions.AsyncCompletions object at 0x000001BEAFE6C430>, root_client=<openai.OpenAI object at 0x000001BEA47ECCD0>, root_async_client=<openai.AsyncOpenAI object at 0x000001BEAFE6C280>, model_kwargs={}, openai_api_key=SecretStr('**********'), stream_usage=True)

In [51]:
system_prompt = """
You are a regulatory compliance assistant specialized in ISO 13485:2016 and Quality Management Systems (QMS).

You must answer strictly based on the retrieved context provided below.

Rules:
- Use ONLY the information found in the context.
- Do NOT add external knowledge.
- Do NOT assume missing information.
- If the answer is not explicitly available in the context, say:
  "The requested information is not available in the provided ISO 13485 context."
- Clearly reference the relevant ISO 13485 clause number in your answer.
- Use professional regulatory and compliance language suitable for regulatory documentation.
- Organize the answer in numbered points (1-, 2-, 3-, etc.).
- When appropriate, group information under clear headings (e.g., Summary, Requirements, Required Documents, Required Records, Responsibilities).
- Provide detailed explanations for each point to enhance understanding, but remain strictly within the context.
- Where possible, elaborate on implications, steps, or conditions described in the context, without introducing external knowledge.
- Highlight any critical compliance obligations, mandatory actions, or key considerations indicated in the context.
- Ensure that the answer is comprehensive, clear, and suitable for inclusion in QMS compliance reports.
- Expand on each relevant point sufficiently to help the reader fully understand the requirement or procedure.

Retrieved Context:
{context}
"""


In [52]:
prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "Question: {input}")
])

In [55]:
### Create a document chain
from langchain.chains.combine_documents import create_stuff_documents_chain
document_chain=create_stuff_documents_chain(llm,prompt)
document_chain

RunnableBinding(bound=RunnableBinding(bound=RunnableAssign(mapper={
  context: RunnableLambda(format_docs)
}), kwargs={}, config={'run_name': 'format_inputs'}, config_factories=[])
| ChatPromptTemplate(input_variables=['context', 'input'], input_types={}, partial_variables={}, messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], input_types={}, partial_variables={}, template='\nYou are a regulatory compliance assistant specialized in ISO 13485:2016 and Quality Management Systems (QMS).\n\nYou must answer strictly based on the retrieved context provided below.\n\nRules:\n- Use ONLY the information found in the context.\n- Do NOT add external knowledge.\n- Do NOT assume missing information.\n- If the answer is not explicitly available in the context, say:\n  "The requested information is not available in the provided ISO 13485 context."\n- Clearly reference the relevant ISO 13485 clause number in your answer.\n- Use professional regulatory and complian

In [56]:
from langchain.chains import create_retrieval_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain

In [57]:
### Create The Final RAG Chain
from langchain.chains import create_retrieval_chain
rag_chain=create_retrieval_chain(retriever,document_chain)
rag_chain

RunnableBinding(bound=RunnableAssign(mapper={
  context: RunnableBinding(bound=RunnableLambda(lambda x: x['input'])
           | VectorStoreRetriever(tags=['Chroma', 'OpenAIEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x000001BEA546CA30>, search_kwargs={}), kwargs={}, config={'run_name': 'retrieve_documents'}, config_factories=[])
})
| RunnableAssign(mapper={
    answer: RunnableBinding(bound=RunnableBinding(bound=RunnableAssign(mapper={
              context: RunnableLambda(format_docs)
            }), kwargs={}, config={'run_name': 'format_inputs'}, config_factories=[])
            | ChatPromptTemplate(input_variables=['context', 'input'], input_types={}, partial_variables={}, messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], input_types={}, partial_variables={}, template='\nYou are a regulatory compliance assistant specialized in ISO 13485:2016 and Quality Management Systems (QMS).\n\nYou must answer stric

In [58]:
response=rag_chain.invoke({"input":"what is requirements of General Quality Management System Requirements?"})

In [59]:
response['answer']

"Based on the retrieved context from ISO 13485:2016, specifically Clause 4.1.1 - General Quality Management System Requirements, the requirements are as follows:\n\n1- **Define the QMS Scope and Perimeter:**\n   - The organization must define the scope and boundaries of its Quality Management System (QMS). This includes identifying the processes, activities, and areas within the organization that fall under the QMS.\n  \n2- **Identify and Document Regulatory Role:**\n   - The organization is required to clearly identify and document its regulatory role, whether it is a manufacturer, importer, distributor, etc. This helps in understanding the specific regulatory responsibilities relevant to the organization's operations.\n  \n3- **Establish and Maintain a Documented QMS:**\n   - The organization must establish and maintain a documented Quality Management System that complies with ISO 13485 standards and any applicable regulatory requirements. This involves creating and implementing docu