In [7]:
# pip install --upgrade langchain langchain-community

In [2]:
# imports

import os
import glob
import chromadb
from sentence_transformers import SentenceTransformer
from docx import Document
import uuid
from dotenv import load_dotenv
from huggingface_hub import login
import numpy as np
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
import chromadb
import plotly.graph_objects as go
import glob

In [3]:
# environment

# Read the local .env file (load_dotenv is called with override=True), then make sure
# both API keys are present in os.environ, falling back to a placeholder string.
load_dotenv(override=True)
for _key_name in ('OPENAI_API_KEY', 'HF_TOKEN'):
    os.environ[_key_name] = os.getenv(_key_name, 'your-key-if-not-using-env')

# Name of the folder used for the persistent vector store.
DB = "agile_process"

In [4]:
# Log in to HuggingFace

# HF_TOKEN is always present in os.environ at this point because the environment
# cell assigns it (possibly to a placeholder value).
hf_token = os.environ['HF_TOKEN']
login(token=hf_token, add_to_git_credential=True)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [5]:
class SentenceTransformerEmbeddings:
    """Small adapter that exposes a SentenceTransformer model through
    ``embed_documents`` / ``embed_query`` methods returning plain Python lists."""

    def __init__(self, model_name: str = 'sentence-transformers/all-MiniLM-L6-v2'):
        # The model is loaded once here and reused by both embed methods.
        self.model = SentenceTransformer(model_name)

    def embed_documents(self, texts: list[str]) -> list[list[float]]:
        """Encode every entry of ``texts`` and return the result converted with ``tolist()``."""
        return self.model.encode(texts).tolist()

    def embed_query(self, text: str) -> list[float]:
        """Encode a single query string and return its vector as a list."""
        # The model is given a one-element batch; only the first row is returned.
        encoded_batch = self.model.encode([text])
        first_row = encoded_batch[0]
        return first_row.tolist()

def load_docx_content(file_path):
    """Return the non-blank paragraphs of a DOCX file joined with newlines.

    Each paragraph's text is stripped of surrounding whitespace and paragraphs
    that are empty after stripping are skipped. Any exception raised while
    opening or reading the file is reported with ``print`` and an empty string
    is returned instead.
    """
    try:
        stripped_lines = (para.text.strip() for para in Document(file_path).paragraphs)
        # The generator is consumed inside the try block so read errors are caught too.
        return '\n'.join(line for line in stripped_lines if line)
    except Exception as e:
        print(f"Error loading {file_path}: {e}")
        return ""

def split_text(text, chunk_size=1000, chunk_overlap=200):
    """Split ``text`` into fixed-size character chunks that overlap.

    Args:
        text: String to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        chunk_overlap: Number of characters shared by consecutive chunks; must
            be smaller than ``chunk_size`` so the window always moves forward.

    Returns:
        List of chunk strings in their original order. Empty for empty
        ``text``. The final chunk is the one whose window reaches the end of
        ``text``; no extra chunk consisting only of already-covered overlap is
        produced after it.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or if ``chunk_overlap``
            is not smaller than ``chunk_size`` (the loop could never advance).
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    step = chunk_size - chunk_overlap
    if step <= 0:
        raise ValueError(
            f"chunk_overlap ({chunk_overlap}) must be smaller than chunk_size ({chunk_size})"
        )

    chunks = []
    text_length = len(text)
    start = 0
    while start < text_length:
        end = start + chunk_size
        chunks.append(text[start:end])
        if end >= text_length:
            # This window already covers the tail; stepping back by the overlap
            # would only re-emit a suffix of the chunk just added.
            break
        start += step
    return chunks

# Load all DOCX files from the Documents folder.
# glob does not promise any particular ordering, so the list is sorted to keep
# chunk order (and therefore batch contents) stable between runs.
files = sorted(glob.glob("Documents/*.docx"))
all_chunks = []
all_metadatas = []
all_ids = []

for file_path in files:
    # The file name without its extension is used as the document type label.
    doc_type = os.path.splitext(os.path.basename(file_path))[0]
    content = load_docx_content(file_path)

    # load_docx_content returns "" on failure or for documents without text.
    if content:
        chunks = split_text(content)
        for i, chunk in enumerate(chunks):
            all_chunks.append(chunk)
            all_metadatas.append({
                "doc_type": doc_type,
                "file_path": file_path,
                "chunk_index": i
            })
            # A random suffix is appended to each id, so ids differ on every run.
            all_ids.append(f"{doc_type}_{i}_{uuid.uuid4().hex[:8]}")

print(f"Total number of chunks: {len(all_chunks)}")
print(f"Document types found: {set(meta['doc_type'] for meta in all_metadatas)}")

Total number of chunks: 17
Document types found: {'Process_Doc'}


In [6]:
# Open the on-disk ChromaDB store
DB_PATH = "agile_process"
client = chromadb.PersistentClient(path=DB_PATH)

# Embedding model wrapper used for indexing here and for querying below
embeddings_model = SentenceTransformerEmbeddings('sentence-transformers/all-MiniLM-L6-v2')

# Start from a clean slate: drop any collection left over under this name
collection_name = "process_docs"
try:
    client.delete_collection(name=collection_name)
except Exception as e:
    print(f"Collection didn't exist or couldn't be deleted: {e}")
else:
    print("Existing collection deleted.")

# Fresh collection configured for cosine distance
collection = client.create_collection(
    name=collection_name,
    metadata={"hnsw:space": "cosine"},
)

# Insert the chunks in fixed-size batches
batch_size = 100
total_batches = (len(all_chunks) + batch_size - 1) // batch_size  # ceiling division
for batch_number, i in enumerate(range(0, len(all_chunks), batch_size), start=1):
    window = slice(i, i + batch_size)
    batch_chunks = all_chunks[window]
    batch_metadatas = all_metadatas[window]
    batch_ids = all_ids[window]

    # Embeddings are computed here and passed in explicitly
    batch_embeddings = embeddings_model.embed_documents(batch_chunks)

    collection.add(
        documents=batch_chunks,
        metadatas=batch_metadatas,
        ids=batch_ids,
        embeddings=batch_embeddings,
    )

    print(f"Added batch {batch_number}/{total_batches}")

print(f"Vectorstore created with {collection.count()} documents")

# Example: Query the collection
def query_documents(query_text, n_results=5):
    """Embed ``query_text`` and query the module-level ``collection`` with it.

    Returns the collection's query response unchanged; documents, metadatas
    and distances are requested for up to ``n_results`` results.
    """
    return collection.query(
        # The collection is given a batch containing this single query vector.
        query_embeddings=[embeddings_model.embed_query(query_text)],
        n_results=n_results,
        include=["documents", "metadatas", "distances"],
    )

# Test query (uncomment to test)
# results = query_documents("your query here")
# print("Query results:", results)

Collection didn't exist or couldn't be deleted: Collection [process_docs] does not exists
Added batch 1/1
Vectorstore created with 17 documents


In [None]:
import gradio as gr
import chromadb
from sentence_transformers import SentenceTransformer
import openai
from typing import List, Dict, Any
import json
import re

class MultiComponentRAG:
    """Retrieval helper that builds an LLM-ready context from a ChromaDB collection.

    A query is embedded, the nearest chunks are fetched from the collection,
    grouped by their ``doc_type`` metadata and formatted into a text context.
    ``generate_response`` currently returns a placeholder answer; the real LLM
    call is left commented out.
    """

    # Words ignored when picking key concepts out of a query.
    _STOP_WORDS = frozenset({'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'is', 'are', 'was', 'were', 'be', 'been', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could', 'should', 'may', 'might', 'can', 'what', 'how', 'when', 'where', 'why', 'who'})

    def __init__(self, db_path: str = "./chroma_db", collection_name: str = "process_docs"):
        """Open an existing collection and load the embedding model.

        Args:
            db_path: Directory of the persistent ChromaDB store.
            collection_name: Name of a collection that already exists in that
                store (it is fetched with ``get_collection``, not created).
        """
        # Initialize ChromaDB
        self.client = chromadb.PersistentClient(path=db_path)
        self.collection = self.client.get_collection(name=collection_name)

        # Initialize embedding model
        self.embeddings_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

        # Initialize OpenAI (replace with your preferred LLM)
        # openai.api_key = "your-api-key-here"

        # Cache for storing retrieved contexts (not read or written by any method yet)
        self.context_cache = {}

    def retrieve_relevant_chunks(self, query: str, n_results: int = 10) -> Dict[str, Any]:
        """Retrieve relevant document chunks with metadata.

        Args:
            query: Text to search for.
            n_results: Maximum number of chunks to return; capped at the number
                of entries stored in the collection.

        Returns:
            Dict with parallel lists under ``"documents"``, ``"metadatas"`` and
            ``"distances"``. All three lists are empty when the collection
            holds no entries.
        """
        available = self.collection.count()
        if available == 0:
            # Nothing is indexed, so there is nothing to query.
            return {"documents": [], "metadatas": [], "distances": []}

        query_embedding = self.embeddings_model.encode([query]).tolist()

        results = self.collection.query(
            query_embeddings=query_embedding,
            # Never ask for more results than the collection contains.
            n_results=min(n_results, available),
            include=["documents", "metadatas", "distances"]
        )

        # A single query was sent, so only the first result row is relevant.
        return {
            "documents": results["documents"][0],
            "metadatas": results["metadatas"][0],
            "distances": results["distances"][0]
        }

    def get_multi_component_context(self, query: str, max_chunks: int = 15) -> str:
        """
        Retrieve and organize multiple document components for complex queries.

        Up to ``max_chunks`` chunks are retrieved, grouped by their ``doc_type``
        metadata, and at most five chunks per document type are written into
        the returned text, most relevant first.
        """
        # Get initial relevant chunks
        results = self.retrieve_relevant_chunks(query, n_results=max_chunks)

        # Group chunks by document type
        doc_groups = {}
        for doc, metadata, distance in zip(results["documents"], results["metadatas"], results["distances"]):
            doc_type = metadata.get("doc_type", "unknown")
            doc_groups.setdefault(doc_type, []).append({
                "content": doc,
                "metadata": metadata,
                "relevance_score": 1 - distance  # Convert distance to similarity
            })

        # Build comprehensive context
        context_parts = ["=== RELEVANT DOCUMENT SECTIONS ===\n"]

        for doc_type, chunks in doc_groups.items():
            context_parts.append(f"\n--- FROM DOCUMENT: {doc_type.upper()} ---")

            # Sort chunks by relevance within each document
            chunks.sort(key=lambda x: x["relevance_score"], reverse=True)

            for i, chunk in enumerate(chunks[:5]):  # Limit chunks per document
                context_parts.append(f"\n[Chunk {i+1} - Relevance: {chunk['relevance_score']:.3f}]")
                context_parts.append(chunk["content"])

        return "\n".join(context_parts)

    def extract_key_concepts(self, query: str) -> List[str]:
        """Extract key concepts from query to perform multi-faceted retrieval.

        Returns up to five distinct lower-cased words from ``query`` that are
        longer than three characters and are not stop words, in order of first
        appearance. Repeated words are returned once so the same concept is
        not retrieved twice.
        """
        # Simple keyword extraction (you could use more sophisticated NLP here)
        words = re.findall(r'\b\w+\b', query.lower())
        candidates = (word for word in words if len(word) > 3 and word not in self._STOP_WORDS)

        # dict.fromkeys removes duplicates while keeping first-seen order.
        return list(dict.fromkeys(candidates))[:5]  # Return top 5 concepts

    def enhanced_retrieval(self, query: str) -> str:
        """
        Enhanced retrieval that looks for multiple components and related concepts.

        Runs one retrieval for the full query and one per extracted key
        concept, then concatenates the labelled contexts into a single string.
        """
        # 1. Direct query retrieval
        contexts = [("Direct Query", self.get_multi_component_context(query, max_chunks=10))]

        # 2. Key concept based retrieval
        for concept in self.extract_key_concepts(query):
            concept_context = self.get_multi_component_context(concept, max_chunks=5)
            contexts.append((f"Concept: {concept}", concept_context))

        # Combine all contexts
        pieces = [
            "\n\n" + "="*50 + "\n",
            "COMPREHENSIVE DOCUMENT CONTEXT\n",
            "="*50 + "\n\n",
        ]
        for context_type, context_content in contexts:
            pieces.append(f"\n{'='*20} {context_type} {'='*20}\n")
            pieces.append(context_content)
            pieces.append("\n")

        return "".join(pieces)

    def generate_response(self, query: str, context: str) -> str:
        """Generate response using LLM with retrieved context.

        The LLM call is currently commented out, so this returns a placeholder
        message that lists the document types found in ``context`` and the
        first 200 characters of it. Any exception raised while building the
        reply is returned as an ``"Error generating response: ..."`` string.
        """

        # Create a comprehensive prompt (only used by the commented-out LLM call below)
        prompt = f"""You are an AI assistant that answers questions based on document content. 
        
Use the provided document context to answer the user's question comprehensively. 
If the question requires information from multiple parts of the documents, make sure to reference and synthesize information from different sections.

DOCUMENT CONTEXT:
{context}

USER QUESTION: {query}

INSTRUCTIONS:
1. Answer based primarily on the provided document context
2. If information spans multiple documents or sections, synthesize them clearly
3. Cite which document types or sections you're referencing
4. If the context doesn't contain sufficient information, say so clearly
5. Provide a comprehensive answer that addresses all aspects of the question

ANSWER:"""

        # For demo purposes, using a simple response
        # Replace this with actual LLM API call
        try:
            # Example with OpenAI (uncomment and configure)
            # response = openai.ChatCompletion.create(
            #     model="gpt-3.5-turbo",
            #     messages=[{"role": "user", "content": prompt}],
            #     max_tokens=1000,
            #     temperature=0.7
            # )
            # return response.choices[0].message.content

            # The regex is evaluated outside the f-string: a backslash inside an
            # f-string expression is a SyntaxError before Python 3.12.
            # The capture runs up to the closing " ---" of each header so that
            # names containing spaces or hyphens are kept whole; sorting makes
            # the listing order deterministic.
            doc_types = ', '.join(sorted(set(re.findall(r'FROM DOCUMENT: (.+?) ---', context))))

            # Placeholder response for demo
            return f"""Based on the retrieved document context, I can see information from multiple document sections:

**Key Points Found:**
- Document types referenced: {doc_types}
- Multiple relevant sections were found across different documents
- The query appears to require synthesis of information from various components

**Answer:** 
[This is where the actual LLM response would appear. The system has retrieved context from multiple document components as requested.]

**Sources Referenced:**
{context[:200]}... (truncated for display)

*Note: Replace this placeholder with actual LLM integration*"""

        except Exception as e:
            return f"Error generating response: {str(e)}"

    def chat_interface(self, message: str, history: List[List[str]]) -> str:
        """Main chat interface for Gradio.

        Retrieves context for ``message`` and returns the generated reply.
        ``history`` is accepted for the chat callback signature but is not
        used. Exceptions are returned as an ``"Error processing query: ..."``
        string rather than raised.
        """
        try:
            # Enhanced retrieval for multi-component queries
            context = self.enhanced_retrieval(message)

            # Generate response
            return self.generate_response(message, context)

        except Exception as e:
            return f"Error processing query: {str(e)}"

# Initialize RAG system against the store and collection populated by the indexing cell.
# The class default db_path ("./chroma_db") is a different directory from DB_PATH, and the
# collection is fetched with get_collection (not created), so the default would not find it.
rag_system = MultiComponentRAG(db_path=DB_PATH, collection_name=collection_name)

# Create Gradio interface
def create_gradio_interface():
    """Build the Gradio Blocks chat UI around the module-level ``rag_system``.

    Returns:
        The configured ``gr.Blocks`` instance; the caller is responsible for
        launching it.
    """
    with gr.Blocks(title="Multi-Component Document RAG System", theme=gr.themes.Soft()) as demo:
        gr.Markdown("""
        # 🤖 Multi-Component Document RAG System
        
        Ask questions that may require referencing multiple parts of your document database.
        The system will intelligently retrieve and synthesize information from various document components.
        
        **Examples of complex queries:**
        - "Compare the methodology sections across different documents"
        - "What are the common themes mentioned in all documents?"
        - "Summarize the key findings from multiple research papers"
        """)

        chatbot = gr.Chatbot(
            height=500,
            show_label=False,
            container=True,
            show_copy_button=True
        )

        msg = gr.Textbox(
            placeholder="Ask a question that might require multiple document components...",
            label="Your Question",
            lines=2
        )

        with gr.Row():
            submit_btn = gr.Button("Submit", variant="primary")
            clear_btn = gr.Button("Clear Chat", variant="secondary")

        # Advanced options
        with gr.Accordion("Advanced Options", open=False):
            max_chunks = gr.Slider(
                minimum=5,
                maximum=25,
                value=15,
                step=1,
                label="Maximum chunks to retrieve",
                info="Higher values provide more context but may be slower"
            )

            show_context = gr.Checkbox(
                label="Show retrieved context",
                value=False,
                info="Display the document context used for generating the response"
            )

        def respond(message, history, max_chunks_val, show_context_val):
            """Answer one chat message.

            Returns the updated history and an empty string that clears the
            input textbox.
            """
            history = history or []
            chunk_limit = int(max_chunks_val)

            # Remember how the method is currently provided so it can be put back
            # exactly; otherwise every message would wrap the previous wrapper.
            previous_method = rag_system.get_multi_component_context
            had_instance_override = "get_multi_component_context" in vars(rag_system)

            def slider_limited(query, max_chunks=chunk_limit):
                # enhanced_retrieval always passes its own max_chunks, so the
                # slider value has to replace that argument, not act as a default.
                return previous_method(query, chunk_limit)

            rag_system.get_multi_component_context = slider_limited
            try:
                # Retrieve once and reuse the context for both the reply and the
                # optional context display.
                context = rag_system.enhanced_retrieval(message)
                bot_message = rag_system.generate_response(message, context)

                # Add context if requested
                if show_context_val:
                    bot_message += f"\n\n--- RETRIEVED CONTEXT ---\n{context[:1000]}..."
            except Exception as e:
                bot_message = f"Error processing query: {str(e)}"
            finally:
                # Undo the temporary override, including on errors.
                if had_instance_override:
                    rag_system.get_multi_component_context = previous_method
                else:
                    del rag_system.get_multi_component_context

            history.append([message, bot_message])
            return history, ""

        def clear_chat():
            """Reset the chat history and the input textbox."""
            return [], ""

        # Event handlers: the button and pressing Enter in the textbox behave the same
        submit_btn.click(
            respond,
            inputs=[msg, chatbot, max_chunks, show_context],
            outputs=[chatbot, msg]
        )

        msg.submit(
            respond,
            inputs=[msg, chatbot, max_chunks, show_context],
            outputs=[chatbot, msg]
        )

        clear_btn.click(clear_chat, outputs=[chatbot, msg])

        # Example queries
        gr.Examples(
            examples=[
                "What are the main topics covered across all documents?",
                "Compare the methodologies mentioned in different documents",
                "Summarize the key findings from the research papers",
                "What recommendations are made in the policy documents?",
                "Are there any contradictions between different document sections?"
            ],
            inputs=msg
        )

    return demo

# Launch the interface
if __name__ == "__main__":
    demo = create_gradio_interface()
    # Launch settings collected in one place; NOTE(review): server_name "0.0.0.0"
    # presumably makes the app reachable from other machines - confirm that is intended.
    launch_options = dict(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,  # Set to True if you want a public link
        debug=True,
    )
    demo.launch(**launch_options)