In [1]:
# Simple Q&A Bot using LangChain and ChromaDB
# This notebook provides a streamlined interface for querying insurance documents

import chromadb
from langchain_chroma import Chroma
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import OpenAIEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI
from dotenv import load_dotenv
import os
import json

# Load environment variables
load_dotenv()

print("🚀 Initializing Q&A Bot...")


🚀 Initializing Q&A Bot...


In [2]:
# Initialize Components
embedder = OpenAIEmbeddings(
    model="text-embedding-3-small", 
    api_key=os.getenv("OPENAI_API_KEY")
)

llm = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash", 
    temperature=0, 
    google_api_key=os.getenv("GOOGLE_API_KEY")
)

# Connect to existing ChromaDB
client = chromadb.PersistentClient(path="chromadb")
collections = [col.name for col in client.list_collections()]

print(f"📚 Available document collections: {collections}")


📚 Available document collections: ['TAL_AcceleratedProtection_2022-08-05', 'TALR7983-0923-accelerated-protection-pds-8-sep-2023']


In [5]:
# client.delete_collection(name="TALR7983-0923-accelerated-protection-pds-8-sep-2023_l") # Delete a collection and all associated embeddings, documents, and metadata. ⚠️ This is destructive and not reversible


In [3]:
query = "What happens if policyholder die while covered under the policy?"
embeddings = embedder.embed_query(query)


collection_2022 = client.get_collection(name="TALR7983-0923-accelerated-protection-pds-8-sep-2023")

collection_2022.query(query_embeddings=embeddings, n_results=10
                    )


{'ids': [['TALR7983-0923-accelerated-protection-pds-8-sep-2023l-chunk-15',
   'TALR7983-0923-accelerated-protection-pds-8-sep-2023l-chunk-82',
   'TALR7983-0923-accelerated-protection-pds-8-sep-2023l-chunk-171',
   'TALR7983-0923-accelerated-protection-pds-8-sep-2023l-chunk-81',
   'TALR7983-0923-accelerated-protection-pds-8-sep-2023l-chunk-156',
   'TALR7983-0923-accelerated-protection-pds-8-sep-2023l-chunk-338',
   'TALR7983-0923-accelerated-protection-pds-8-sep-2023l-chunk-126',
   'TALR7983-0923-accelerated-protection-pds-8-sep-2023l-chunk-203',
   'TALR7983-0923-accelerated-protection-pds-8-sep-2023l-chunk-104',
   'TALR7983-0923-accelerated-protection-pds-8-sep-2023l-chunk-86']],
 'embeddings': None,
 'documents': [['If the Policy is structured outside superannuation and you have validly nominated one or more beneficiaries to receive a benefit under Life Insurance, we will pay the benefit in accordance with your nomination. Otherwise, all payments made by us under the Policy will

In [18]:
vectorstore = Chroma(
    client=client,
    collection_name="TAL_AcceleratedProtection_2022-08-05",
    embedding_function=embedder
)


In [17]:
# search type = mmr 
retriever = vectorstore.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 5, "lambda_mult": 0.8}
        )

results = retriever.invoke(query)

results

[Document(id='TAL_AcceleratedProtection_2022-08-05l-chunk-108', metadata={'product_type': 'Life', 'document_name': 'Accelerated Protection Combined Product Disclosure Statement and Policy Document', 'content_label': 'text', 'source_file': 'TAL_AcceleratedProtection_2022-08-05_chunks.jsonl', 'subheading': 'N/A', 'document_date': '2022-08-05', 'page_no': 39, 'chunk_id': '108', 'insurer': 'TAL', 'section_title': '2.3 Critical Illness Insurance'}, page_content="Critical Illness Insurance only applies if indicated in the Policy Schedule. Critical Illness Insurance is available as 'Standard' or Premier'. The type of Critical Illness Insurance and any applicable options is stated in your Policy Schedule. Critical Illness Insurance cannot be structured through superannuation.\nA benefit under Critical Illness Insurance will only be paid if the conditions and requirements for a claimable event are met after the Plan start date but before the Plan end date and the Life Insured suffers a specifie

In [20]:
# search type = similarity_search
# search type = mmr 
retriever = vectorstore.similarity_search_with_score(query, k=5)

retriever

[(Document(id='TAL_AcceleratedProtection_2022-08-05l-chunk-108', metadata={'product_type': 'Life', 'document_name': 'Accelerated Protection Combined Product Disclosure Statement and Policy Document', 'insurer': 'TAL', 'subheading': 'N/A', 'chunk_id': '108', 'content_label': 'text', 'page_no': 39, 'source_file': 'TAL_AcceleratedProtection_2022-08-05_chunks.jsonl', 'document_date': '2022-08-05', 'section_title': '2.3 Critical Illness Insurance'}, page_content="Critical Illness Insurance only applies if indicated in the Policy Schedule. Critical Illness Insurance is available as 'Standard' or Premier'. The type of Critical Illness Insurance and any applicable options is stated in your Policy Schedule. Critical Illness Insurance cannot be structured through superannuation.\nA benefit under Critical Illness Insurance will only be paid if the conditions and requirements for a claimable event are met after the Plan start date but before the Plan end date and the Life Insured suffers a specifi

In [36]:
"""Get similarity score for a specific chunk against a query or all other chunks."""

collection_2022 = client.get_collection(name="TAL_AcceleratedProtection_2022-08-05_l")

chunk_id = "TAL_AcceleratedProtection_2022-08-05_l-chunk-112"
target_query = "What are the critical illness conditions covered?"


# Get the specific chunk
chunk_data = collection_2022.get(ids=[chunk_id])
print(chunk_data)

{'ids': [], 'embeddings': None, 'documents': [], 'uris': None, 'included': ['metadatas', 'documents'], 'data': None, 'metadatas': []}


In [6]:
class SimpleQABot:
    def __init__(self, client, embedder, llm, collections):
        self.client = client
        self.embedder = embedder
        self.llm = llm
        self.collections = collections
        
        # Create vectorstores for each collection
        self.vectorstores = {}
        for collection_name in collections:
            self.vectorstores[collection_name] = Chroma(
                client=client,
                collection_name=collection_name,
                embedding_function=embedder
            )
        
        # Define the prompt template
        self.prompt_template = ChatPromptTemplate.from_template("""
You are an expert insurance claims analyst. Use the provided document excerpts to answer the user's question accurately and comprehensively.

Document Sources:
{context}

Question: {question}

Instructions:
1. Provide a clear, accurate answer based on the document excerpts
2. Include specific references to source documents when possible
3. If information spans multiple documents, clearly distinguish between sources
4. If the answer cannot be found in the provided sources, state this clearly
5. Use bullet points for complex information when appropriate

Answer:
""")

    def search_documents(self, query, collection_name=None, k=10):
        """Search documents in one or all collections"""
        results = []
        
        if collection_name and collection_name in self.vectorstores:
            # Search specific collection
            retriever = self.vectorstores[collection_name].as_retriever(
                search_type="mmr",
                search_kwargs={"k": k, "lambda_mult": 0.8}
            )
            docs = retriever.invoke(query)
            for doc in docs:
                results.append({
                    "content": doc.page_content,
                    "metadata": doc.metadata,
                    "collection": collection_name
                })
        else:
            # Search all collections
            for coll_name, vectorstore in self.vectorstores.items():
                retriever = vectorstore.as_retriever(
                    search_type="mmr",
                    search_kwargs={"k": k//len(self.vectorstores) + 1, "lambda_mult": 0.8}
                )
                docs = retriever.invoke(query)
                for doc in docs:
                    results.append({
                        "content": doc.page_content,
                        "metadata": doc.metadata,
                        "collection": coll_name
                    })
        
        return results
    
    def format_context(self, search_results):
        """Format search results into context string"""
        context_parts = []
        for i, result in enumerate(search_results, 1):
            metadata = result["metadata"]
            context_part = f"""
Document {i} [Collection: {result["collection"]}]:
Source: {metadata.get("source_file", "Unknown")}
Page: {metadata.get("page_no", "Unknown")}
Section: {metadata.get("section_title", "Unknown")}
Content: {result["content"]}
---
"""
            context_parts.append(context_part)
        return "\n".join(context_parts)
    
    def print_documents(self, search_results):
        """Print the complete retrieved documents"""
        print("\n" + "="*80)
        print("📄 RETRIEVED DOCUMENTS")
        print("="*80)
        
        for i, result in enumerate(search_results, 1):
            metadata = result["metadata"]
            print(f"\n📑 Document {i}")
            print(f"Collection: {result['collection']}")
            print(f"Source File: {metadata.get('source_file', 'Unknown')}")
            print(f"Page: {metadata.get('page_no', 'Unknown')}")
            print(f"Section: {metadata.get('section_title', 'Unknown')}")
            if 'subheading' in metadata:
                print(f"Subheading: {metadata.get('subheading', 'Unknown')}")
            if 'bbox_left' in metadata:
                print(f"Position: ({metadata.get('bbox_left', '')}, {metadata.get('bbox_top', '')})")
            print(f"\nContent:")
            print("-" * 60)
            print(result["content"])
            print("-" * 60)
        
        print("\n" + "="*80)

    def ask(self, question, collection_name=None, k=10, show_documents=False):
        """Ask a question and get an answer"""
        print(f"🔍 Searching for: {question}")
        
        # Search documents
        search_results = self.search_documents(question, collection_name, k)
        
        if not search_results:
            return "No relevant documents found for your question."
        
        # Print documents if requested
        if show_documents:
            self.print_documents(search_results)
        
        # Format context
        context = self.format_context(search_results)
        
        # Generate answer
        chain = self.prompt_template | self.llm
        response = chain.invoke({
            "context": context,
            "question": question
        })
        
        return response.content

# Initialize the bot
qa_bot = SimpleQABot(client, embedder, llm, collections)
print("✅ Q&A Bot initialized successfully!")


✅ Q&A Bot initialized successfully!


In [4]:
# Example Questions to Try
example_questions = [
    "What are the critical illness conditions covered?",
    "What are the exclusions for life insurance?", 
    "How does TPD insurance work?",
    "What is the maximum benefit amount for life insurance?",
    "What are the premium types available?",
    "When does the policy end?",
    "What is the waiting period for income protection?",
    "What conditions are not covered?"
]

print("💡 Example questions you can ask:")
for i, question in enumerate(example_questions, 1):
    print(f"{i}. {question}")


💡 Example questions you can ask:
1. What are the critical illness conditions covered in TAL 2023?


In [8]:
# Test the Q&A Bot with a sample question
question = "What are the critical illness conditions covered in TAL 2023?"
answer = qa_bot.ask(question, show_documents = True)
print(f"\n❓ Question: {question}")
print(f"\n💬 Answer:\n{answer}")


🔍 Searching for: What are the critical illness conditions covered in TAL 2023?

📄 RETRIEVED DOCUMENTS

📑 Document 1
Collection: TAL_AcceleratedProtection_2022-08-05
Source File: TAL_AcceleratedProtection_2022-08-05_chunks.jsonl
Page: 9
Section: Introducing Accelerated Protection
Subheading: N/A

Content:
------------------------------------------------------------
Accelerated Protection.

Please read the PDS and the Policy Schedule carefully to ensure the terms and conditions meet your needs. These are important documents and should be kept in a safe place.

If the Policy is altered at any time you will receive a new Policy Schedule or confirmation reflecting the agreed changes.

Where cover being applied for with TAL is to replace existing cover with either TAL or another life insurance company, you must cancel the existing cover. No claim will be paid in respect of this Policy unless the previous cover has been cancelled. If the previous cover is not cancelled and a claim occurs, any

In [None]:
# 测试显示完整文档内容
print("🔍 Testing document retrieval with full document display:")
print("=" * 60)

question = "What are the exclusions for life insurance?"
answer = qa_bot.ask(question, show_documents=True, k=5)

print(f"\n❓ Question: {question}")
print(f"\n💬 Answer:\n{answer}")


In [None]:
# 只查看documents，不生成答案的功能
def show_documents_only(question, collection_name=None, k=10):
    """只显示检索到的文档，不生成AI答案"""
    print(f"🔍 Retrieving documents for: {question}")
    
    # Search documents
    search_results = qa_bot.search_documents(question, collection_name, k)
    
    if not search_results:
        print("No relevant documents found for your question.")
        return
    
    # Print all documents
    qa_bot.print_documents(search_results)
    print(f"\n📊 Found {len(search_results)} relevant documents")

# 测试只显示文档功能
print("\\n测试只显示文档功能：")
show_documents_only("critical illness insurance", k=3)


In [None]:
# Interactive Q&A Function
def interactive_qa():
    """Interactive Q&A session"""
    print("🤖 Welcome to the Insurance Q&A Bot!")
    print("Ask me anything about your insurance documents.")
    print("Type 'quit' to exit, 'examples' to see example questions.")
    print("You can specify a collection by typing 'collection:[name]' before your question.")
    print(f"Available collections: {collections}")
    print("-" * 50)
    
    while True:
        user_input = input("\n❓ Your question: ").strip()
        
        if user_input.lower() in ['quit', 'exit', 'q']:
            print("👋 Goodbye!")
            break
        elif user_input.lower() == 'examples':
            print("\n💡 Example questions:")
            for i, q in enumerate(example_questions, 1):
                print(f"{i}. {q}")
            continue
        elif not user_input:
            continue
        
        # Check if user specified a collection
        collection_name = None
        if user_input.startswith('collection:'):
            parts = user_input.split(':', 1)
            if len(parts) == 2:
                collection_name = parts[0].replace('collection', '').strip()
                user_input = parts[1].strip()
        
        try:
            answer = qa_bot.ask(user_input, collection_name)
            print(f"\n💬 Answer:\n{answer}")
        except Exception as e:
            print(f"❌ Error: {str(e)}")

# Uncomment the line below to start interactive mode
# interactive_qa()


In [None]:
# 使用说明和示例
print("""
📖 显示完整文档功能使用说明
=======================================

现在你可以通过以下几种方式查看完整的检索文档：

1. 在询问问题时显示文档：
   answer = qa_bot.ask("问题", show_documents=True)

2. 只显示文档，不生成AI答案：
   show_documents_only("问题")

3. 命令行使用：
   python simple_qa_bot.py -q "问题" --show-docs    # 显示文档+答案
   python simple_qa_bot.py -q "问题" --docs-only    # 只显示文档

4. 交互模式中：
   docs: 问题                    # 只显示文档
   with-docs: 问题              # 显示文档+答案

📄 显示的文档信息包括：
- Collection name (数据集名称)
- Source file (源文件)
- Page number (页码)
- Section title (章节标题)
- Subheading (子标题，如果有)
- Position (文档位置坐标，如果有)
- Complete content (完整内容)

💡 这样你就可以：
✓ 验证AI回答的准确性
✓ 查看更多上下文信息
✓ 了解信息来源的具体位置
✓ 手动检查检索质量
""")


In [None]:
# Batch Query Function - Test multiple questions at once
def batch_query(questions_list):
    """Run multiple queries and display results"""
    print("🔍 Running batch queries...")
    print("=" * 60)
    
    for i, question in enumerate(questions_list, 1):
        print(f"\n📋 Query {i}: {question}")
        try:
            answer = qa_bot.ask(question, k=5)  # Reduced k for faster processing
            print(f"💬 Answer: {answer[:500]}{'...' if len(answer) > 500 else ''}")
        except Exception as e:
            print(f"❌ Error: {str(e)}")
        print("-" * 40)

# Test with first 3 example questions
sample_questions = example_questions[:3]
batch_query(sample_questions)


In [None]:
# Advanced Features
class AdvancedQABot(SimpleQABot):
    """Extended Q&A Bot with additional features"""
    
    def analyze_collections(self):
        """Analyze the content of each collection"""
        print("📊 Collection Analysis:")
        print("=" * 50)
        
        for collection_name in self.collections:
            collection = self.client.get_collection(collection_name)
            count = collection.count()
            
            # Get a sample document to understand structure
            sample = collection.peek(limit=1)
            if sample['documents']:
                sample_metadata = sample['metadatas'][0] if sample['metadatas'] else {}
                print(f"\n📚 Collection: {collection_name}")
                print(f"   Documents: {count}")
                print(f"   Sample metadata keys: {list(sample_metadata.keys())}")
                
                # Show unique values for key metadata fields
                if 'source_file' in sample_metadata:
                    print(f"   Sample source: {sample_metadata.get('source_file', 'N/A')}")
        
        print("\n" + "=" * 50)
    
    def search_with_filters(self, query, source_file=None, page_range=None, k=10):
        """Search with metadata filters"""
        results = []
        
        for coll_name, vectorstore in self.vectorstores.items():
            # Build filter conditions
            filter_conditions = {}
            if source_file:
                filter_conditions["source_file"] = {"$eq": source_file}
            if page_range:
                filter_conditions["page_no"] = {
                    "$gte": page_range[0], 
                    "$lte": page_range[1]
                }
            
            # Use filtered search if filters are provided
            if filter_conditions:
                retriever = vectorstore.as_retriever(
                    search_type="mmr",
                    search_kwargs={
                        "k": k//len(self.vectorstores) + 1, 
                        "lambda_mult": 0.8,
                        "filter": filter_conditions
                    }
                )
            else:
                retriever = vectorstore.as_retriever(
                    search_type="mmr",
                    search_kwargs={"k": k//len(self.vectorstores) + 1, "lambda_mult": 0.8}
                )
            
            docs = retriever.invoke(query)
            for doc in docs:
                results.append({
                    "content": doc.page_content,
                    "metadata": doc.metadata,
                    "collection": coll_name
                })
        
        return results
    
    def ask_with_filters(self, question, source_file=None, page_range=None, k=10):
        """Ask a question with optional filters"""
        print(f"🔍 Searching: {question}")
        if source_file:
            print(f"   📄 Source filter: {source_file}")
        if page_range:
            print(f"   📖 Page range: {page_range[0]}-{page_range[1]}")
        
        # Search with filters
        search_results = self.search_with_filters(question, source_file, page_range, k)
        
        if not search_results:
            return "No relevant documents found with the specified filters."
        
        # Format context and generate answer
        context = self.format_context(search_results)
        chain = self.prompt_template | self.llm
        response = chain.invoke({
            "context": context,
            "question": question
        })
        
        return response.content

# Create advanced bot
advanced_bot = AdvancedQABot(client, embedder, llm, collections)
print("🔧 Advanced Q&A Bot created!")


In [None]:
# Analyze your collections
advanced_bot.analyze_collections()


In [None]:
# Quick Start Guide
print("""
🚀 QUICK START GUIDE
==================

1. Basic Usage:
   answer = qa_bot.ask("Your question here")

2. Search specific collection:
   answer = qa_bot.ask("Question", collection_name="your_collection")

3. Interactive mode:
   interactive_qa()

4. Advanced filtering:
   answer = advanced_bot.ask_with_filters(
       "Question", 
       source_file="specific_file.pdf",
       page_range=(10, 20)
   )

5. Batch processing:
   batch_query(["Question 1", "Question 2", "Question 3"])

6. Analyze collections:
   advanced_bot.analyze_collections()

📝 Example Questions:
- "What are the exclusions for life insurance?"
- "How does the claims process work?"
- "What is the maximum benefit amount?"
- "What waiting periods apply?"

💡 Pro Tips:
- Be specific in your questions for better results
- Use natural language - the bot understands context
- Try different phrasings if you don't get the expected result
- Check available collections first: print(collections)
""")
