In [1]:
# Install required packages
%pip install langchain
%pip install langchain_community
%pip install unstructured
%pip install langchain_openai
%pip install langchain_groq
%pip install langchain_pinecone
%pip install python-magic-bin
%pip install python-dotenv
%pip install rank_bm25

import os
import json
import tiktoken
from typing import List, Dict, Any
from dotenv import load_dotenv
from langchain_community.document_loaders import DirectoryLoader
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_core.documents import Document
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_groq import ChatGroq
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone, ServerlessSpec


Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


  from .autonotebook import tqdm as notebook_tqdm

For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  from langchain_pinecone.vectorstores import Pinecone, PineconeVectorStore


In [2]:
load_dotenv()

# Initialize tokenizer to count tokens
tokenizer = tiktoken.get_encoding("cl100k_base")

def count_tokens(text):
    """Count tokens in text using tiktoken"""
    return len(tokenizer.encode(text))

In [3]:
# ==========================================
# STEP 1: Load Documents
# ==========================================

# Load all text files from directory
dir_loader = DirectoryLoader(
    "Ordinance",
    glob="**/*.txt",  
    loader_kwargs={'encoding': 'utf-8'},
    show_progress=True
)

documents = dir_loader.load() 

print(f"Loaded {len(documents)} documents")
for i, doc in enumerate(documents):
    print(f"\nDocument {i+1}:")
    print(f"  Source: {doc.metadata['source']}")
    print(f"  Length: {len(doc.page_content)} characters")
    print(f"  Tokens: {count_tokens(doc.page_content)}")

  0%|          | 0/11 [00:00<?, ?it/s]

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 11/11 [00:06<00:00,  1.77it/s]

Loaded 11 documents

Document 1:
  Source: Ordinance\08. Finance Ordinance, 2025 (02 June 2025)_split_1.txt
  Length: 18522 characters
  Tokens: 19893

Document 2:
  Source: Ordinance\08. Finance Ordinance, 2025 (02 June 2025)_split_10.txt
  Length: 18021 characters
  Tokens: 20418

Document 3:
  Source: Ordinance\08. Finance Ordinance, 2025 (02 June 2025)_split_11.txt
  Length: 23953 characters
  Tokens: 24708

Document 4:
  Source: Ordinance\08. Finance Ordinance, 2025 (02 June 2025)_split_2.txt
  Length: 16232 characters
  Tokens: 12252

Document 5:
  Source: Ordinance\08. Finance Ordinance, 2025 (02 June 2025)_split_3.txt
  Length: 20885 characters
  Tokens: 16047

Document 6:
  Source: Ordinance\08. Finance Ordinance, 2025 (02 June 2025)_split_4.txt
  Length: 21150 characters
  Tokens: 16123

Document 7:
  Source: Ordinance\08. Finance Ordinance, 2025 (02 June 2025)_split_5.txt
  Length: 17769 characters
  Tokens: 16421

Document 8:
  Source: Ordinance\08. Finance Ordinance, 2025 




In [4]:
# ==========================================
# STEP 2: Setup OpenAI Embeddings & LLM for Chunking
# ==========================================

# Initialize OpenAI embeddings
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-large",
    api_key=os.getenv("OPENAI_API_KEY")
)

# Initialize LLM for chunking (using OpenAI for better instruction following)
chunking_llm = ChatOpenAI(
    api_key=os.getenv("OPENAI_API_KEY"),
    model_name="gpt-4.1",  # Use GPT-4 for better chunking quality
    temperature=0.5,
    max_tokens=None
)

print("OpenAI Embedding Model and Chunking LLM loaded successfully!")

# Test embeddings
query_result = embeddings.embed_query("Hello world")
print("Embedding dimension:", len(query_result))

OpenAI Embedding Model and Chunking LLM loaded successfully!
Embedding dimension: 3072


In [5]:
# ==========================================
# STEP 3: LLM-Based Intelligent Chunking (FIXED)
# ==========================================

# Create the system prompt for LLM-based chunking (FIXED - Escaped curly braces)
chunking_system_prompt = """
You are a Legal Document Structuring Agent for Bangladeshi laws. 
Your task is to parse and chunk the Finance Ordinance, 2025 (and its subsequent Amendments), 
producing metadata-rich structured outputs for a Retrieval-Augmented Generation (RAG) system.

STRICT INSTRUCTIONS:
- Maintain ZERO LOSS POLICY: full original text must be preserved in `page_content`.
- Chunk at the lowest stable unit: Section / Subsection / Clause. 
- Schedules and HS Code Tables must be captured as **single chunks per table**, not row by row.
- For Amendments: 
  - Always include metadata field `"amends": "<target section/schedule/table>"`.
  - Preserve `"version": "amendment"` and link it to `"parent_version": "Finance Ordinance, 2025 (02 June 2025)"`.

OUTPUT FORMAT (per chunk):
    {{
    "chunks": [
        {{
        "content":"The full text content of the chunk including relevant headers",
        "metadata": {{
        "doc_type": "ORDINANCE",
        "span_unit": "section",
        "span_title": "‡¶∏‡¶Ç‡¶ï‡ßç‡¶∑‡¶ø‡¶™‡ßç‡¶§ ‡¶∂‡¶ø‡¶∞‡ßã‡¶®‡¶æ‡¶Æ ‡¶ì ‡¶™‡ßç‡¶∞‡¶¨‡¶∞‡ßç‡¶§‡¶®",
        "ordinance_name": "Finance Ordinance, 2025",
        "ordinance_year": "2025",
        "language": "bn+en",
        "is_amendment": false,
        "ordinance_numbers": ["‡¶Ö‡¶ß‡ßç‡¶Ø‡¶æ‡¶¶‡ßá‡¶∂ ‡¶®‡¶Ç ‡ß®‡ßÆ, ‡ß®‡ß¶‡ß®‡ß´"],
        "effective_date": "2025-07-01",
        "law_refs": ["VAT Act, 2012 (Law No. 47 of 2012)", "Customs Act, 2023"],
        "section_refs": ["‡¶ß‡¶æ‡¶∞‡¶æ ‡ßß", "Section 1"],
        "schedule_refs": [],
        "span_has_table": false,
        "table_ids": [],
        "table_linked_section_numbers": [],
        "hs_headings": [],
        "hs_codes": [],
        "keywords": ["‡¶∏‡¶Ç‡¶ï‡ßç‡¶∑‡¶ø‡¶™‡ßç‡¶§ ‡¶∂‡¶ø‡¶∞‡ßã‡¶®‡¶æ‡¶Æ", "‡¶™‡ßç‡¶∞‡¶¨‡¶∞‡ßç‡¶§‡¶®", "effective date", "‡ßß ‡¶ú‡ßÅ‡¶≤‡¶æ‡¶á ‡ß®‡ß¶‡ß®‡ß´", "Ordinance"]
        }}
        
      }}
    ]
}}
ADDITIONAL RULES:
- Capture both Bangla and English keywords (e.g., ‚Äú‡¶Æ‡ßÇ‡¶≤‡ßç‡¶Ø ‡¶∏‡¶Ç‡¶Ø‡ßã‡¶ú‡¶® ‡¶ï‡¶∞‚Äù, ‚ÄúValue Added Tax‚Äù, ‚ÄúSupplementary Duty‚Äù, ‚ÄúHS Code‚Äù, ‚ÄúERP software‚Äù).
- Ensure cross-references (e.g., Customs Act 2023, VAT Act 2012) are stored in `"keywords_en"` and `"keywords_bn"`.
- Always keep `schedule/table` chunks intact, never split rows.
- Preserve legal hierarchy faithfully.
- All amendments must clearly indicate the target ordinance, section, or schedule.

"""


def llm_chunk_document(document: Document, max_retries: int = 2) -> List[Document]:
    """
    Use LLM to intelligently chunk a legal document
    """
    print(f"\nProcessing document: {document.metadata.get('source', 'Unknown')}")
    
    # Create the prompt
    prompt = ChatPromptTemplate.from_messages([
        ("system", chunking_system_prompt),
        ("human", "Document to chunk:\n\n{document_text}")
    ])
    
    # Chain LLM with prompt
    chunking_chain = prompt | chunking_llm
    
    for attempt in range(max_retries + 1):
        try:
            print(f"  Attempt {attempt + 1} - Sending to LLM for chunking...")
            
            # Get LLM response
            response = chunking_chain.invoke({
                "document_text": document.page_content
            })
            
            # Parse JSON response
            response_text = response.content.strip()
            
            # Clean up the response (remove markdown formatting if present)
            if response_text.startswith("```json"):
                response_text = response_text[7:]
            if response_text.endswith("```"):
                response_text = response_text[:-3]
            
            # Parse JSON
            chunks_data = json.loads(response_text)
            
            # Create Document objects
            chunk_documents = []
            for i, chunk_info in enumerate(chunks_data.get("chunks", [])):
                # Validate chunk size
                chunk_content = chunk_info.get("content", "")
                chunk_tokens = count_tokens(chunk_content)
                
                if chunk_tokens > 3000:
                    print(f"    Warning: Chunk {i+1} is {chunk_tokens} tokens (>3000)")
                
                # Create metadata
                chunk_metadata = document.metadata.copy()
                chunk_metadata.update(chunk_info.get("metadata", {}))
                chunk_metadata["chunk_index"] = i
                chunk_metadata["total_chunks"] = len(chunks_data.get("chunks", []))
                chunk_metadata["chunk_tokens"] = chunk_tokens
                
                # Create Document
                chunk_doc = Document(
                    page_content=chunk_content,
                    metadata=chunk_metadata
                )
                chunk_documents.append(chunk_doc)
            
            print(f"  ‚úÖ Successfully created {len(chunk_documents)} chunks")
            
            # Print chunk statistics
            for i, chunk in enumerate(chunk_documents):
                tokens = chunk.metadata.get("chunk_tokens", 0)
                chunk_type = chunk.metadata.get("chunk_type", "unknown")
                print(f"    Chunk {i+1}: {tokens} tokens, type: {chunk_type}")
            
            return chunk_documents
            
        except json.JSONDecodeError as e:
            print(f"    ‚ùå JSON parsing error on attempt {attempt + 1}: {e}")
            if attempt == max_retries:
                print(f"    ‚ùå All attempts failed - skipping document")
                return []
            
        except Exception as e:
            print(f"    ‚ùå Error on attempt {attempt + 1}: {e}")
            if attempt == max_retries:
                print(f"    ‚ùå All attempts failed - skipping document")
                return []
    
    return []

def process_all_documents_with_llm(documents: List[Document]) -> List[Document]:
    """
    Process all documents using LLM-based chunking
    """
    all_chunks = []
    
    print(f"\nüöÄ Starting LLM-based chunking for {len(documents)} documents...")
    
    for i, doc in enumerate(documents):
        print(f"\n--- Processing Document {i+1}/{len(documents)} ---")
        
        # Check document size
        doc_tokens = count_tokens(doc.page_content)
        print(f"Document tokens: {doc_tokens}")
        
        if doc_tokens < 100:
            print("  ‚ö†Ô∏è  Document too small, skipping...")
            continue
            
        # Process with LLM
        doc_chunks = llm_chunk_document(doc)
        all_chunks.extend(doc_chunks)
        
        print(f"  üìä Total chunks so far: {len(all_chunks)}")
    
    return all_chunks

# Process documents with LLM-based chunking
print("\nü§ñ Starting LLM-based intelligent chunking...")
chunks = process_all_documents_with_llm(documents)

print(f"\n‚úÖ LLM Chunking Complete!")
print(f"üìä Total chunks created: {len(chunks)}")
print(f"üìù Sample chunk metadata: {chunks[0].metadata if chunks else 'No chunks'}")


ü§ñ Starting LLM-based intelligent chunking...

üöÄ Starting LLM-based chunking for 11 documents...

--- Processing Document 1/11 ---
Document tokens: 19893

Processing document: Ordinance\08. Finance Ordinance, 2025 (02 June 2025)_split_1.txt
  Attempt 1 - Sending to LLM for chunking...
  ‚úÖ Successfully created 3 chunks
    Chunk 1: 463 tokens, type: unknown
    Chunk 2: 1997 tokens, type: unknown
    Chunk 3: 1924 tokens, type: unknown
  üìä Total chunks so far: 3

--- Processing Document 2/11 ---
Document tokens: 20418

Processing document: Ordinance\08. Finance Ordinance, 2025 (02 June 2025)_split_10.txt
  Attempt 1 - Sending to LLM for chunking...
  ‚úÖ Successfully created 5 chunks
    Chunk 1: 908 tokens, type: unknown
    Chunk 2: 759 tokens, type: unknown
    Chunk 3: 273 tokens, type: unknown
    Chunk 4: 3656 tokens, type: unknown
    Chunk 5: 1764 tokens, type: unknown
  üìä Total chunks so far: 8

--- Processing Document 3/11 ---
Document tokens: 24708

Processing d

In [6]:
print(len(chunks))

75


In [7]:
for i in range(len(chunks)):
    print("===============CHUNK===============",i)
    print(chunks[i])

page_content='‡ßß‡•§ ‡¶∏‡¶Ç‡¶ï‡ßç‡¶∑‡¶ø‡¶™‡ßç‡¶§ ‡¶∂‡¶ø‡¶∞‡ßã‡¶®‡¶æ‡¶Æ ‡¶ì ‡¶™‡ßç‡¶∞‡¶¨‡¶∞‡ßç‡¶§‡¶®‡•§‚Äî(‡ßß) ‡¶è‡¶á ‡¶Ö‡¶ß‡ßç‡¶Ø‡¶æ‡¶¶‡ßá‡¶∂ ‡¶Ö‡¶∞‡ßç‡¶• ‡¶Ö‡¶ß‡ßç‡¶Ø‡¶æ‡¶¶‡ßá‡¶∂, ‡ß®‡ß¶‡ß®‡ß´ ‡¶®‡¶æ‡¶Æ‡ßá ‡¶Ö‡¶≠‡¶ø‡¶π‡¶ø‡¶§ ‡¶π‡¶á‡¶¨‡ßá‡•§

(‡ß®) ‡¶è‡¶á ‡¶Ö‡¶ß‡ßç‡¶Ø‡¶æ‡¶¶‡ßá‡¶∂‡ßá‡¶∞ ‡¶ß‡¶æ‡¶∞‡¶æ ‡ß®‡ß≠, ‡¶ß‡¶æ‡¶∞‡¶æ ‡ß®‡ßÆ ‡¶è‡¶∞ ‡¶¶‡¶´‡¶æ (‡¶ï), (‡¶ñ), (‡¶ó) ‡¶ì (‡¶ö), ‡¶ß‡¶æ‡¶∞‡¶æ ‡ßß‡ß©‡ßÆ, ‡ßß‡ß©‡ßØ, ‡ßß‡ß™‡ß¶, ‡ßß‡ß™‡ßß, ‡ßß‡ß™‡ß®, ‡ßß‡ß™‡ß©, ‡ßß‡ß™‡ß™, ‡ßß‡ß™‡ß´, ‡ßß‡ß™‡ß¨, ‡ßß‡ß™‡ß≠, ‡ßß‡ß™‡ßÆ, ‡ßß‡ß™‡ßØ, ‡ßß‡ß´‡ß¶, ‡ßß‡ß´‡ßß, ‡ßß‡ß´‡ß®, ‡ßß‡ß´‡ß©, ‡ßß‡ß´‡ß™, ‡ßß‡ß´‡ß´, ‡ßß‡ß´‡ß¨, ‡ßß‡ß´‡ß≠, ‡ßß‡ß´‡ßÆ ‡¶ì ‡ßß‡ß´‡ßØ ‡¶Ö‡¶¨‡¶ø‡¶≤‡¶Æ‡ßç‡¶¨‡ßá ‡¶ï‡¶æ‡¶∞‡ßç‡¶Ø‡¶ï‡¶∞ ‡¶π‡¶á‡¶¨‡ßá ‡¶è‡¶¨‡¶Ç ‡¶Ö‡¶®‡ßç‡¶Ø‡¶æ‡¶®‡ßç‡¶Ø ‡¶ß‡¶æ‡¶∞‡¶æ‡¶∏‡¶Æ‡ßÇ‡¶π ‡ßß ‡¶ú‡ßÅ‡¶≤‡¶æ‡¶á, ‡ß®‡ß¶‡ß®‡ß´ ‡¶§‡¶æ‡¶∞‡¶ø‡¶ñ ‡¶π‡¶á‡¶§‡ßá ‡¶ï‡¶æ‡¶∞‡ßç‡¶Ø‡¶ï‡¶∞ ‡¶π‡¶á‡¶¨‡ßá‡•§' metadata={'source': 'Ordinance\\08. Finance Ordinance, 2025 (02 June 2025)_split_1.txt', 'doc_type': 'ORDINANCE', 'span_unit': 'section', 'span_

In [None]:
# ==========================================
# STEP 4: Setup Pinecone
# ==========================================

# Set Pinecone API key
os.environ["PINECONE_API_KEY"] = os.getenv("PINECONE_API_KEY")

# Initialize Pinecone
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])

# Check embedding dimension
test_embedding = embeddings.embed_query("test")
actual_dimension = len(test_embedding)
print(f"Actual embedding dimension: {actual_dimension}")

# Index settings
index_name = "ordinance-agentic-chunking"
embedding_dimension = 3072  # text-embedding-3-large dimension

# Create index if it doesn't exist
if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=embedding_dimension,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )
    print(f"Created new index: {index_name}")
else:
    print(f"Using existing index: {index_name}")

# Create vectorstore
vectorstore = PineconeVectorStore(
    index=pc.Index(index_name),
    embedding=embeddings
)

Actual embedding dimension: 3072
Using existing index: rules-agentic-chunking


In [11]:
# ==========================================
# STEP 5: Add Chunks to Vectorstore (FIXED)
# ==========================================

def sanitize_metadata_for_pinecone(metadata: dict) -> dict:
    """
    Sanitize metadata to comply with Pinecone requirements:
    - No null/None values
    - Only strings, numbers, booleans, or lists of strings
    """
    sanitized = {}
    
    for key, value in metadata.items():
        if value is None:
            # Skip null/None values entirely
            continue
        elif isinstance(value, str):
            # Keep non-empty strings
            if value.strip():
                sanitized[key] = value.strip()
        elif isinstance(value, (int, float, bool)):
            # Keep numbers and booleans
            sanitized[key] = value
        elif isinstance(value, list):
            # Clean lists - only keep non-empty strings
            clean_list = [str(item).strip() for item in value if item is not None and str(item).strip()]
            if clean_list:
                sanitized[key] = clean_list
        elif isinstance(value, dict):
            # Skip complex nested objects
            continue
        else:
            # Convert other types to strings
            str_value = str(value).strip()
            if str_value and str_value.lower() not in ['none', 'null', '']:
                sanitized[key] = str_value
    
    # Ensure we have at least basic metadata
    if 'source' not in sanitized:
        sanitized['source'] = 'unknown'
    if 'chunk_type' not in sanitized:
        sanitized['chunk_type'] = 'general'
    
    return sanitized

def add_chunks_to_vectorstore_fixed(vectorstore, chunks, max_tokens_per_batch=200000):
    """Add LLM-chunked documents to vectorstore - NEVER SKIP ANY CHUNKS"""
    
    if not chunks:
        print("No chunks to add!")
        return
    
    print(f"üì§ Adding {len(chunks)} LLM-generated chunks to vectorstore...")
    print("üßπ Sanitizing metadata for Pinecone compatibility...")
    print("üîí ZERO LOSS POLICY: Every chunk will be uploaded with fixed metadata")
    
    # Pre-process all chunks to sanitize metadata - NEVER SKIP
    sanitized_chunks = []
    
    for i, chunk in enumerate(chunks):
        
            # Sanitize metadata - replace nulls with defaults
            clean_metadata = sanitize_metadata_for_pinecone(chunk.metadata)
            
            # Ensure content exists
            content = chunk.page_content if chunk.page_content else "Content not available"
            
            # Create new Document with clean metadata
            clean_chunk = Document(
                page_content=content,
                metadata=clean_metadata
            )
            sanitized_chunks.append(clean_chunk)
            
    print(f"  ‚úÖ Prepared {len(sanitized_chunks)} chunks for upload (same as input: {len(chunks)})")
    
    # Verify we haven't lost any chunks
    if len(sanitized_chunks) != len(chunks):
        raise Exception(f"CRITICAL ERROR: Chunk count mismatch! Input: {len(chunks)}, Output: {len(sanitized_chunks)}")
    
    # Now proceed with batch upload - with aggressive retry logic
    current_batch = []
    current_tokens = 0
    batch_num = 1
    successful_uploads = 0
    
    for i, chunk in enumerate(sanitized_chunks):
        chunk_tokens = chunk.metadata.get("chunk_tokens", count_tokens(chunk.page_content))
        
        # Check if adding this chunk would exceed the limit
        if current_tokens + chunk_tokens > max_tokens_per_batch and current_batch:
            # Process current batch
            print(f"Processing batch {batch_num}: {len(current_batch)} chunks, {current_tokens} tokens")
            
            success = upload_batch_with_retry(vectorstore, current_batch, batch_num)
            successful_uploads += success
            
            # Reset for next batch
            current_batch = []
            current_tokens = 0
            batch_num += 1
        
        # Add chunk to current batch
        current_batch.append(chunk)
        current_tokens += chunk_tokens
        
        if (i + 1) % 20 == 0:
            print(f"  üìä Processed {i + 1}/{len(sanitized_chunks)} chunks...")
    
    # Process final batch
    if current_batch:
        print(f"Processing final batch {batch_num}: {len(current_batch)} chunks, {current_tokens} tokens")
        success = upload_batch_with_retry(vectorstore, current_batch, batch_num)
        successful_uploads += success
    
    print(f"üéâ Upload complete! Successfully added {successful_uploads}/{len(chunks)} chunks to vectorstore!")
    
    if successful_uploads != len(chunks):
        raise Exception(f"CRITICAL ERROR: Not all chunks uploaded! Expected: {len(chunks)}, Uploaded: {successful_uploads}")

def upload_batch_with_retry(vectorstore, batch, batch_num):
    """Upload batch with aggressive retry - ensure every chunk gets uploaded"""
    
    try:
        vectorstore.add_documents(batch)
        print(f"  ‚úÖ Batch {batch_num} successful ({len(batch)} chunks)")
        return len(batch)
        
    except Exception as e:
        print(f"  ‚ùå Batch {batch_num} failed: {e}")
        print(f"  üîÑ Switching to individual upload mode for {len(batch)} chunks...")
        
        successful_individual = 0
        
        for j, single_chunk in enumerate(batch):
            try:
                vectorstore.add_documents([single_chunk])
                successful_individual += 1
                
            except Exception as single_error:
                print(f"    ‚ùå Individual chunk {j+1} failed: {single_error}")
                
                # Last resort - strip metadata to absolute minimum
                try:
                    minimal_chunk = Document(
                        page_content=single_chunk.page_content,
                        metadata={
                            'source': f'emergency_chunk_{batch_num}_{j}',
                            'chunk_type': 'general'
                        }
                    )
                    vectorstore.add_documents([minimal_chunk])
                    successful_individual += 1
                    print(f"    üÜò Emergency upload successful for chunk {j+1}")
                    
                except Exception as emergency_error:
                    print(f"    üí• CRITICAL: Cannot upload chunk {j+1} even with minimal metadata: {emergency_error}")
                    print(f"    üìù Content preview: {single_chunk.page_content[:100]}...")
                    # This should never happen, but we log it for investigation
        
        print(f"  üìä Individual upload result: {successful_individual}/{len(batch)} chunks")
        return successful_individual

# Debug function to check your current chunks
def debug_chunk_metadata(chunks, num_samples=5):
    """Debug function to inspect chunk metadata"""
    print(f"üîç Debugging metadata for {min(num_samples, len(chunks))} sample chunks:")
    
    for i, chunk in enumerate(chunks[:num_samples]):
        print(f"\nChunk {i+1} metadata:")
        for key, value in chunk.metadata.items():
            value_type = type(value).__name__
            print(f"  {key}: {value} (type: {value_type})")
            
            if value is None:
                print(f"    ‚ùå NULL VALUE DETECTED in '{key}' - this will cause Pinecone error!")



In [12]:
# Run this first to see what's wrong
print("üîç Checking your chunks for metadata issues...")
debug_chunk_metadata(chunks)



üîç Checking your chunks for metadata issues...
üîç Debugging metadata for 5 sample chunks:

Chunk 1 metadata:
  source: r\04. Income Tax Alternative Dispute Resolution Rules, 2024_complete_transcription.txt (type: str)
  doc_type: RULES (type: str)
  span_unit: ['header'] (type: list)
  span_title: ‡¶™‡ßç‡¶∞‡¶æ‡¶∞‡¶Æ‡ßç‡¶≠‡¶ø‡¶ï ‡¶§‡¶•‡ßç‡¶Ø ‡¶ì ‡¶∂‡¶ø‡¶∞‡ßã‡¶®‡¶æ‡¶Æ (type: str)
  rules_name: Income Tax Alternative Dispute Resolution Rules, 2024 (type: str)
  rules_year: 2024 (type: str)
  language: bn (type: str)
  is_amendment: False (type: bool)
  keywords: ['‡¶™‡ßç‡¶∞‡¶ú‡ßç‡¶û‡¶æ‡¶™‡¶®', '‡¶ú‡¶æ‡¶§‡ßÄ‡¶Ø‡¶º ‡¶∞‡¶æ‡¶ú‡¶∏‡ßç‡¶¨ ‡¶¨‡ßã‡¶∞‡ßç‡¶°', '‡¶∂‡¶ø‡¶∞‡ßã‡¶®‡¶æ‡¶Æ', '‡¶ï‡¶æ‡¶∞‡ßç‡¶Ø‡¶ï‡¶∞', '‡¶ß‡¶æ‡¶∞‡¶æ ‡ß©‡ß™‡ß©', 'header', 'SRO 243', 'alternative dispute resolution'] (type: list)
  chunk_index: 0 (type: int)
  total_chunks: 27 (type: int)
  chunk_tokens: 674 (type: int)

Chunk 2 metadata:
  source: r\04. Income Tax Alternative Dispute Resolution Rules, 2024_complete_transcr

In [13]:
# Then use the fixed function
add_chunks_to_vectorstore_fixed(vectorstore, chunks)

üì§ Adding 27 LLM-generated chunks to vectorstore...
üßπ Sanitizing metadata for Pinecone compatibility...
üîí ZERO LOSS POLICY: Every chunk will be uploaded with fixed metadata
  ‚úÖ Prepared 27 chunks for upload (same as input: 27)
  üìä Processed 20/27 chunks...
Processing final batch 1: 27 chunks, 43095 tokens
  ‚úÖ Batch 1 successful (27 chunks)
üéâ Upload complete! Successfully added 27/27 chunks to vectorstore!


In [None]:
# # ==========================================
# # STEP 5: Add Chunks to Vectorstore
# # ==========================================

# def add_chunks_to_vectorstore(vectorstore, chunks, max_tokens_per_batch=200000):
#     """Add LLM-chunked documents to vectorstore with enhanced metadata"""
    
#     if not chunks:
#         print("No chunks to add!")
#         return
    
#     current_batch = []
#     current_tokens = 0
#     batch_num = 1
    
#     print(f"üì§ Adding {len(chunks)} LLM-generated chunks to vectorstore...")
    
#     for i, chunk in enumerate(chunks):
#         chunk_tokens = chunk.metadata.get("chunk_tokens", count_tokens(chunk.page_content))
        
#         # Check if adding this chunk would exceed the limit
#         if current_tokens + chunk_tokens > max_tokens_per_batch and current_batch:
#             # Process current batch
#             print(f"Processing batch {batch_num}: {len(current_batch)} chunks, {current_tokens} tokens")
            
#             try:
#                 vectorstore.add_documents(current_batch)
#                 print(f"  ‚úÖ Batch {batch_num} successful")
#             except Exception as e:
#                 print(f"  ‚ùå Batch {batch_num} failed: {e}")
#                 # Try individual chunks
#                 for single_chunk in current_batch:
#                     try:
#                         vectorstore.add_documents([single_chunk])
#                     except Exception as single_error:
#                         print(f"    ‚ùå Single chunk failed: {single_error}")
            
#             # Reset for next batch
#             current_batch = []
#             current_tokens = 0
#             batch_num += 1
        
#         # Add chunk to current batch
#         current_batch.append(chunk)
#         current_tokens += chunk_tokens
        
#         if (i + 1) % 20 == 0:
#             print(f"  üìä Processed {i + 1}/{len(chunks)} chunks...")
    
#     # Process final batch
#     if current_batch:
#         print(f"Processing final batch {batch_num}: {len(current_batch)} chunks, {current_tokens} tokens")
#         try:
#             vectorstore.add_documents(current_batch)
#             print(f"  ‚úÖ Final batch successful")
#         except Exception as e:
#             print(f"  ‚ùå Final batch failed: {e}")
#             # Try individual chunks
#             for single_chunk in current_batch:
#                 try:
#                     vectorstore.add_documents([single_chunk])
#                 except Exception as single_error:
#                     print(f"    ‚ùå Single chunk failed: {single_error}")
    
#     print("üéâ All LLM chunks processed and added to vectorstore!")

# # Add chunks to vectorstore
# add_chunks_to_vectorstore(vectorstore, chunks)

In [17]:
# ==========================================
# STEP 6: Setup Retrieval Chain
# ==========================================

# Create retriever
retriever = vectorstore.as_retriever(
    search_type="similarity", 
    search_kwargs={'k': 15}
)

# Initialize final LLM for answering questions
# answering_llm = ChatGroq(
#     groq_api_key=os.getenv("GROQ_API_KEY"),
#     model_name="meta-llama/llama-4-scout-17b-16e-instruct",
#     temperature=0.1,
#     max_tokens=None
# )

# Initialize Groq LLM (you can also use OpenAI)
import os
from langchain_openai import ChatOpenAI
answering_llm = ChatOpenAI(
    api_key=os.getenv("OPENAI_API_KEY"),
    model_name="gpt-4.1-mini",
    temperature=0.7,
    max_tokens=None
)

In [18]:


# Enhanced system prompt for the final RAG chain
enhanced_system_prompt = (
    "‡¶Ü‡¶™‡¶®‡¶ø ‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ‡¶¶‡ßá‡¶∂‡ßá‡¶∞ ‡¶Ü‡¶á‡¶®‡¶≠‡¶ø‡¶§‡ßç‡¶§‡¶ø‡¶ï ‡¶è‡¶ï‡¶ü‡¶ø ‡¶â‡¶®‡ßç‡¶®‡¶§ ‡¶≤‡¶ø‡¶ó‡ßç‡¶Ø‡¶æ‡¶≤ ‡¶ö‡ßç‡¶Ø‡¶æ‡¶ü‡¶¨‡¶ü‡•§ ‡¶Ü‡¶™‡¶®‡¶æ‡¶∞ ‡¶ú‡ßç‡¶û‡¶æ‡¶®‡¶≠‡¶æ‡¶®‡ßç‡¶°‡¶æ‡¶∞ LLM-‡¶≠‡¶ø‡¶§‡ßç‡¶§‡¶ø‡¶ï ‡¶∏‡ßç‡¶Æ‡¶æ‡¶∞‡ßç‡¶ü ‡¶ö‡¶æ‡¶ô‡ßç‡¶ï‡¶ø‡¶Ç ‡¶¶‡¶ø‡¶Ø‡¶º‡ßá ‡¶™‡ßç‡¶∞‡¶∏‡ßç‡¶§‡ßÅ‡¶§, "
    "‡¶Ø‡¶æ ‡¶Ü‡¶á‡¶®‡¶ø ‡¶ï‡¶æ‡¶†‡¶æ‡¶Æ‡ßã ‡¶è‡¶¨‡¶Ç ‡¶π‡¶æ‡¶Ø‡¶º‡¶æ‡¶∞‡¶æ‡¶∞‡ßç‡¶ï‡¶ø ‡¶¨‡¶ú‡¶æ‡¶Ø‡¶º ‡¶∞‡ßá‡¶ñ‡ßá ‡¶∏‡¶Ç‡¶ó‡¶†‡¶ø‡¶§‡•§ ‡¶Ü‡¶™‡¶®‡¶ø ‡¶™‡¶æ‡¶¨‡ßá‡¶®:\n\n"
    "**‡¶®‡¶•‡¶ø ‡¶™‡ßç‡¶∞‡¶ï‡¶æ‡¶∞**: ‡¶Ü‡¶á‡¶®/‡¶Ö‡ßç‡¶Ø‡¶æ‡¶ï‡ßç‡¶ü, ‡¶¨‡¶ø‡¶ß‡¶ø‡¶Æ‡¶æ‡¶≤‡¶æ, ‡¶Ö‡¶ß‡ßç‡¶Ø‡¶æ‡¶¶‡ßá‡¶∂, ‡¶∏‡¶Ç‡¶∂‡ßã‡¶ß‡¶®‡ßÄ, ‡¶™‡ßç‡¶∞‡¶ú‡ßç‡¶û‡¶æ‡¶™‡¶®, ‡¶∏‡¶æ‡¶∞‡ßç‡¶ï‡ßÅ‡¶≤‡¶æ‡¶∞, SRO/GO/RO\n"
    "**‡¶â‡¶®‡ßç‡¶®‡¶§ ‡¶Æ‡ßá‡¶ü‡¶æ‡¶°‡ßá‡¶ü‡¶æ**: ‡¶™‡ßç‡¶∞‡¶§‡¶ø‡¶ü‡¶ø ‡¶ö‡¶æ‡¶ô‡ßç‡¶ï‡ßá ‡¶Ü‡¶á‡¶®‡ßá‡¶∞ ‡¶®‡¶æ‡¶Æ, ‡¶ß‡¶æ‡¶∞‡¶æ ‡¶™‡¶∞‡¶ø‡¶∏‡ßÄ‡¶Æ‡¶æ, ‡¶Ö‡¶ß‡ßç‡¶Ø‡¶æ‡¶Ø‡¶º, ‡¶Æ‡ßÇ‡¶≤ ‡¶∂‡¶¨‡ßç‡¶¶, ‡¶§‡¶æ‡¶∞‡¶ø‡¶ñ ‡¶•‡¶æ‡¶ï‡¶§‡ßá ‡¶™‡¶æ‡¶∞‡ßá\n\n"
    "**‡¶®‡¶ø‡¶∞‡ßç‡¶¶‡ßá‡¶∂‡¶®‡¶æ**:\n"
    "1) **‡¶∏‡ßÇ‡¶§‡ßç‡¶∞ ‡¶®‡¶ø‡¶∞‡ßç‡¶¶‡ßá‡¶∂‡¶®‡¶æ**: ‡¶Æ‡ßá‡¶ü‡¶æ‡¶°‡ßá‡¶ü‡¶æ ‡¶•‡ßá‡¶ï‡ßá ‡¶™‡ßç‡¶∞‡¶æ‡¶™‡ßç‡¶§ ‡¶§‡¶•‡ßç‡¶Ø ‡¶¨‡ßç‡¶Ø‡¶¨‡¶π‡¶æ‡¶∞ ‡¶ï‡¶∞‡ßá ‡¶∏‡ßÅ‡¶®‡¶ø‡¶∞‡ßç‡¶¶‡¶ø‡¶∑‡ßç‡¶ü ‡¶∞‡ßá‡¶´‡¶æ‡¶∞‡ßá‡¶®‡ßç‡¶∏ ‡¶¶‡¶ø‡¶®\n"
    "2) **‡¶ï‡ßç‡¶∞‡¶∏-‡¶∞‡ßá‡¶´‡¶æ‡¶∞‡ßá‡¶®‡ßç‡¶∏**: ‡¶∏‡¶Æ‡ßç‡¶™‡¶∞‡ßç‡¶ï‡¶ø‡¶§ ‡¶ß‡¶æ‡¶∞‡¶æ/‡¶¨‡¶ø‡¶ß‡¶æ‡¶® ‡¶â‡¶≤‡ßç‡¶≤‡ßá‡¶ñ ‡¶ï‡¶∞‡ßÅ‡¶® ‡¶Ø‡¶¶‡¶ø ‡¶™‡ßç‡¶∞‡¶æ‡¶∏‡¶ô‡ßç‡¶ó‡¶ø‡¶ï ‡¶π‡¶Ø‡¶º\n"
    "3) **‡¶ï‡¶æ‡¶†‡¶æ‡¶Æ‡ßã‡¶ó‡¶§ ‡¶â‡¶§‡ßç‡¶§‡¶∞**: (‡¶ï) ‡¶∏‡¶Ç‡¶ï‡ßç‡¶∑‡¶ø‡¶™‡ßç‡¶§ ‡¶â‡¶§‡ßç‡¶§‡¶∞ (‡¶ñ) ‡¶Ü‡¶á‡¶®‡¶ø ‡¶≠‡¶ø‡¶§‡ßç‡¶§‡¶ø (‡¶ó) ‡¶¨‡¶ø‡¶∏‡ßç‡¶§‡¶æ‡¶∞‡¶ø‡¶§ ‡¶¨‡ßç‡¶Ø‡¶æ‡¶ñ‡ßç‡¶Ø‡¶æ (‡¶ò) ‡¶™‡ßç‡¶∞‡¶Ø‡¶º‡ßã‡¶ó/‡¶∏‡¶§‡¶∞‡ßç‡¶ï‡¶§‡¶æ\n"
    "4) **‡¶∏‡ßç‡¶Æ‡¶æ‡¶∞‡ßç‡¶ü ‡¶Ö‡¶®‡ßÅ‡¶∏‡¶®‡ßç‡¶ß‡¶æ‡¶®**: ‡¶ö‡¶æ‡¶ô‡ßç‡¶ï ‡¶Æ‡ßá‡¶ü‡¶æ‡¶°‡ßá‡¶ü‡¶æ ‡¶¨‡ßç‡¶Ø‡¶¨‡¶π‡¶æ‡¶∞ ‡¶ï‡¶∞‡ßá ‡¶™‡ßç‡¶∞‡¶æ‡¶∏‡¶ô‡ßç‡¶ó‡¶ø‡¶ï ‡¶§‡¶•‡ßç‡¶Ø ‡¶ñ‡ßÅ‡¶Å‡¶ú‡ßÅ‡¶®\n"
    "5) **‡¶≠‡¶æ‡¶∑‡¶æ**: ‡¶™‡ßç‡¶∞‡¶æ‡¶•‡¶Æ‡¶ø‡¶ï‡¶≠‡¶æ‡¶¨‡ßá ‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ‡¶Ø‡¶º, ‡¶∂‡ßá‡¶∑‡ßá ‡¶á‡¶Ç‡¶∞‡ßá‡¶ú‡¶ø ‡¶∏‡¶æ‡¶∞‡¶∏‡¶Ç‡¶ï‡ßç‡¶∑‡ßá‡¶™\n\n"
    "‡¶™‡ßç‡¶∞‡¶æ‡¶™‡ßç‡¶§ ‡¶∏‡ßç‡¶Æ‡¶æ‡¶∞‡ßç‡¶ü ‡¶ö‡¶æ‡¶ô‡ßç‡¶ï ‡¶ï‡¶®‡¶ü‡ßá‡¶ï‡ßç‡¶∏‡¶ü:\n{context}\n\n"
    "---\n"
    "You are an advanced Bangladesh Legal Assistant with LLM-enhanced chunking. Each context chunk contains "
    "intelligent metadata including act names, section ranges, keywords, and legal hierarchy. Use this enhanced "
    "Be specific according to the law.Don't give any information out of the context.If the related answer is directly present directly mention that with lease amount of modification"
)

# Create enhanced prompt
enhanced_prompt = ChatPromptTemplate.from_messages([
    ("system", enhanced_system_prompt),
    ("human", "{input}"),
])

# Create chains
question_answer_chain = create_stuff_documents_chain(answering_llm, enhanced_prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

print("üîó Enhanced RAG chains ready with LLM-chunked legal documents!")

üîó Enhanced RAG chains ready with LLM-chunked legal documents!


In [19]:
response = rag_chain.invoke({"input":"‡¶â‡ßé‡¶∏‡ßá ‡¶ï‡¶∞ ‡¶¨‡¶ø‡¶ß‡¶ø‡¶Æ‡¶æ‡¶≤‡¶æ, ‡ß®‡ß¶‡ß®‡ß™ ‡¶®‡¶ø‡ßü‡ßá ‡¶ú‡¶æ‡¶®‡¶§‡ßá ‡¶ö‡¶æ‡¶á?" })
answer = response.get("answer", "No answer found")
print(answer)

(‡¶ï) ‡¶∏‡¶Ç‡¶ï‡ßç‡¶∑‡¶ø‡¶™‡ßç‡¶§ ‡¶â‡¶§‡ßç‡¶§‡¶∞:  
‡¶â‡ßé‡¶∏‡ßá ‡¶ï‡¶∞ ‡¶¨‡¶ø‡¶ß‡¶ø‡¶Æ‡¶æ‡¶≤‡¶æ, ‡ß®‡ß¶‡ß®‡ß™ ‡¶π‡¶≤‡ßã ‡¶ú‡¶æ‡¶§‡ßÄ‡¶Ø‡¶º ‡¶∞‡¶æ‡¶ú‡¶∏‡ßç‡¶¨ ‡¶¨‡ßã‡¶∞‡ßç‡¶° ‡¶ï‡¶∞‡ßç‡¶§‡ßÉ‡¶ï ‡¶Ü‡¶Ø‡¶º‡¶ï‡¶∞ ‡¶Ü‡¶á‡¶®, ‡ß®‡ß¶‡ß®‡ß© ‡¶è‡¶∞ ‡¶ß‡¶æ‡¶∞‡¶æ ‡ß©‡ß™‡ß© ‡¶è‡¶∞ ‡¶Ö‡¶ß‡ßÄ‡¶®‡ßá ‡¶™‡ßç‡¶∞‡¶£‡ßÄ‡¶§ ‡¶è‡¶ï‡¶ü‡¶ø ‡¶¨‡¶ø‡¶ß‡¶ø‡¶Æ‡¶æ‡¶≤‡¶æ ‡¶Ø‡¶æ ‡ßß ‡¶ú‡ßÅ‡¶≤‡¶æ‡¶á ‡ß®‡ß¶‡ß®‡ß™ ‡¶•‡ßá‡¶ï‡ßá ‡¶ï‡¶æ‡¶∞‡ßç‡¶Ø‡¶ï‡¶∞ ‡¶π‡¶¨‡ßá‡•§ ‡¶è‡¶ü‡¶ø ‡¶â‡ßé‡¶∏‡ßá ‡¶ï‡¶∞ ‡¶ï‡¶∞‡ßç‡¶§‡¶®‡ßá‡¶∞ ‡¶®‡¶ø‡¶Ø‡¶º‡¶Æ‡¶æ‡¶¨‡¶≤‡ßÄ ‡¶®‡¶ø‡¶∞‡ßç‡¶ß‡¶æ‡¶∞‡¶£ ‡¶ï‡¶∞‡ßá ‡¶è‡¶¨‡¶Ç ‡¶™‡ßÇ‡¶∞‡ßç‡¶¨‡¶¨‡¶∞‡ßç‡¶§‡ßÄ "‡¶â‡ßé‡¶∏‡ßá ‡¶ï‡¶∞ ‡¶¨‡¶ø‡¶ß‡¶ø‡¶Æ‡¶æ‡¶≤‡¶æ, ‡ß®‡ß¶‡ß®‡ß©" ‡¶ï‡ßá ‡¶∞‡¶π‡¶ø‡¶§ ‡¶ï‡¶∞‡ßá‡¶õ‡ßá‡•§

(‡¶ñ) ‡¶Ü‡¶á‡¶®‡¶ø ‡¶≠‡¶ø‡¶§‡ßç‡¶§‡¶ø:  
- ‡¶Ü‡¶Ø‡¶º‡¶ï‡¶∞ ‡¶Ü‡¶á‡¶®, ‡ß®‡ß¶‡ß®‡ß© (‡ß®‡ß¶‡ß®‡ß© ‡¶∏‡¶®‡ßá‡¶∞ ‡ßß‡ß® ‡¶®‡¶Ç ‡¶Ü‡¶á‡¶®) ‡¶è‡¶∞ ‡¶ß‡¶æ‡¶∞‡¶æ ‡ß©‡ß™‡ß©‡•§  
- ‡¶è‡¶∏.‡¶Ü‡¶∞.‡¶ì. ‡¶®‡¶Ç ‡ßß‡ß¨‡ßß-‡¶Ü‡¶á‡¶®/‡¶Ü‡¶Ø‡¶º‡¶ï‡¶∞-‡ß©‡ß¨/‡ß®‡ß¶‡ß®‡ß™, ‡¶§‡¶æ‡¶∞‡¶ø‡¶ñ: ‡ßß‡ß´ ‡¶ú‡ßç‡¶Ø‡ßà‡¶∑‡ßç‡¶†, ‡ßß‡ß™‡ß

In [9]:
response = rag_chain.invoke({"input":"‡¶ì‡¶ú‡¶® ‡¶Æ‡¶æ‡¶™‡¶æ‡¶∞ ‡¶Ø‡¶®‡ßç‡¶§‡ßç‡¶∞‡ßá‡¶∞ ‡¶â‡¶™‡¶∞ ‡¶Ö‡¶¨‡¶ö‡ßü‡ßá‡¶∞ ‡¶π‡¶æ‡¶∞¬†‡¶ï‡¶§?" })
answer = response.get("answer", "No answer found")
print(answer)

(‡¶ï) ‡¶∏‡¶Ç‡¶ï‡ßç‡¶∑‡¶ø‡¶™‡ßç‡¶§ ‡¶â‡¶§‡ßç‡¶§‡¶∞:  
‡¶ì‡¶ú‡¶® ‡¶Æ‡¶æ‡¶™‡¶æ‡¶∞ ‡¶Ø‡¶®‡ßç‡¶§‡ßç‡¶∞‡ßá‡¶∞ ‡¶â‡¶™‡¶∞ ‡¶Ö‡¶¨‡¶ö‡¶Ø‡¶º‡ßá‡¶∞ ‡¶π‡¶æ‡¶∞ ‡ßß‡ß¶%‡•§

(‡¶ñ) ‡¶Ü‡¶á‡¶®‡¶ø ‡¶≠‡¶ø‡¶§‡ßç‡¶§‡¶ø:  
‡¶è‡¶ü‡¶ø "‡¶§‡ßÉ‡¶§‡ßÄ‡¶Ø‡¶º ‡¶§‡¶´‡¶∏‡¶ø‡¶≤" ‡¶è‡¶∞ "‡¶Ö‡¶¨‡¶ö‡¶Ø‡¶º ‡¶≠‡¶æ‡¶§‡¶æ, ‡¶®‡¶ø‡¶É‡¶∂‡ßá‡¶∑ ‡¶≠‡¶æ‡¶§‡¶æ ‡¶ì ‡¶Ö‡ßç‡¶Ø‡¶æ‡¶Æ‡¶∞‡ßç‡¶ü‡¶æ‡¶á‡¶ú‡ßá‡¶∏‡¶®" ‡¶Ö‡¶Ç‡¶∂ ‡ßß, "‡¶Ö‡¶¨‡¶ö‡¶Ø‡¶º ‡¶≠‡¶æ‡¶§‡¶æ ‡¶™‡¶∞‡¶ø‡¶ó‡¶£‡¶®‡¶æ" ‡¶ß‡¶æ‡¶∞‡¶æ ‡ßß(‡ßß) ‡¶è‡¶∞ ‡¶∏‡¶æ‡¶∞‡¶£‡ßÄ‡¶§‡ßá ‡¶∏‡ßç‡¶™‡¶∑‡ßç‡¶ü‡¶≠‡¶æ‡¶¨‡ßá ‡¶â‡¶≤‡ßç‡¶≤‡ßá‡¶ñ ‡¶Ü‡¶õ‡ßá, ‡¶Ø‡ßá‡¶ñ‡¶æ‡¶®‡ßá ‡¶ï‡ßÉ‡¶∑‡¶ø‡¶§‡ßá ‡¶¨‡ßç‡¶Ø‡¶¨‡¶π‡ßÉ‡¶§ ‡¶Æ‡ßÇ‡¶≤‡¶ß‡¶®‡¶ø ‡¶™‡¶∞‡¶ø‡¶∏‡¶Æ‡ßç‡¶™‡¶¶‡ßá‡¶∞ ‡¶Ö‡¶¨‡¶ö‡¶Ø‡¶º‡ßá‡¶∞ ‡¶π‡¶æ‡¶∞ ‡¶¨‡¶∞‡ßç‡¶£‡¶ø‡¶§ ‡¶π‡¶Ø‡¶º‡ßá‡¶õ‡ßá‡•§

(‡¶ó) ‡¶¨‡¶ø‡¶∏‡ßç‡¶§‡¶æ‡¶∞‡¶ø‡¶§ ‡¶¨‡ßç‡¶Ø‡¶æ‡¶ñ‡ßç‡¶Ø‡¶æ:  
‡¶§‡¶´‡¶∏‡¶ø‡¶≤‡ßá‡¶∞ ‡¶Ö‡¶Ç‡¶∂ ‡ßß ‡¶è‡¶∞ ‡¶Ö‡¶®‡ßÅ‡¶ö‡ßç‡¶õ‡ßá‡¶¶ ‡ßß(‡ßß) ‡¶è‡¶∞ ‡¶∏‡¶æ‡¶∞‡¶£‡ßÄ‡¶§‡ßá ‡¶ì‡¶ú‡¶® ‡¶Æ‡¶æ‡¶™‡¶æ‡¶∞ ‡¶Ø‡¶®‡ßç‡¶§‡ßç‡¶∞‡ßá‡¶∞ ‡¶Ö‡¶¨‡¶ö‡¶Ø‡¶º‡ßá‡¶∞ ‡¶π‡¶æ‡¶∞ ‡ßß‡ß¶% ‡¶®‡¶ø‡¶∞‡ßç‡

In [10]:
response = rag_chain.invoke({"input":"‡¶ï‡¶∞ ‡¶Ö‡¶¨‡¶ï‡¶æ‡¶∂ ‡¶™‡ßç‡¶∞‡¶æ‡¶™‡ßç‡¶§‡¶ø‡¶∞ ‡¶Ø‡ßã‡¶ó‡ßç‡¶Ø ‡¶≠‡ßå‡¶§‡¶ï‡¶æ‡¶†‡¶æ‡¶Æ‡ßã ‡¶ï‡ßã‡¶®‡¶ó‡ßÅ‡¶≤‡ßã?" })
answer = response.get("answer", "No answer found")
print(answer)

(‡¶ï) ‡¶∏‡¶Ç‡¶ï‡ßç‡¶∑‡¶ø‡¶™‡ßç‡¶§ ‡¶â‡¶§‡ßç‡¶§‡¶∞‡¶É  
‡¶ï‡¶∞ ‡¶Ö‡¶¨‡¶ï‡¶æ‡¶∂ ‡¶™‡ßç‡¶∞‡¶æ‡¶™‡ßç‡¶§‡¶ø‡¶∞ ‡¶Ø‡ßã‡¶ó‡ßç‡¶Ø ‡¶≠‡ßå‡¶§ ‡¶Ö‡¶¨‡¶ï‡¶æ‡¶†‡¶æ‡¶Æ‡ßã ‡¶π‡¶ø‡¶∏‡ßá‡¶¨‡ßá ‡¶®‡¶ø‡¶Æ‡ßç‡¶®‡¶≤‡¶ø‡¶ñ‡¶ø‡¶§ ‡¶∏‡ßÅ‡¶¨‡¶ø‡¶ß‡¶æ‡¶ó‡ßÅ‡¶≤‡ßã ‡¶¨‡¶ø‡¶¨‡ßá‡¶ö‡¶ø‡¶§ ‡¶π‡¶¨‡ßá: ‡¶ó‡¶≠‡ßÄ‡¶∞ ‡¶∏‡¶Æ‡ßÅ‡¶¶‡ßç‡¶∞ ‡¶¨‡¶®‡ßç‡¶¶‡¶∞, ‡¶∏‡¶Æ‡ßÅ‡¶¶‡ßç‡¶∞ ‡¶¨‡¶®‡ßç‡¶¶‡¶∞ ‡¶¨‡¶æ ‡¶®‡¶¶‡ßÄ ‡¶¨‡¶®‡ßç‡¶¶‡¶∞; ‡¶è‡¶≤‡¶ø‡¶≠‡ßá‡¶ü‡ßá‡¶° ‡¶è‡¶ï‡ßç‡¶∏‡¶™‡ßç‡¶∞‡ßá‡¶∏‡¶ì‡¶Ø‡¶º‡ßá; ‡¶∞‡¶™‡ßç‡¶§‡¶æ‡¶®‡¶ø ‡¶™‡ßç‡¶∞‡¶ï‡ßç‡¶∞‡¶ø‡¶Ø‡¶º‡¶æ‡¶ï‡¶∞‡¶£ ‡¶Ö‡¶û‡ßç‡¶ö‡¶≤; ‡¶´‡ßç‡¶≤‡¶æ‡¶á‡¶ì‡¶≠‡¶æ‡¶∞; ‡¶ü‡ßã‡¶≤‡¶∞‡ßã‡¶° ‡¶ì ‡¶¨‡ßç‡¶∞‡¶ø‡¶ú; ‡¶ó‡ßç‡¶Ø‡¶æ‡¶∏ ‡¶™‡¶æ‡¶á‡¶™ ‡¶≤‡¶æ‡¶á‡¶®; ‡¶Ü‡¶á‡¶∏‡¶ø‡¶ü‡¶ø ‡¶™‡¶æ‡¶∞‡ßç‡¶ï, ‡¶ú‡ßã‡¶® ‡¶¨‡¶æ ‡¶≠‡¶ø‡¶≤‡ßá‡¶ú; ‡¶π‡¶æ‡¶á‡¶ü‡ßá‡¶ï ‡¶™‡¶æ‡¶∞‡ßç‡¶ï; ‡¶Ö‡¶®‡ßÅ‡¶Æ‡ßã‡¶¶‡¶ø‡¶§ ‡¶™‡¶æ‡¶®‡¶ø ‡¶∂‡ßã‡¶ß‡¶®‡¶æ‡¶ó‡¶æ‡¶∞; ‡¶™‡¶æ‡¶®‡¶ø ‡¶∏‡¶∞‡¶¨‡¶∞‡¶æ‡¶π ‡¶¨‡¶æ ‡¶™‡¶æ‡¶®‡¶ø ‡¶®‡¶ø‡¶∑‡ßç‡¶ï‡¶æ‡¶∂‡¶® ‡¶¨‡ßç‡¶Ø‡¶¨‡¶∏‡ßç‡¶•‡¶æ; ‡¶§‡¶∞‡¶≤‡¶æ‡¶Ø‡¶º‡¶ø‡¶§ ‡¶™‡ßç‡¶∞‡¶æ‡¶ï‡ßÉ‡¶§‡¶ø‡¶ï ‡¶ó‡ßç‡¶Ø‡¶æ‡¶∏ (‡¶è‡¶≤‡¶è‡¶

In [37]:
response = rag_chain.invoke({"input":"‡¶§‡¶π‡¶¨‡¶ø‡¶≤ ‡¶π‡¶á‡¶§‡ßá ‡¶Ü‡ßü‡ßá‡¶∞ ‡¶ï‡ßç‡¶∑‡ßá‡¶§‡ßç‡¶∞‡ßá ‡¶ï‡¶ø ‡¶™‡¶∞‡¶ø‡¶Æ‡¶æ‡¶® ‡¶Ö‡¶∞‡ßç‡¶• ‡¶ï‡¶∞ ‡¶•‡ßá‡¶ï‡ßá ‡¶Ö‡¶¨‡ßç‡¶Ø‡¶æ‡¶π‡¶§‡¶ø¬†‡¶™‡¶æ‡¶¨‡ßá?" })
answer = response.get("answer", "No answer found")
print(answer)

‡¶∏‡¶Ç‡¶ï‡ßç‡¶∑‡¶ø‡¶™‡ßç‡¶§ ‡¶â‡¶§‡ßç‡¶§‡¶∞:  
‡¶§‡¶π‡¶¨‡¶ø‡¶≤ ‡¶π‡¶á‡¶§‡ßá ‡¶Ü‡ßü‡ßá‡¶∞ ‡¶ï‡ßç‡¶∑‡ßá‡¶§‡ßç‡¶∞‡ßá ‡¶®‡¶ø‡¶∞‡ßç‡¶¶‡¶ø‡¶∑‡ßç‡¶ü ‡¶∂‡¶∞‡ßç‡¶§‡ßá ‡¶ï‡¶∞ ‡¶•‡ßá‡¶ï‡ßá ‡¶Ö‡¶¨‡ßç‡¶Ø‡¶æ‡¶π‡¶§‡¶ø ‡¶™‡¶æ‡¶ì‡¶Ø‡¶º‡¶æ ‡¶Ø‡¶æ‡¶Ø‡¶º‡•§ ‡¶Ø‡¶¶‡¶ø ‡¶§‡¶π‡¶¨‡¶ø‡¶≤‡ßá‡¶∞ ‡¶Ü‡¶Ø‡¶º ‡¶∏‡ßÅ‡¶¶ ‡¶¨‡¶æ ‡¶Ö‡¶®‡ßç‡¶Ø ‡¶®‡¶æ‡¶Æ‡ßá ‡¶π‡¶Ø‡¶º ‡¶è‡¶¨‡¶Ç ‡¶ö‡¶æ‡¶Å‡¶¶‡¶æ‡¶∞ ‡¶Ö‡¶Ç‡¶∂ ‡¶¨‡ßç‡¶Ø‡¶§‡ßÄ‡¶§ ‡¶π‡¶Ø‡¶º, ‡¶§‡¶¨‡ßá ‡¶®‡¶ø‡¶∞‡ßç‡¶¶‡¶ø‡¶∑‡ßç‡¶ü ‡¶∏‡ßÄ‡¶Æ‡¶æ‡¶∞ ‡¶Æ‡¶ß‡ßç‡¶Ø‡ßá ‡¶Ö‡¶∞‡ßç‡¶• ‡¶ï‡¶∞ ‡¶•‡ßá‡¶ï‡ßá ‡¶Ö‡¶¨‡ßç‡¶Ø‡¶æ‡¶π‡¶§‡¶ø ‡¶™‡ßá‡¶§‡ßá ‡¶™‡¶æ‡¶∞‡ßá‡•§  

‡¶Ü‡¶á‡¶®‡¶ø ‡¶≠‡¶ø‡¶§‡ßç‡¶§‡¶ø:  
‡¶ß‡¶æ‡¶∞‡¶æ ‡ßß‡ß´‡ß™(‡ßß) ‡¶Ö‡¶®‡ßÅ‡¶Ø‡¶æ‡¶Ø‡¶º‡ßÄ, ‡¶∏‡ßç‡¶¨‡ßÄ‡¶ï‡ßÉ‡¶§ ‡¶≠‡¶¨‡¶ø‡¶∑‡ßç‡¶Ø ‡¶§‡¶π‡¶¨‡¶ø‡¶≤ ‡¶¨‡¶æ ‡¶Ö‡¶®‡ßÅ‡¶Æ‡ßã‡¶¶‡¶ø‡¶§ ‡¶§‡¶π‡¶¨‡¶ø‡¶≤‡ßá‡¶∞ ‡¶Ü‡¶Ø‡¶º ‡¶ï‡¶∞ ‡¶•‡ßá‡¶ï‡ßá ‡¶Ö‡¶¨‡ßç‡¶Ø‡¶æ‡¶π‡¶§‡¶ø ‡¶™‡ßá‡¶§‡ßá ‡¶™‡¶æ‡¶∞‡ßá‡•§ ‡¶¨‡¶ø‡¶∂‡ßá‡¶∑ ‡¶ï‡¶∞‡ßá, ‡¶Ø‡¶¶‡¶ø ‡¶§‡¶π‡¶¨‡¶ø‡¶≤‡ßá‡¶∞ ‡¶Ü‡¶Ø‡¶º ‡¶∏‡ßÅ‡¶¶ ‡¶¨‡¶æ ‡¶Ö‡¶®‡ßç‡¶Ø ‡¶®‡¶æ‡¶Æ‡ßá ‡¶π‡¶Ø‡¶º ‡¶è‡¶¨‡¶Ç ‡¶ö‡¶æ‡¶Å‡¶¶‡¶æ‡¶∞ ‡¶Ö

In [15]:
response = rag_chain.invoke({"input":"‡¶¨‡¶®‡¶æ‡¶®‡ßÄ ‡¶è‡¶≤‡¶æ‡¶ï‡¶æ‡ßü ‡¶¨‡¶ø‡¶≤‡ßç‡¶°‡¶ø‡¶Ç ‡¶¨‡¶æ ‡¶Ö‡ßç‡¶Ø‡¶æ‡¶™‡¶æ‡¶∞‡ßç‡¶ü‡ßç‡¶Æ‡ßá‡¶®‡ßç‡¶ü‡ßá ‡¶¨‡¶ø‡¶®‡¶ø‡ßü‡ßã‡¶ó‡ßá‡¶∞ ‡¶ï‡ßç‡¶∑‡ßá‡¶§‡ßç‡¶∞‡ßá¬†‡¶ï‡¶∞¬†‡¶ï‡¶§?" })
answer = response.get("answer", "No answer found")
print(answer)

(‡¶ï) ‡¶∏‡¶Ç‡¶ï‡ßç‡¶∑‡¶ø‡¶™‡ßç‡¶§ ‡¶â‡¶§‡ßç‡¶§‡¶∞:
‡¶¨‡¶®‡¶æ‡¶®‡ßÄ ‡¶è‡¶≤‡¶æ‡¶ï‡¶æ‡ßü ‡¶Ö‡¶®‡¶ß‡¶ø‡¶ï ‡ß®‡ß¶‡ß¶ ‡¶¨‡¶∞‡ßç‡¶ó‡¶Æ‡¶ø‡¶ü‡¶æ‡¶∞ ‡¶™‡ßç‡¶≤‡¶ø‡¶®‡ßç‡¶• ‡¶Ü‡¶Ø‡¶º‡¶§‡¶® ‡¶¨‡¶ø‡¶∂‡¶ø‡¶∑‡ßç‡¶ü ‡¶¨‡¶ø‡¶≤‡ßç‡¶°‡¶ø‡¶Ç ‡¶¨‡¶æ ‡¶Ö‡ßç‡¶Ø‡¶æ‡¶™‡¶æ‡¶∞‡ßç‡¶ü‡¶Æ‡ßá‡¶®‡ßç‡¶ü‡ßá ‡¶¨‡¶ø‡¶®‡¶ø‡ßü‡ßã‡¶ó‡ßá‡¶∞ ‡¶ï‡ßç‡¶∑‡ßá‡¶§‡ßç‡¶∞‡ßá ‡¶ï‡¶∞‡¶π‡¶æ‡¶∞ ‡¶™‡ßç‡¶∞‡¶§‡¶ø ‡¶¨‡¶∞‡ßç‡¶ó ‡¶Æ‡¶ø‡¶ü‡¶æ‡¶∞‡ßá ‡ß™,‡ß¶‡ß¶‡ß¶ (‡¶ö‡¶æ‡¶∞ ‡¶π‡¶æ‡¶ú‡¶æ‡¶∞) ‡¶ü‡¶æ‡¶ï‡¶æ ‡¶è‡¶¨‡¶Ç ‡ß®‡ß¶‡ß¶ ‡¶¨‡¶∞‡ßç‡¶ó‡¶Æ‡¶ø‡¶ü‡¶æ‡¶∞‡ßá‡¶∞ ‡¶Ö‡¶ß‡¶ø‡¶ï ‡¶π‡¶≤‡ßá ‡¶™‡ßç‡¶∞‡¶§‡¶ø ‡¶¨‡¶∞‡ßç‡¶ó ‡¶Æ‡¶ø‡¶ü‡¶æ‡¶∞‡ßá ‡ß¨,‡ß¶‡ß¶‡ß¶ (‡¶õ‡¶Ø‡¶º ‡¶π‡¶æ‡¶ú‡¶æ‡¶∞) ‡¶ü‡¶æ‡¶ï‡¶æ‡•§

(‡¶ñ) ‡¶Ü‡¶á‡¶®‡¶ø ‡¶≠‡¶ø‡¶§‡ßç‡¶§‡¶ø:
‡¶™‡ßç‡¶∞‡¶•‡¶Æ ‡¶§‡¶´‡¶∏‡¶ø‡¶≤, ‡¶ß‡¶æ‡¶∞‡¶æ ‡ß®‡ß™ ‡¶¶‡ßç‡¶∞‡¶∑‡ßç‡¶ü‡¶¨‡ßç‡¶Ø, ‡¶Ö‡¶Ç‡¶∂ ‡ßß, ‡¶∏‡¶æ‡¶∞‡¶£‡ßÄ ‡¶Ö‡¶®‡ßÅ‡¶Ø‡¶æ‡ßü‡ßÄ ‡¶¨‡¶®‡¶æ‡¶®‡ßÄ ‡¶è‡¶≤‡¶æ‡¶ï‡¶æ‡ßü ‡¶¨‡¶ø‡¶®‡¶ø‡ßü‡ßã‡¶ó‡¶ï‡ßÉ‡¶§ ‡¶¨‡¶ø‡¶≤‡ßç‡¶°‡¶ø‡¶Ç ‡¶¨‡¶æ ‡¶Ö‡ßç‡¶Ø‡¶æ‡¶™‡¶æ‡¶∞‡ßç‡¶ü‡¶Æ‡ßá‡¶®‡ßç‡¶ü‡ßá‡¶∞ ‡¶ï‡¶∞‡¶π‡¶æ‡¶∞ ‡¶®‡¶ø‡¶∞‡ßç‡¶ß‡¶æ‡¶∞‡¶ø‡¶§ ‡¶π‡ßü‡

In [47]:
response = rag_chain.invoke({"input":"‡¶Ü‡¶™‡¶ø‡¶≤  ‡¶ü‡ßç‡¶∞‡¶æ‡¶á‡¶¨‡ßÅ‡¶®‡ßç‡¶Ø‡¶æ‡¶≤‡ßá ‡¶Ü‡¶™‡¶ø‡¶≤ ‡¶ï‡¶∞‡¶æ‡¶∞ ‡¶®‡¶ø‡ßü‡¶Æ¬†‡¶ï‡¶ø?" })
answer = response.get("answer", "No answer found")
print(answer)

**‡¶∏‡¶Ç‡¶ï‡ßç‡¶∑‡¶ø‡¶™‡ßç‡¶§ ‡¶â‡¶§‡ßç‡¶§‡¶∞:**
‡¶Ü‡¶™‡¶ø‡¶≤ ‡¶ü‡ßç‡¶∞‡¶æ‡¶á‡¶¨‡ßç‡¶Ø‡ßÅ‡¶®‡¶æ‡¶≤‡ßá ‡¶Ü‡¶™‡¶ø‡¶≤ ‡¶ï‡¶∞‡¶§‡ßá ‡¶π‡¶≤‡ßá, ‡¶®‡¶ø‡¶∞‡ßç‡¶ß‡¶æ‡¶∞‡¶ø‡¶§ ‡¶´‡¶∞‡¶Æ‡ßá ‡¶è‡¶¨‡¶Ç ‡¶™‡¶¶‡ßç‡¶ß‡¶§‡¶ø‡¶§‡ßá, ‡¶Ü‡¶™‡¶ø‡¶≤‡ßá‡¶∞ ‡¶ú‡¶®‡ßç‡¶Ø ‡¶®‡¶ø‡¶∞‡ßç‡¶¶‡¶ø‡¶∑‡ßç‡¶ü ‡¶∏‡¶Æ‡¶Ø‡¶º‡¶∏‡ßÄ‡¶Æ‡¶æ‡¶∞ ‡¶Æ‡¶ß‡ßç‡¶Ø‡ßá, ‡¶Ø‡¶•‡¶æ‡¶Ø‡¶• ‡¶´‡¶ø ‡¶¶‡¶ø‡¶Ø‡¶º‡ßá, ‡¶Ü‡¶¨‡ßá‡¶¶‡¶® ‡¶¶‡¶æ‡¶ñ‡¶ø‡¶≤ ‡¶ï‡¶∞‡¶§‡ßá ‡¶π‡¶¨‡ßá‡•§ ‡¶Ü‡¶¨‡ßá‡¶¶‡¶®‡¶™‡¶§‡ßç‡¶∞‡ßá‡¶∞ ‡¶∏‡¶ô‡ßç‡¶ó‡ßá ‡¶™‡ßç‡¶∞‡¶æ‡¶∏‡¶ô‡ßç‡¶ó‡¶ø‡¶ï ‡¶¶‡¶≤‡¶ø‡¶≤‡¶æ‡¶¶‡¶ø ‡¶∏‡¶Ç‡¶Ø‡ßÅ‡¶ï‡ßç‡¶§ ‡¶ï‡¶∞‡¶§‡ßá ‡¶π‡¶¨‡ßá ‡¶è‡¶¨‡¶Ç ‡¶®‡¶ø‡¶∞‡ßç‡¶ß‡¶æ‡¶∞‡¶ø‡¶§ ‡¶´‡¶∞‡¶Æ‡ßç‡¶Ø‡¶æ‡¶ü‡ßá ‡¶Ü‡¶™‡¶ø‡¶≤ ‡¶¶‡¶æ‡¶ñ‡¶ø‡¶≤‡ßá‡¶∞ ‡¶ú‡¶®‡ßç‡¶Ø ‡¶®‡¶ø‡¶∞‡ßç‡¶¶‡ßá‡¶∂‡¶®‡¶æ ‡¶Ö‡¶®‡ßÅ‡¶∏‡¶∞‡¶£ ‡¶ï‡¶∞‡¶§‡ßá ‡¶π‡¶¨‡ßá‡•§ ‡¶è‡¶õ‡¶æ‡¶°‡¶º‡¶æ‡¶ì, ‡¶Ü‡¶™‡¶ø‡¶≤‡ßá‡¶∞ ‡¶ú‡¶®‡ßç‡¶Ø ‡¶®‡¶ø‡¶∞‡ßç‡¶¶‡¶ø‡¶∑‡ßç‡¶ü ‡¶∂‡¶∞‡ßç‡¶§ ‡¶ì ‡¶™‡ßç‡¶∞‡¶ï‡ßç‡¶∞‡¶ø‡¶Ø‡¶º‡¶æ ‡¶Ö‡¶®‡ßÅ‡¶∏‡¶∞‡¶£ ‡¶ï‡¶∞‡¶§‡ßá ‡¶π‡¶¨‡ßá‡•§

**‡¶Ü‡¶á‡¶®‡¶ø ‡¶≠‡¶ø‡¶§‡ßç‡¶§‡¶ø:**
‡¶Ü‡¶á‡¶® ‡¶Ö‡¶®‡ßÅ‡¶Ø‡¶æ‡¶Ø‡¶º‡ßÄ,

In [49]:
response = rag_chain.invoke({"input":"‡¶Ü‡¶™‡¶ø‡¶≤ ‡¶®‡¶ø‡¶∏‡ßç‡¶™‡¶§‡ßç‡¶§‡¶ø‡¶∞ ‡¶ï‡ßç‡¶∑‡ßá‡¶§‡ßç‡¶∞‡ßá ‡¶ï‡¶ø ‡¶™‡¶¶‡ßç‡¶ß‡¶§‡¶ø ‡¶Ö‡¶¨‡¶≤‡¶Æ‡ßç‡¶¨‡¶® ‡¶ï‡¶∞‡¶§‡ßá ‡¶π‡¶¨‡ßá?" })
answer = response.get("answer", "No answer found")
print(answer)

(‡¶ï) ‡¶∏‡¶Ç‡¶ï‡ßç‡¶∑‡¶ø‡¶™‡ßç‡¶§ ‡¶â‡¶§‡ßç‡¶§‡¶∞:
‡¶Ü‡¶™‡¶ø‡¶≤ ‡¶®‡¶ø‡¶∏‡ßç‡¶™‡¶§‡ßç‡¶§‡¶ø‡¶∞ ‡¶ú‡¶®‡ßç‡¶Ø ‡¶Ü‡¶™‡¶ø‡¶≤ ‡¶ü‡ßç‡¶∞‡¶æ‡¶á‡¶¨‡ßç‡¶Ø‡ßÅ‡¶®‡¶æ‡¶≤ ‡¶®‡¶ø‡¶∞‡ßç‡¶ß‡¶æ‡¶∞‡¶ø‡¶§ ‡¶§‡¶æ‡¶∞‡¶ø‡¶ñ‡ßá ‡¶∂‡ßÅ‡¶®‡¶æ‡¶®‡¶ø ‡¶ï‡¶∞‡ßá, ‡¶™‡¶ï‡ßç‡¶∑‡¶¶‡ßá‡¶∞ ‡¶∂‡ßÅ‡¶®‡ßá ‡¶è‡¶¨‡¶Ç ‡¶™‡ßç‡¶∞‡¶Ø‡¶º‡ßã‡¶ú‡¶®‡ßÄ‡¶Ø‡¶º ‡¶¶‡¶≤‡¶ø‡¶≤‡¶æ‡¶¶‡¶ø ‡¶ì ‡¶∏‡¶æ‡¶ï‡ßç‡¶∑‡ßç‡¶Ø ‡¶ó‡ßç‡¶∞‡¶π‡¶£‡ßá‡¶∞ ‡¶Æ‡¶æ‡¶ß‡ßç‡¶Ø‡¶Æ‡ßá ‡¶∏‡¶ø‡¶¶‡ßç‡¶ß‡¶æ‡¶®‡ßç‡¶§ ‡¶ó‡ßç‡¶∞‡¶π‡¶£ ‡¶ï‡¶∞‡ßá‡•§ ‡¶∏‡¶ø‡¶¶‡ßç‡¶ß‡¶æ‡¶®‡ßç‡¶§‡ßá‡¶∞ ‡¶ú‡¶®‡ßç‡¶Ø ‡¶∏‡¶Ç‡¶ñ‡ßç‡¶Ø‡¶æ‡¶ó‡¶∞‡¶ø‡¶∑‡ßç‡¶† ‡¶Æ‡¶§‡¶æ‡¶Æ‡¶§ ‡¶ó‡ßç‡¶∞‡¶π‡¶£ ‡¶ï‡¶∞‡¶æ ‡¶π‡¶Ø‡¶º‡•§ ‡¶Ø‡¶¶‡¶ø ‡¶Æ‡¶§‡¶≠‡ßá‡¶¶ ‡¶π‡¶Ø‡¶º, ‡¶§‡¶¨‡ßá ‡¶≤‡¶ø‡¶ñ‡¶ø‡¶§ ‡¶∏‡¶ø‡¶¶‡ßç‡¶ß‡¶æ‡¶®‡ßç‡¶§ ‡¶¶‡ßá‡¶Ø‡¶º‡¶æ ‡¶π‡¶Ø‡¶º‡•§ ‡¶è‡¶õ‡¶æ‡¶°‡¶º‡¶æ, ‡¶¨‡ßá‡¶û‡ßç‡¶ö‡ßá‡¶∞ ‡¶∏‡¶¶‡¶∏‡ßç‡¶Ø‡¶ó‡¶£ ‡¶∏‡¶Æ‡¶æ‡¶®‡¶≠‡¶æ‡¶¨‡ßá ‡¶¨‡¶ø‡¶≠‡¶ï‡ßç‡¶§ ‡¶π‡¶≤‡ßá ‡¶™‡ßç‡¶∞‡ßá‡¶∏‡¶ø‡¶°‡ßá‡¶®‡ßç‡¶ü‡ßá‡¶∞ ‡¶®‡¶ø‡¶∞‡ßç‡¶¶‡ßá‡¶∂‡ßá ‡¶Ö‡¶§‡¶ø‡¶∞‡¶ø‡¶ï‡ßç‡¶§ ‡¶∏‡¶¶‡¶∏‡ßç‡¶Ø ‡¶®‡¶ø‡¶Ø‡¶º‡ßã‡¶ó ‡¶ï‡¶∞‡¶æ ‡¶π‡¶Ø‡¶º‡•§ ‡¶ï‡¶æ‡¶∞‡ßç

In [16]:
response = rag_chain.invoke({"input":"‡¶Ü‡ßü ‡¶ó‡ßã‡¶™‡¶® ‡¶ï‡¶∞‡¶≤‡ßá ‡¶ï‡¶ø ‡¶™‡¶∞‡¶ø‡¶Æ‡¶æ‡¶® ‡¶ú‡¶∞‡¶ø‡¶Æ‡¶æ‡¶®‡¶æ ‡¶π‡¶§‡ßá ‡¶™‡¶æ‡¶∞‡ßá? " })
answer = response.get("answer", "No answer found")
print(answer)

(‡¶ï) ‡¶∏‡¶Ç‡¶ï‡ßç‡¶∑‡¶ø‡¶™‡ßç‡¶§ ‡¶â‡¶§‡ßç‡¶§‡¶∞:
‡¶Ü‡ßü ‡¶ó‡ßã‡¶™‡¶® ‡¶ï‡¶∞‡¶≤‡ßá ‡¶ï‡¶∞‡¶¶‡¶æ‡¶§‡¶æ‡¶∞ ‡¶â‡¶™‡¶∞ ‡¶´‡¶æ‡¶Å‡¶ï‡¶ø ‡¶¶‡ßá‡¶ì‡¶Ø‡¶º‡¶æ ‡¶Ö‡¶ô‡ßç‡¶ï‡ßá‡¶∞ ‡ßß‡ß´% √ó (‡ßß + ‡ßß‡ß¶% √ó ‡¶ó) ‡¶™‡¶∞‡¶ø‡¶Æ‡¶æ‡¶® ‡¶ú‡¶∞‡¶ø‡¶Æ‡¶æ‡¶®‡¶æ ‡¶Ü‡¶∞‡ßã‡¶™‡¶ø‡¶§ ‡¶π‡¶§‡ßá ‡¶™‡¶æ‡¶∞‡ßá, ‡¶Ø‡ßá‡¶ñ‡¶æ‡¶®‡ßá ‡¶ó ‡¶π‡¶≤‡ßã ‡¶Ö‡¶∏‡¶§‡ßç‡¶Ø ‡¶§‡¶•‡ßç‡¶Ø ‡¶™‡ßç‡¶∞‡¶¶‡¶∞‡ßç‡¶∂‡¶®‡ßá‡¶∞ ‡¶¨‡¶õ‡¶∞ ‡¶•‡ßá‡¶ï‡ßá ‡¶â‡¶¶‡¶ò‡¶æ‡¶ü‡¶ø‡¶§ ‡¶¨‡¶õ‡¶∞ ‡¶™‡¶∞‡ßç‡¶Ø‡¶®‡ßç‡¶§ ‡¶Æ‡ßã‡¶ü ‡¶¨‡¶õ‡¶∞ ‡¶∏‡¶Ç‡¶ñ‡ßç‡¶Ø‡¶æ‡•§

(‡¶ñ) ‡¶Ü‡¶á‡¶®‡¶ø ‡¶≠‡¶ø‡¶§‡ßç‡¶§‡¶ø:
‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ‡¶¶‡ßá‡¶∂ ‡¶Ü‡¶Ø‡¶º‡¶ï‡¶∞ ‡¶Ü‡¶á‡¶®‡ßá‡¶∞ ‡¶ß‡¶æ‡¶∞‡¶æ ‡ß®‡ß≠‡ß® ‡¶Ö‡¶®‡ßÅ‡¶Ø‡¶æ‡¶Ø‡¶º‡ßÄ, ‡¶Ü‡¶Ø‡¶º ‡¶ó‡ßã‡¶™‡¶® ‡¶¨‡¶æ ‡¶Ö‡¶∏‡¶§‡ßç‡¶Ø ‡¶§‡¶•‡ßç‡¶Ø ‡¶™‡ßç‡¶∞‡¶¶‡¶∞‡ßç‡¶∂‡¶®‡ßá‡¶∞ ‡¶ú‡¶®‡ßç‡¶Ø ‡¶ú‡¶∞‡¶ø‡¶Æ‡¶æ‡¶®‡¶æ ‡¶ß‡¶æ‡¶∞‡ßç‡¶Ø ‡¶ï‡¶∞‡¶æ ‡¶π‡¶Ø‡¶º‡•§

(‡¶ó) ‡¶¨‡¶ø‡¶∏‡ßç‡¶§‡¶æ‡¶∞‡¶ø‡¶§ ‡¶¨‡ßç‡¶Ø‡¶æ‡¶ñ‡ßç‡¶Ø‡¶æ:
‡ßß. ‡¶Ø‡¶¶‡¶ø ‡¶ï‡ßã‡¶®‡ßã ‡¶¨‡ßç‡¶Ø‡¶ï‡ßç‡¶§‡¶ø ‡¶ï‡¶∞‡¶¶‡¶æ‡¶§‡¶æ‡¶∞ ‡¶™‡ßç‡¶∞‡¶¶‡ßá‡¶Ø‡¶º ‡¶Ü‡¶Ø‡¶º, ‡¶∏‡¶Æ‡ßç‡¶™‡¶¶, ‡¶¶‡¶æ‡

In [13]:
# Print retrieved context details
context_docs = response.get("context", [])
print(f"üìö Retrieved {len(context_docs)} relevant chunks:")
print(context_docs)

üìö Retrieved 15 relevant chunks:
[Document(id='487d8d47-f58c-49de-bd01-57a9919869d8', metadata={'chunk_id': 'chunk_001', 'chunk_index': 0.0, 'chunk_tokens': 1670.0, 'chunk_type': 'special tax rates on investment', 'keywords': ['‡¶¨‡¶ø‡¶®‡¶ø‡¶Ø‡¶º‡ßã‡¶ó', '‡¶¨‡¶ø‡¶∂‡ßá‡¶∑ ‡¶ï‡¶∞', '‡¶¨‡¶ø‡¶≤‡ßç‡¶°‡¶ø‡¶Ç', '‡¶Ö‡ßç‡¶Ø‡¶æ‡¶™‡¶æ‡¶∞‡ßç‡¶ü‡¶Æ‡ßá‡¶®‡ßç‡¶ü', '‡¶ï‡¶∞‡¶π‡¶æ‡¶∞', '‡¶™‡ßç‡¶≤‡¶ø‡¶®‡ßç‡¶• ‡¶Ü‡¶Ø‡¶º‡¶§‡¶®'], 'part_chapter': '‡¶Ö‡¶Ç‡¶∂ ‡ßß', 'section_range': 'Section 24 (Reference)', 'source': 'Doc\\11. Income Tax Act, 2023 (22 June 2023)__split__11.txt', 'total_chunks': 26.0}, page_content='‡¶™‡ßÉ‡¶∑‡ßç‡¶†‡¶æ/Page 248\n\n-------------------------------------------------\n\n‡¶™‡ßç‡¶∞‡¶•‡¶Æ ‡¶§‡¶´‡¶∏‡¶ø‡¶≤\n\n‡¶¨‡¶ø‡¶®‡¶ø‡¶Ø‡¶º‡ßã‡¶ó‡ßá ‡¶¨‡¶ø‡¶∂‡ßá‡¶∑ ‡¶ï‡¶∞‡¶π‡¶æ‡¶∞\n\n[‡¶ß‡¶æ‡¶∞‡¶æ ‡ß®‡ß™ ‡¶¶‡ßç‡¶∞‡¶∑‡ßç‡¶ü‡¶¨‡ßç‡¶Ø]\n\n‡¶Ö‡¶Ç‡¶∂ ‡ßß\n\n‡¶¨‡¶ø‡¶∂‡ßá‡¶∑ ‡¶ï‡¶∞ ‡¶™‡ßç‡¶∞‡¶¶‡¶æ‡¶®‡ßá‡¶∞ ‡¶Æ‡¶æ‡¶ß‡ßç‡¶Ø‡¶Æ‡ßá ‡¶¨‡¶ø‡¶®‡¶ø‡¶Ø‡¶º‡ßã‡¶ó ‡¶™‡ßç‡¶∞‡¶¶‡¶∞‡ßç‡¶∂‡¶®\n\n‡ßß‡•§ ‡¶¨‡¶ø‡¶∂‡ßá‡¶∑ ‡

In [14]:
# Display just the content in readable format
for i, doc in enumerate(context_docs, 1):
    print(f"\n--- Chunk {i} ---")
    print(f"Source: {doc.metadata.get('source', 'Unknown')}")
    print(f"Content: {doc.page_content}")
    print("-" * 40)


--- Chunk 1 ---
Source: Doc\11. Income Tax Act, 2023 (22 June 2023)__split__11.txt
Content: ‡¶™‡ßÉ‡¶∑‡ßç‡¶†‡¶æ/Page 248

-------------------------------------------------

‡¶™‡ßç‡¶∞‡¶•‡¶Æ ‡¶§‡¶´‡¶∏‡¶ø‡¶≤

‡¶¨‡¶ø‡¶®‡¶ø‡¶Ø‡¶º‡ßã‡¶ó‡ßá ‡¶¨‡¶ø‡¶∂‡ßá‡¶∑ ‡¶ï‡¶∞‡¶π‡¶æ‡¶∞

[‡¶ß‡¶æ‡¶∞‡¶æ ‡ß®‡ß™ ‡¶¶‡ßç‡¶∞‡¶∑‡ßç‡¶ü‡¶¨‡ßç‡¶Ø]

‡¶Ö‡¶Ç‡¶∂ ‡ßß

‡¶¨‡¶ø‡¶∂‡ßá‡¶∑ ‡¶ï‡¶∞ ‡¶™‡ßç‡¶∞‡¶¶‡¶æ‡¶®‡ßá‡¶∞ ‡¶Æ‡¶æ‡¶ß‡ßç‡¶Ø‡¶Æ‡ßá ‡¶¨‡¶ø‡¶®‡¶ø‡¶Ø‡¶º‡ßã‡¶ó ‡¶™‡ßç‡¶∞‡¶¶‡¶∞‡ßç‡¶∂‡¶®

‡ßß‡•§ ‡¶¨‡¶ø‡¶∂‡ßá‡¶∑ ‡¶ï‡¶∞ ‡¶™‡ßç‡¶∞‡¶¶‡¶æ‡¶®‡ßá‡¶∞ ‡¶Æ‡¶æ‡¶ß‡ßç‡¶Ø‡¶Æ‡ßá ‡¶¨‡¶ø‡¶≤‡ßç‡¶°‡¶ø‡¶Ç ‡¶¨‡¶æ ‡¶Ö‡ßç‡¶Ø‡¶æ‡¶™‡¶æ‡¶∞‡ßç‡¶ü‡¶Æ‡ßá‡¶®‡ßç‡¶ü ‡¶¨‡¶ø‡¶®‡¶ø‡¶Ø‡¶º‡ßã‡¶ó ‡¶™‡ßç‡¶∞‡¶¶‡¶∞‡ßç‡¶∂‡¶®‡•§‚Äî(‡ßß) ‡¶ï‡ßã‡¶®‡ßã ‡¶∏‡ßç‡¶¨‡¶æ‡¶≠‡¶æ‡¶¨‡¶ø‡¶ï ‡¶¨‡ßç‡¶Ø‡¶ï‡ßç‡¶§‡¶ø ‡¶¨‡¶ø‡¶≤‡ßç‡¶°‡¶ø‡¶Ç ‡¶¨‡¶æ ‡¶Ö‡ßç‡¶Ø‡¶æ‡¶™‡¶æ‡¶∞‡ßç‡¶ü‡¶Æ‡ßá‡¶®‡ßç‡¶ü ‡¶®‡¶ø‡¶∞‡ßç‡¶Æ‡¶æ‡¶£ ‡¶¨‡¶æ ‡¶ï‡ßç‡¶∞‡¶Ø‡¶º‡ßá ‡¶ï‡ßã‡¶®‡ßã ‡¶Ö‡¶∞‡ßç‡¶• ‡¶¨‡¶ø‡¶®‡¶ø‡¶Ø‡¶º‡ßã‡¶ó ‡¶ï‡¶∞‡¶ø‡¶≤‡ßá ‡¶â‡¶ï‡ßç‡¶§ ‡¶¨‡¶ø‡¶®‡¶ø‡¶Ø‡¶º‡ßã‡¶ó‡¶ï‡ßÉ‡¶§ ‡¶Ö‡¶∞‡ßç‡¶•‡ßá‡¶∞ ‡¶â‡ßé‡¶∏ ‡¶∏‡¶Æ‡ßç‡

In [None]:
# # ==========================================
# # STEP 7: Test the Enhanced System
# # ==========================================

# def test_enhanced_rag(query: str):
#     """Test the enhanced RAG system with detailed output"""
#     print(f"\nüîç Testing Query: '{query}'")
#     print("=" * 60)
    
#     # Get response
#     response = rag_chain.invoke({"input": query})
#     answer = response.get("answer", "No answer found")
    
#     # Print retrieved context details
#     context_docs = response.get("context", [])
#     print(f"üìö Retrieved {len(context_docs)} relevant chunks:")
    
#     for i, doc in enumerate(context_docs[:3]):  # Show first 3
#         metadata = doc.metadata
#         print(f"\nChunk {i+1}:")
#         print(f"  üìÑ Source: {metadata.get('source', 'Unknown')}")
#         print(f"  üìä Type: {metadata.get('chunk_type', 'Unknown')}")
#         print(f"  üèõÔ∏è Act: {metadata.get('act_name', 'Not specified')}")
#         print(f"  üìã Section: {metadata.get('section_range', 'Not specified')}")
#         print(f"  üî§ Keywords: {metadata.get('keywords', [])}")
#         print(f"  üìù Content preview: {doc.page_content[:150]}...")
    
#     print(f"\nü§ñ Generated Answer:")
#     print("-" * 40)
#     print(answer)
#     print("=" * 60)

# # Test the system
# test_queries = [
#     "‡¶ï‡ßã‡¶Æ‡ßç‡¶™‡¶æ‡¶®‡¶ø ‡¶¨‡¶≤‡¶§‡ßá ‡¶ï‡ßã‡¶® ‡¶ï‡ßã‡¶® ‡¶∏‡¶§‡ßç‡¶§‡¶æ ‡¶Ö‡¶®‡ßç‡¶§‡¶∞‡ßç‡¶≠‡ßÅ‡¶ï‡ßç‡¶§?",
#     "‡¶Ü‡¶Ø‡¶º‡¶ï‡¶∞ ‡¶π‡¶æ‡¶∞ ‡¶ï‡¶§?",
#     "‡¶™‡¶∞‡¶ø‡¶ö‡¶æ‡¶≤‡¶ï ‡¶®‡¶ø‡¶Ø‡¶º‡ßã‡¶ó‡ßá‡¶∞ ‡¶®‡¶ø‡¶Ø‡¶º‡¶Æ ‡¶ï‡¶ø?",
# ]

# for query in test_queries:
#     test_enhanced_rag(query)

# print("\nüéâ LLM-based Chunking RAG System Ready!")
# print("‚ú® Features:")
# print("  - Intelligent legal document chunking")
# print("  - Hierarchical structure preservation") 
# print("  - Enhanced metadata extraction")
# print("  - Context-aware retrieval")
# print("  - Bilingual support (Bangla/English)")

In [24]:
def ask_legal_question(question: str, show_context: bool = False, k: int = 5):
    """
    Simple interface for users to ask legal questions
    
    Args:
        question (str): The legal question in Bangla or English
        show_context (bool): Whether to show retrieved context chunks
        k (int): Number of relevant chunks to retrieve
    
    Returns:
        str: The legal assistant's answer
    """
    
    if not question.strip():
        return "‡¶¶‡¶Ø‡¶º‡¶æ ‡¶ï‡¶∞‡ßá ‡¶è‡¶ï‡¶ü‡¶ø ‡¶™‡ßç‡¶∞‡¶∂‡ßç‡¶® ‡¶≤‡¶ø‡¶ñ‡ßÅ‡¶®‡•§ / Please enter a question."
    
    try:
        print(f"\nüîç ‡¶™‡ßç‡¶∞‡¶∂‡ßç‡¶® / Question: {question}")
        print("=" * 60)
        
        # Update retriever with new k value if different
        if k != 5:
            global retriever
            retriever = vectorstore.as_retriever(
                search_type="similarity", 
                search_kwargs={'k': k}
            )
        
        # Get response from RAG chain
        response = rag_chain.invoke({"input": question})
        answer = response.get("answer") or response.get("result") or str(response)
        
        # Show context if requested
        if show_context:
            context_docs = response.get("context", [])
            print(f"\nüìö Retrieved {len(context_docs)} relevant chunks:")
            print("-" * 40)
            
            for i, doc in enumerate(context_docs):
                metadata = doc.metadata
                print(f"\nChunk {i+1}:")
                print(f"  üìÑ Source: {metadata.get('source', 'Unknown')}")
                print(f"  üèõÔ∏è Act: {metadata.get('act_name', 'Not specified')}")
                print(f"  üìã Section: {metadata.get('section_range', 'Not specified')}")
                print(f"  üìù Preview: {doc.page_content[:150]}...")
                print("  " + "-" * 35)
        
        print(f"\nü§ñ ‡¶â‡¶§‡ßç‡¶§‡¶∞ / Answer:")
        print("-" * 40)
        print(answer)
        print("=" * 60)
        
        return answer
        
    except Exception as e:
        error_msg = f"‚ùå Error processing question: {str(e)}"
        print(error_msg)
        return error_msg

def interactive_legal_assistant():
    """
    Interactive mode - continuous question answering
    """
    print("\nüèõÔ∏è ‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ‡¶¶‡ßá‡¶∂ ‡¶Ü‡¶á‡¶®‡¶ø ‡¶∏‡¶π‡¶æ‡¶Ø‡¶º‡¶ï / Bangladesh Legal Assistant")
    print("=" * 60)
    print("üìù Instructions:")
    print("  - Ask questions in Bangla or English")
    print("  - Type 'exit' or 'quit' to stop")
    print("  - Type 'context' to show retrieved context")
    print("  - Type 'help' for more commands")
    print("=" * 60)
    
    show_context = False
    
    while True:
        try:
            user_input = input("\n‚ùì ‡¶Ü‡¶™‡¶®‡¶æ‡¶∞ ‡¶™‡ßç‡¶∞‡¶∂‡ßç‡¶® / Your Question: ").strip()
            
            if user_input.lower() in ['exit', 'quit', '‡¶¨‡ßá‡¶∞ ‡¶π‡¶®', '‡¶¨‡¶®‡ßç‡¶ß']:
                print("\nüëã ‡¶ß‡¶®‡ßç‡¶Ø‡¶¨‡¶æ‡¶¶! / Thank you!")
                break
                
            elif user_input.lower() in ['context', '‡¶ï‡¶®‡¶ü‡ßá‡¶ï‡ßç‡¶∏‡¶ü']:
                show_context = not show_context
                status = "ON" if show_context else "OFF"
                print(f"üìö Context display: {status}")
                continue
                
            elif user_input.lower() in ['help', '‡¶∏‡¶æ‡¶π‡¶æ‡¶Ø‡ßç‡¶Ø']:
                print("\nüìã Available commands:")
                print("  - context: Toggle context display")
                print("  - exit/quit: Exit the assistant")
                print("  - help: Show this help")
                print("  - Just ask any legal question!")
                continue
                
            elif not user_input:
                print("‚ö†Ô∏è Please enter a question.")
                continue
            
            # Process the question
            ask_legal_question(user_input, show_context=show_context)
            
        except KeyboardInterrupt:
            print("\n\nüëã Assistant stopped. ‡¶ß‡¶®‡ßç‡¶Ø‡¶¨‡¶æ‡¶¶! / Thank you!")
            break
        except Exception as e:
            print(f"\n‚ùå Unexpected error: {e}")

# # Quick test function
# def test_legal_rag():
#     """Test the legal RAG system with sample questions"""
    
#     test_questions = [
#         "‡¶ï‡ßã‡¶Æ‡ßç‡¶™‡¶æ‡¶®‡¶ø ‡¶¨‡¶≤‡¶§‡ßá ‡¶ï‡ßã‡¶® ‡¶ï‡ßã‡¶® ‡¶∏‡¶§‡ßç‡¶§‡¶æ ‡¶Ö‡¶®‡ßç‡¶§‡¶∞‡ßç‡¶≠‡ßÅ‡¶ï‡ßç‡¶§?",
#         "‡¶Ü‡¶Ø‡¶º‡¶ï‡¶∞ ‡¶π‡¶æ‡¶∞ ‡¶ï‡¶§?",
#         "‡¶™‡¶∞‡¶ø‡¶ö‡¶æ‡¶≤‡¶ï ‡¶®‡¶ø‡¶Ø‡¶º‡ßã‡¶ó‡ßá‡¶∞ ‡¶®‡¶ø‡¶Ø‡¶º‡¶Æ ‡¶ï‡¶ø?",
#         "What is the definition of company?",
#         "Tax rates in Bangladesh"
#     ]
    
#     print("\nüß™ Testing Legal RAG System with Sample Questions:")
#     print("=" * 60)
    
#     for i, question in enumerate(test_questions, 1):
#         print(f"\nüîç Test {i}: {question}")
#         answer = ask_legal_question(question)
#         print("\n" + "=" * 60)