In [None]:
# Install required packages
%pip install langchain
%pip install langchain_community
%pip install unstructured
%pip install langchain_openai
%pip install langchain_groq
%pip install langchain_pinecone
%pip install python-magic-bin
%pip install python-dotenv
%pip install rank_bm25



In [1]:
import hashlib
import json
import math
import os
from typing import List, Dict, Any

import tiktoken
from dotenv import load_dotenv
from langchain_community.document_loaders import DirectoryLoader
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_core.documents import Document
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_groq import ChatGroq
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone, ServerlessSpec

  from .autonotebook import tqdm as notebook_tqdm

For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  from langchain_pinecone.vectorstores import Pinecone, PineconeVectorStore


In [2]:
# Load variables from a local .env file into the process environment
load_dotenv()

# ---------- Tokenizer ----------
# Shared encoder instance; every token count in this notebook goes through it.
tokenizer = tiktoken.get_encoding("cl100k_base")


def count_tokens(text: str) -> int:
    """Return how many tokens `text` occupies under the module-level `tokenizer`."""
    token_ids = tokenizer.encode(text)
    return len(token_ids)

In [3]:
# ==========================================
# STEP 1: Load Documents
# ==========================================

# Recursively collect every .txt file below the SRO/ folder; the encoding
# option is handed through to the underlying per-file loader.
dir_loader = DirectoryLoader(
    "SRO",
    glob="**/*.txt",
    show_progress=True,
    loader_kwargs={'encoding': 'utf-8'},
)

documents = dir_loader.load()

# Per-file summary: origin path, character count and token count
print(f"Loaded {len(documents)} documents")
for doc_number, doc in enumerate(documents, start=1):
    body = doc.page_content
    print(f"\nDocument {doc_number}:")
    print(f"  Source: {doc.metadata['source']}")
    print(f"  Length: {len(body)} characters")
    print(f"  Tokens: {count_tokens(body)}")

100%|██████████| 3/3 [00:04<00:00,  1.35s/it]

Loaded 3 documents

Document 1:
  Source: SRO\05. SRO - Amendment of Sixth Schedule (09 October 2024)_complete_transcription.txt
  Length: 8017 characters
  Tokens: 8460

Document 2:
  Source: SRO\06. SRO - Amendment of Sixth Schedule (03 December 2024)_complete_transcription.txt
  Length: 1400 characters
  Tokens: 1192

Document 3:
  Source: SRO\07. SRO - Amendment of Sixth Schedule (17 December 2024)_complete_transcription.txt
  Length: 1363 characters
  Tokens: 1147





In [4]:
# ==========================================
# STEP 2: Setup OpenAI Embeddings & LLM for Chunking
# ==========================================

# Embedding model: used for the smoke test below and later handed to the vector store
embeddings = OpenAIEmbeddings(
    api_key=os.getenv("OPENAI_API_KEY"),
    model="text-embedding-3-large",
)

# Chat model that performs the metadata-extraction ("chunking") step
chunking_llm_sro = ChatOpenAI(
    model="gpt-4.1",
    temperature=0.1,   # low temperature to keep the structured output stable
    max_tokens=None,   # no explicit cap on the completion length
    api_key=os.getenv("OPENAI_API_KEY"),
)

print("OpenAI Embedding Model and Chunking LLM loaded successfully!")
print(chunking_llm_sro)

# Smoke-test the embedding endpoint and report the resulting vector size
query_result = embeddings.embed_query("Hello world")
print("Embedding dimension:", len(query_result))

OpenAI Embedding Model and Chunking LLM loaded successfully!
client=<openai.resources.chat.completions.completions.Completions object at 0x000001F02FD09FD0> async_client=<openai.resources.chat.completions.completions.AsyncCompletions object at 0x000001F02FD0A3C0> root_client=<openai.OpenAI object at 0x000001F02FDAD810> root_async_client=<openai.AsyncOpenAI object at 0x000001F02FDADE50> model_name='gpt-4.1' temperature=0.1 model_kwargs={} openai_api_key=SecretStr('**********')
Embedding dimension: 3072


In [None]:
# Multi-chunk variant (currently disabled): the LLM decides how to split each document and generates the chunks according to the prompt below.

# chunking_system_prompt_sro = """
# You are an expert in Bangladesh Gazette SROs (Statutory Regulatory Orders) and legal text processing.
# Your task is to intelligently CHUNK **SRO/প্রজ্ঞাপন** documents that amend the **Sixth Schedule (ষষ্ঠ তফসিল)** of the Income-tax Act, 2023 (আয়কর আইন, ২০২৩), while preserving legal hierarchy, semantics, and exact quoted wording.

# === WHAT AN SRO LOOKS LIKE (Bangladesh Gazette) ===
# Typical elements you will encounter (keep them intact and correctly grouped):
# 1) Gazette & header block: “বাংলাদেশ গেজেট / অতিরিক্ত সংখ্যা / কর্তৃপক্ষ কর্তৃক প্রকাশিত”, date line(s),
#    Ministry/Division + NBR, document type “প্রজ্ঞাপন”, Bangla+Gregorian issue dates, **SRO number** (“এস. আর. ও নং …”).
# 2) Authority & target: reference to **আয়কর আইন, ২০২৩, ধারা ৩৪১** (or other authority) and the target instrument
#    (e.g., **ষষ্ঠ তফসিল**, with Part/Section/Clause/Sub-clause identifiers).
# 3) Amendment directives: concrete legal actions such as “বিলুপ্ত করিল”, “সন্নিবেশিত হইবে”, “প্রতিস্থাপিত হইবে”,
#    including the exact Bangla quotation text to be inserted/omitted/replaced.
# 4) Compliance / conditions / cross-refs: e.g., obligations under **ধারা ৭৬(৫) ও (৬)** or other references.
# 5) Effectivity & validity: “অবিলম্বে কার্যকর” and, if present, sunset date (e.g., “৩০ জুন, ২০২৯ তারিখ পর্যন্ত বলবৎ”).
# 6) Sign-off & footer: “জাতীয় রাজস্ব বোর্ডের আদেশক্রমে…”, officer/secretary name/designation,
#    printer/publisher lines, Gazette website, price.

# === CHUNKING RULES (SRO-specific) ===
# A. Preserve legal structure and never split a single amendment directive across chunks.
# B. Keep exact punctuation/quotes, Bangla digits, and diacritics. DO NOT paraphrase amendment text.
# C. Optimal chunk size: target ~1000–2000 tokens; HARD MAX 3000 tokens per chunk.
# D. Short SRO (≤800–1000 tokens): produce a single chunk that still includes all mandatory metadata below.
# E. Grouping guidance (adjust if document length dictates):
#    - Chunk 1: Gazette header + Ministry/Division + NBR + “প্রজ্ঞাপন” + SRO No. + Bangla & Gregorian dates +
#      legal authority (e.g., ITA-2023 s.341) + What is being amended (e.g., Sixth Schedule, Part/Clause).
#    - Chunk 2+: Each distinct amendment directive with its full quoted language and its precise target path
#      (e.g., “অংশ ২, দফা (১), উপ-দফা (ঘ) এর পর … সন্নিবেশিত হইবে …”).
#      Do not break a directive across chunks.
#    - Final chunk (or merged if brief): conditions/compliance + effectivity/validity + sign-off + printing/publishing info.
# F. If multiple directives exist, prefer **one directive per chunk** when feasible; keep consecutive sub-directives together
#    only if very short and tightly related.

# === METADATA REQUIREMENTS (flat; Pinecone-safe) ===
# For every chunk, return **strings/numbers/booleans or list[str] only**. No nested objects.
# Required keys (fill with best-effort if visible; otherwise omit):
# - doc_type: "SRO"
# - sro_no: e.g., "৩৪০-আইন/আয়কর-৪৮/২০২৪"
# - ministry: e.g., "অর্থ মন্ত্রণালয়"
# - issuer: e.g., "জাতীয় রাজস্ব বোর্ড"
# - authority_act: e.g., "আয়কর আইন, ২০২৩"
# - authority_section: e.g., "ধারা ৩৪১"
# - target_instrument: e.g., "ষষ্ঠ তফসিল"
# - target_path: e.g., "অংশ ২ > দফা (১) > উপ-দফা (ঘ)"
# - amendment_action: one of ["insert","omit","repeal","replace","amend","add","substitute"]
# - amendment_unit: e.g., "উপ-দফা", "দফা", "অনুচ্ছেদ"
# - amendment_text_snippet: short excerpt (≤200 chars) from the operative text (no paraphrase)
# - compliance_refs: list of section refs as strings, e.g., ["ধারা ৭৬(৫)","ধারা ৭৬(৬)"]
# - effectivity: e.g., "অবিলম্বে কার্যকর"
# - valid_upto: normalized ISO date "YYYY-MM-DD" if a sunset date is stated; otherwise omit
# - issue_date_bn: Bangla date line as printed (string)
# - issue_date_en: "YYYY-MM-DD" if determinable
# - gazette_date_en: "YYYY-MM-DD" if determinable
# - language: "bn"
# - chunk_type: one of ["header","amendment","conditions","effectivity","signoff","footer","mixed"]
# - chunk_id: stable id you generate, e.g., "SRO-2024-340-01"
# - keywords: list[str] including salient terms
# Optional (include if visible): part, chapter, section, clause, sub_clause (as strings); website; price_taka (number).

# === OUTPUT FORMAT (STRICT) ===
# Return a SINGLE top-level JSON object exactly like:
# ```json
# {{
#   "chunks": [
#     {{
#       "content": "<full chunk text, including exact quotes and headers>",
#       "metadata": {{
#         "doc_type": "SRO",
#         "sro_no": "...",
#         "ministry": "...",
#         "issuer": "...",
#         "authority_act": "...",
#         "authority_section": "...",
#         "target_instrument": "...",
#         "target_path": "...",
#         "amendment_action": "...",
#         "amendment_unit": "...",
#         "amendment_text_snippet": "...",
#         "compliance_refs": ["...", "..."],
#         "effectivity": "...",
#         "valid_upto": "YYYY-MM-DD",
#         "issue_date_bn": "...",
#         "issue_date_en": "YYYY-MM-DD",
#         "gazette_date_en": "YYYY-MM-DD",
#         "language": "bn",
#         "chunk_type": "...",
#         "chunk_id": "SRO-YYYY-NNN-CC",
#         "keywords": ["...", "..."],
#         "part": "...",
#         "chapter": "...",
#         "section": "...",
#         "clause": "...",
#         "sub_clause": "...",
#         "website": "...",
#         "price_taka": 4
#       }}
#     }}
#   ]
# }}
# ```
# === QUALITY & VALIDATION ===
# - Do not exceed 3000 tokens per chunk.
# - Never split a single amendment directive across chunks.
# - Keep exact Bangla spellings, quotes (“ ”) and numerals as printed.
# - Normalize dates to ISO in metadata when you can infer the Gregorian date; also keep Bangla date in metadata.
# - Ensure each chunk carries sufficient header context so it is meaningful when retrieved independently.

# === NOW CHUNK THE FOLLOWING DOCUMENT ===
# """

In [None]:
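# Single-chunk mode: one chunk per file. The LLM only extracts metadata and keywords; the document text itself is not reprinted.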
chunking_system_prompt_sro = """
You are an expert in Bangladesh Gazette SROs (Statutory Regulatory Orders).
SINGLE-CHUNK MODE: Produce exactly ONE chunk that represents the ENTIRE document.
Do NOT reprint the document text. For the 'content' field, output the literal token: <<KEEP_ORIGINAL_TEXT>>.
Your job is to extract rich, flat metadata + many high-recall keywords.

SCOPE
- SROs/প্রজ্ঞাপন that may amend/insert/omit/replace ANY legal instrument or provision:
  schedules, rules, forms, parts, chapters, sections, subsections, clauses, sub-clauses, tables, notes—of ANY Act/Law/Ordinance/Rules (not only Sixth Schedule).
- Keep language-specific text (Bangla) EXACT in metadata snippets.

METADATA (flat; Pinecone-safe)
Include as many as visible (omit if unknown). Strings/numbers/booleans/list[str] only.
- doc_type: "SRO"
- sro_no
- ministry
- issuer
- authority_act            (e.g., "আয়কর আইন, ২০২৩" / "Income-tax Act, 2023")
- authority_section        (e.g., "ধারা ৩৪১" and/or "Section 341")
- target_instrument        (e.g., "ষষ্ঠ তফসিল", "Rules", "Schedule", "Form", "Part", "Chapter", "Section", etc.)
- target_path              (full path like "অংশ ২ > দফা (১) > উপ-দফা (ঘ)" OR "Part II > Clause (1) > Sub-clause (gha)")
- amendment_action         one of ["insert","omit","repeal","replace","amend","add","substitute"]
- amendment_unit           (e.g., "উপ-দফা","দফা","অনুচ্ছেদ","section","subsection","clause","sub-clause","form","table")
- amendment_text_snippet   ≤200 chars, verbatim excerpt from the operative directive (exact Bangla/English as printed)
- compliance_refs          list[str] (e.g., ["ধারা ৭৬(৫)","ধারা ৭৬(৬)","Section 76(5)"])
- effectivity              (e.g., "অবিলম্বে কার্যকর", "effective immediately")
- valid_upto               "YYYY-MM-DD" if a sunset/cutoff is stated
- issue_date_bn            Bangla date string as printed
- issue_date_en            "YYYY-MM-DD" if determinable
- gazette_date_en          "YYYY-MM-DD" if determinable
- language                 "bn" or "bn+en"
- chunk_type               one of ["header","amendment","conditions","effectivity","signoff","footer","mixed"]
- chunk_id                 e.g., "SRO-YYYY-NNN-01"
- website                  Gazette/NBR URL if printed
- price_taka               number if printed

KEYWORDS (make it BIG, high-recall)
Populate **keywords** with MANY unique, deduplicated items (aim 100–250):
- All law/act/ordinance names (bn + en if present)
- All instrument names/paths (Schedule, Rules, Form, Part, Chapter, Section numbers; e.g., "ধারা ৩৪১","Section 341","অংশ ২","Part II","দফা (১)","Clause (1)","উপ-দফা (ঘ)","Sub-clause (gha)")
- SRO numbers (full), Gazette dates (bn + en), years (e.g., "২০২৪","2024","২০২৯","2029")
- Organizations/persons printed (NBR, Ministry, Foundations/Trusts, signatory names/titles)
- Legal actions/verbs (insert/omit/repeal/replace/amend/add/substitute + Bangla equivalents)
- Tax category words (exemption, deduction, donation, allowance, conditions), domain terms present
- Both **Bangla and English** forms for numerals and headings (e.g., "ধারা ৩৪১", "Section 341"; "৩০ জুন ২০২৯", "2029-06-30")
- Obvious aliases/synonyms/transliterations appearing in the doc
Keep it a list[str]; no paraphrase—use exact tokens/phrases from the document where possible.

STRICT OUTPUT (JSON ONLY; EXACTLY ONE CHUNK)
```json
{{
  "chunks": [
    {{
      "content": "<<KEEP_ORIGINAL_TEXT>>",
      "metadata": {{
        "doc_type": "SRO",
        "sro_no": "...",
        "ministry": "...",
        "issuer": "...",
        "authority_act": "...",
        "authority_section": "...",
        "target_instrument": "...",
        "target_path": "...",
        "amendment_action": "...",
        "amendment_unit": "...",
        "amendment_text_snippet": "...",
        "compliance_refs": ["...", "..."],
        "effectivity": "...",
        "valid_upto": "YYYY-MM-DD",
        "issue_date_bn": "...",
        "issue_date_en": "YYYY-MM-DD",
        "gazette_date_en": "YYYY-MM-DD",
        "language": "bn",
        "chunk_type": "mixed",
        "chunk_id": "SRO-YYYY-NNN-01",
        "website": "...",
        "price_taka": 4,
        "keywords": ["many","many","items","ধারা ৩৪১","Section 341","অংশ ২","Part II","... up to 250+"]
      }}
    }}
  ]
}}
"""

In [13]:
# Two-message prompt: the SRO extraction instructions as the system turn,
# followed by a human turn carrying the text via the {document_text} slot.
single_chunk_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", chunking_system_prompt_sro),
        (
            "human",
            "Document to chunk (analyze for metadata only):\n\n{document_text}",
        ),
    ]
)

def _extract_first_chunk_metadata(raw_response: str) -> Dict[str, Any]:
    """
    Pull the first chunk's ``metadata`` dict out of the LLM's reply.

    The reply is expected to contain one JSON object of the form
    ``{"chunks": [{"content": ..., "metadata": {...}}]}``. Anything before the
    first ``{`` or after the last ``}`` (Markdown fences with or without a
    language tag, stray prose) is discarded before parsing.

    Raises:
        ValueError: if no JSON object is present, the JSON is invalid
            (``json.JSONDecodeError`` is a ``ValueError``), ``chunks`` is
            missing/empty, or ``metadata`` is present but not an object.
    """
    start = raw_response.find("{")
    end = raw_response.rfind("}")
    if start == -1 or end < start:
        raise ValueError("No JSON object found in LLM response.")

    data = json.loads(raw_response[start:end + 1])
    items = data.get("chunks", [])
    if not isinstance(items, list) or not items:
        raise ValueError("No 'chunks' array in response.")

    info = items[0]
    if not isinstance(info, dict):
        raise ValueError("First entry of 'chunks' is not a JSON object.")

    # A missing 'metadata' key is tolerated (empty metadata); a null or
    # non-object value is treated as a malformed reply.
    llm_metadata = info.get("metadata", {})
    if not isinstance(llm_metadata, dict):
        raise ValueError("'metadata' of the first chunk is not a JSON object.")
    return llm_metadata


def llm_single_chunk_sro(document: Document, max_retries: int = 2) -> List[Document]:
    """
    Create exactly one chunk per document.

    The LLM is asked for metadata only; any 'content' it returns is ignored and
    the original text is used verbatim as the chunk content, so no character is
    ever lost.

    Args:
        document: Source document; its ``page_content`` is sent to the LLM and
            its ``metadata`` is the base that LLM-extracted keys are merged into.
        max_retries: Number of additional attempts after the first failed one.
            At least one attempt is always made.

    Returns:
        A one-element list holding the chunk. If every attempt fails, the chunk
        carries minimal fallback metadata (``chunk_id`` = "SRO-FALLBACK-01")
        instead of LLM-extracted metadata.
    """
    chain = single_chunk_prompt | chunking_llm_sro
    source = document.metadata.get("source", "(unknown)")
    total_attempts = max(1, max_retries + 1)

    last_err = None
    for attempt in range(1, total_attempts + 1):
        try:
            resp = chain.invoke({"document_text": document.page_content})
            llm_metadata = _extract_first_chunk_metadata(resp.content)
        except Exception as e:
            # Report the failure instead of swallowing it, then try again.
            last_err = e
            print(f"  ⚠️ Metadata extraction attempt {attempt}/{total_attempts} failed for {source}: {e}")
            continue

        chunk_metadata = document.metadata.copy()
        chunk_metadata.update(llm_metadata)

        # diagnostics (written last so the LLM cannot override them)
        chunk_metadata["chunk_index"] = 0
        chunk_metadata["total_chunks"] = 1
        chunk_metadata["chunk_tokens"] = count_tokens(document.page_content)

        # Build the single chunk with ORIGINAL text verbatim
        return [Document(page_content=document.page_content, metadata=chunk_metadata)]

    # Hard fallback: still produce one chunk, with minimal metadata
    print(f"  ⚠️ Using fallback metadata for {source} (last error: {last_err})")
    fallback_md = document.metadata.copy()
    fallback_md.update({
        "doc_type": "SRO",
        "chunk_type": "mixed",
        "chunk_id": "SRO-FALLBACK-01",
        "chunk_index": 0,
        "total_chunks": 1,
        "chunk_tokens": count_tokens(document.page_content)
    })
    return [Document(page_content=document.page_content, metadata=fallback_md)]

def process_all_documents_single_chunk(documents: List[Document]) -> List[Document]:
    """
    Apply `llm_single_chunk_sro` to each document in order and return the
    resulting chunks as one flat list, printing progress along the way.
    """
    total = len(documents)
    print(f"\n🚀 Single-chunk SRO mode for {total} document(s)")

    collected = []
    for position, doc in enumerate(documents, start=1):
        origin = doc.metadata.get('source', '(unknown)')
        print(f"--- Doc {position}/{total}: {origin}")

        doc_chunks = llm_single_chunk_sro(doc)
        token_count = doc_chunks[0].metadata.get('chunk_tokens', 0)
        print(f"  ✅ Created 1 chunk ({token_count} tokens)")

        collected += doc_chunks
    return collected

In [14]:
# ---------- Run Chunking Over All Files (SRO only) ----------
# Files under 80 tokens are skipped; every other file yields one chunk.
all_chunks: List[Document] = []
print("\n🚀 SRO-only chunking run…")
for file_no, sro_doc in enumerate(documents, start=1):
    print(f"\n--- File {file_no}/{len(documents)} ---")
    token_count = count_tokens(sro_doc.page_content)
    print(f"Tokens: {token_count}")
    if token_count >= 80:
        all_chunks.extend(llm_single_chunk_sro(sro_doc))
    else:
        print("  ⚠️ Very small file → skipping")

print(f"\n✅ Chunking complete. Total SRO chunks: {len(all_chunks)}")
if all_chunks:
    # Preview only the first ten metadata keys of the first chunk
    first_metadata = all_chunks[0].metadata
    preview_keys = list(first_metadata.keys())[:10]
    print("Sample metadata:", {key: first_metadata.get(key) for key in preview_keys})


🚀 SRO-only chunking run…

--- File 1/3 ---
Tokens: 8460

--- File 2/3 ---
Tokens: 1192

--- File 3/3 ---
Tokens: 1147

✅ Chunking complete. Total SRO chunks: 3
Sample metadata: {'source': 'SRO\\05. SRO - Amendment of Sixth Schedule (09 October 2024)_complete_transcription.txt', 'doc_type': 'SRO', 'sro_no': '৩৪০-আইন/আয়কর-৪৮/২০২৪', 'ministry': 'অর্থ মন্ত্রণালয়', 'issuer': 'জাতীয় রাজস্ব বোর্ড', 'authority_act': 'আয়কর আইন, ২০২৩', 'authority_section': 'ধারা ৩৪১', 'target_instrument': 'ষষ্ঠ তফসিল', 'target_path': 'অংশ ২ > দফা (১) > উপ-দফা (গ) পরবর্তী', 'amendment_action': 'insert'}


In [15]:
print(len(all_chunks))

3


In [17]:
# Dump every chunk (content and metadata) under a numbered banner
for chunk_position, chunk in enumerate(all_chunks):
    print("===============CHUNK===============", chunk_position)
    print(chunk)


পৃষ্ঠা/Page 1

-------------------------------------------------

বাংলাদেশ

গেজেট

অতিরিক্ত সংখ্যা

কর্তৃপক্ষ কর্তৃক প্রকাশিত

বৃহস্পতিবার, অক্টোবর ১০, ২০২৪

গণপ্রজাতন্ত্রী বাংলাদেশ সরকার

অর্থ মন্ত্রণালয়

জাতীয় রাজস্ব বোর্ড

(আয়কর)

প্রজ্ঞাপন

তারিখ: ২৪ আশ্বিন, ১৪৩১ বঙ্গাব্দ/৯ অক্টোবর, ২০২৪ খ্রিষ্টাব্দ

এস. আর. ও. নং ৩৪০-আইন/আয়কর-৪৮/২০২৪।—জাতীয় রাজস্ব বোর্ড, আয়কর আইন, ২০২৩ (২০২৩ সনের ১২ নং আইন) এর ধারা ৩৪১ এ প্রদত্ত ক্ষমতাবলে, উক্ত আইনের ষষ্ঠ তফসিলের নিম্নরূপ অধিকতর সংশোধন করিল, যথা:—

উক্ত তফসিলের অংশ ২ এর দফা (১) এর—

(ক) উপ-দফা (খ) এর প্রান্তস্থিত “এবং” শব্দটি বিলুপ্ত হইবে ;

(খ) উপ-দফা (গ) এর পর নিম্নরূপ নূতন উপ-দফা (ঘ) সন্নিবেশিত হইবে, যথা:—

“(ঘ) আস

সুন্নাহ ফাউন্ডেশনে দানকৃত আয়;”।

২। এই প্রজ্ঞাপনের আওতায় কর অব্যাহতিপ্রাপ্ত করদাতাগণ কর্তৃক উক্ত আইনের ধারা ৭৬ এর উপ-ধারা (৫) ও (৬) এর বিধানাবলি পরিপালন করিতে হইবে।

৩। এই প্রজ্ঞাপন অবিলম্বে কার্যকর হইবে এবং ইহা ৩০ জুন, ২০২৯ তারিখ পর্যন্ত বলবৎ থাকিবে।

জাতীয় রাজস্ব বোর্ডের আদেশক্রমে

মোঃ আবদুর রহমান খান এফসিএমএ সচিব অভ্যন্

In [18]:
# ==========================================
# STEP 4: Setup Pinecone
# ==========================================

# Read the Pinecone API key and fail fast with a clear message when it is
# missing (writing None into os.environ would raise an opaque TypeError).
pinecone_api_key = os.getenv("PINECONE_API_KEY")
if not pinecone_api_key:
    raise RuntimeError(
        "PINECONE_API_KEY is not set. Add it to your environment or .env file before running this cell."
    )

# Initialize Pinecone
pc = Pinecone(api_key=pinecone_api_key)

# Measure the embedding dimension from a real embedding call
test_embedding = embeddings.embed_query("test")
actual_dimension = len(test_embedding)
print(f"Actual embedding dimension: {actual_dimension}")

# Index settings
index_name = "sro-agentic-chunking"
# Use the measured size so a newly created index always matches the embedding
# model in use (3072 for text-embedding-3-large).
embedding_dimension = actual_dimension

# Create index if it doesn't exist
if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=embedding_dimension,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )
    print(f"Created new index: {index_name}")
else:
    print(f"Using existing index: {index_name}")

# Create vectorstore
vectorstore = PineconeVectorStore(
    index=pc.Index(index_name),
    embedding=embeddings
)

Actual embedding dimension: 3072
Created new index: sro-agentic-chunking


In [19]:
# ==========================================
# STEP 5: Add Chunks to Vectorstore (FIXED)
# ==========================================

def sanitize_metadata_for_pinecone(metadata: dict) -> dict:
    """
    Sanitize metadata to comply with Pinecone requirements:
    - No null/None values
    - Only strings, numbers, booleans, or lists of strings

    Rules applied per key:
    - None, nested dicts, blank strings and lists that end up empty are dropped.
    - Strings are stripped of surrounding whitespace.
    - Booleans and finite numbers are kept as-is; NaN and +/-inf are dropped
      because they are not valid JSON numbers.
    - List items are converted to stripped strings; None/blank items are removed.
    - Any other type is stringified unless the result is empty, 'none' or 'null'.

    Args:
        metadata: Raw metadata mapping (for example, as produced by the LLM).

    Returns:
        A new flat dict. 'source' defaults to 'unknown' and 'chunk_type' to
        'general' when they are absent after cleaning.
    """
    sanitized = {}

    for key, value in metadata.items():
        if value is None:
            # Skip null/None values entirely
            continue

        if isinstance(value, str):
            stripped = value.strip()
            if stripped:
                sanitized[key] = stripped
        elif isinstance(value, bool):
            sanitized[key] = value
        elif isinstance(value, (int, float)):
            # Only floats can be non-finite; checking ints could overflow on
            # conversion, so they are kept unconditionally.
            if isinstance(value, float) and not math.isfinite(value):
                continue
            sanitized[key] = value
        elif isinstance(value, list):
            # Clean lists - only keep non-empty strings
            clean_list = [
                str(item).strip()
                for item in value
                if item is not None and str(item).strip()
            ]
            if clean_list:
                sanitized[key] = clean_list
        elif isinstance(value, dict):
            # Skip complex nested objects
            continue
        else:
            # Convert other types to strings
            str_value = str(value).strip()
            if str_value and str_value.lower() not in ['none', 'null', '']:
                sanitized[key] = str_value

    # Ensure we have at least basic metadata
    sanitized.setdefault('source', 'unknown')
    sanitized.setdefault('chunk_type', 'general')

    return sanitized

def add_chunks_to_vectorstore_fixed(vectorstore, all_chunks, max_tokens_per_batch=200000):
    """
    Add LLM-chunked documents to vectorstore - NEVER SKIP ANY CHUNKS.

    Steps:
    1. Sanitize every chunk's metadata with `sanitize_metadata_for_pinecone`;
       if that raises, the chunk is kept with minimal fallback metadata.
    2. Group chunks into batches whose summed token counts stay within
       `max_tokens_per_batch` (a single oversized chunk still forms its own
       batch) and upload each batch through `upload_batch_with_retry`.

    Args:
        vectorstore: Object passed through to `upload_batch_with_retry`.
        all_chunks: Documents to upload; an empty or falsy value is a no-op.
        max_tokens_per_batch: Token budget per uploaded batch.

    Returns:
        None.

    Raises:
        RuntimeError: if the number of prepared or uploaded chunks differs from
            the number of input chunks.
    """
    if not all_chunks:
        print("No all_chunks to add!")
        return

    print(f"📤 Adding {len(all_chunks)} LLM-generated all_chunks to vectorstore...")
    print("🧹 Sanitizing metadata for Pinecone compatibility...")
    print("🔒 ZERO LOSS POLICY: Every chunk will be uploaded with fixed metadata")

    # Pre-process all all_chunks to sanitize metadata - NEVER SKIP
    sanitized_chunks = []

    for i, chunk in enumerate(all_chunks):
        # Ensure content exists
        content = chunk.page_content if chunk.page_content else "Content not available"

        try:
            clean_metadata = sanitize_metadata_for_pinecone(chunk.metadata)
        except Exception as e:
            print(f"  ⚠️  Metadata error for chunk {i+1}: {e}")

            # NEVER SKIP - fall back to minimal safe metadata
            clean_metadata = {
                'source': f'chunk_{i+1}',
                'chunk_type': 'general',
                'act_name': 'Not specified',
                'section_range': 'Not specified',
                'chunk_index': i,
                'chunk_tokens': count_tokens(chunk.page_content) if chunk.page_content else 0
            }
            print(f"  🔧 Applied fallback metadata for chunk {i+1}")

        sanitized_chunks.append(Document(page_content=content, metadata=clean_metadata))

    print(f"  ✅ Prepared {len(sanitized_chunks)} all_chunks for upload (same as input: {len(all_chunks)})")

    # Verify we haven't lost any all_chunks
    if len(sanitized_chunks) != len(all_chunks):
        raise RuntimeError(f"CRITICAL ERROR: Chunk count mismatch! Input: {len(all_chunks)}, Output: {len(sanitized_chunks)}")

    # Greedy batching by token budget
    current_batch = []
    current_tokens = 0
    batch_num = 1
    successful_uploads = 0

    for i, chunk in enumerate(sanitized_chunks):
        # Reuse the cached token count when it is a plain non-negative int;
        # only tokenise when no usable count is stored in the metadata.
        cached_tokens = chunk.metadata.get("chunk_tokens")
        if isinstance(cached_tokens, int) and not isinstance(cached_tokens, bool) and cached_tokens >= 0:
            chunk_tokens = cached_tokens
        else:
            chunk_tokens = count_tokens(chunk.page_content)

        # Flush the current batch if adding this chunk would exceed the limit
        if current_tokens + chunk_tokens > max_tokens_per_batch and current_batch:
            print(f"Processing batch {batch_num}: {len(current_batch)} all_chunks, {current_tokens} tokens")
            successful_uploads += upload_batch_with_retry(vectorstore, current_batch, batch_num)

            # Reset for next batch
            current_batch = []
            current_tokens = 0
            batch_num += 1

        # Add chunk to current batch
        current_batch.append(chunk)
        current_tokens += chunk_tokens

        if (i + 1) % 20 == 0:
            print(f"  📊 Processed {i + 1}/{len(sanitized_chunks)} all_chunks...")

    # Process final batch
    if current_batch:
        print(f"Processing final batch {batch_num}: {len(current_batch)} all_chunks, {current_tokens} tokens")
        successful_uploads += upload_batch_with_retry(vectorstore, current_batch, batch_num)

    print(f"🎉 Upload complete! Successfully added {successful_uploads}/{len(all_chunks)} all_chunks to vectorstore!")

    if successful_uploads != len(all_chunks):
        raise RuntimeError(f"CRITICAL ERROR: Not all chunks uploaded! Expected: {len(all_chunks)}, Uploaded: {successful_uploads}")

def _stable_chunk_id(chunk):
    """Derive a deterministic vector ID from a chunk's source, chunk index and text."""
    source = chunk.metadata.get("source", "")
    chunk_index = chunk.metadata.get("chunk_index", "")
    fingerprint = f"{source}\n{chunk_index}\n{chunk.page_content}"
    return hashlib.sha256(fingerprint.encode("utf-8")).hexdigest()


def upload_batch_with_retry(vectorstore, batch, batch_num):
    """
    Upload a batch, falling back to per-chunk uploads when the batch call fails.

    Every document is uploaded under a deterministic ID (see `_stable_chunk_id`),
    so re-uploading the same chunk - after a partially successful batch, or when
    the cell is run again - passes the same ID instead of a fresh random one.

    Fallback ladder for a failed batch:
    1. upload each chunk on its own;
    2. if that fails, upload the chunk's text with minimal metadata;
    3. if that fails too, log the chunk and move on.

    Args:
        vectorstore: Object exposing ``add_documents(documents, ids=...)``.
        batch: Documents to upload (each with ``page_content`` and ``metadata``).
        batch_num: Batch number, used only in log messages and emergency metadata.

    Returns:
        The number of chunks that were uploaded successfully.
    """
    ids = [_stable_chunk_id(chunk) for chunk in batch]

    try:
        vectorstore.add_documents(batch, ids=ids)
        print(f"  ✅ Batch {batch_num} successful ({len(batch)} all_chunks)")
        return len(batch)

    except Exception as e:
        print(f"  ❌ Batch {batch_num} failed: {e}")
        print(f"  🔄 Switching to individual upload mode for {len(batch)} all_chunks...")

        successful_individual = 0

        for j, (single_chunk, chunk_id) in enumerate(zip(batch, ids)):
            try:
                vectorstore.add_documents([single_chunk], ids=[chunk_id])
                successful_individual += 1

            except Exception as single_error:
                print(f"    ❌ Individual chunk {j+1} failed: {single_error}")

                # Last resort - strip metadata to absolute minimum, keeping the
                # same ID as the full-metadata attempts.
                try:
                    minimal_chunk = Document(
                        page_content=single_chunk.page_content,
                        metadata={
                            'source': f'emergency_chunk_{batch_num}_{j}',
                            'chunk_type': 'general'
                        }
                    )
                    vectorstore.add_documents([minimal_chunk], ids=[chunk_id])
                    successful_individual += 1
                    print(f"    🆘 Emergency upload successful for chunk {j+1}")

                except Exception as emergency_error:
                    print(f"    💥 CRITICAL: Cannot upload chunk {j+1} even with minimal metadata: {emergency_error}")
                    print(f"    📝 Content preview: {single_chunk.page_content[:100]}...")
                    # Not counted as uploaded; the caller compares totals.

        print(f"  📊 Individual upload result: {successful_individual}/{len(batch)} all_chunks")
        return successful_individual

# Debug function to check your current all_chunks
def debug_chunk_metadata(all_chunks, num_samples=5):
    """
    Print each metadata key, value and value type for the first `num_samples`
    chunks, flagging any key whose value is None.
    """
    sample = all_chunks[:num_samples]
    shown = min(num_samples, len(all_chunks))
    print(f"🔍 Debugging metadata for {shown} sample all_chunks:")

    for position, chunk in enumerate(sample, start=1):
        print(f"\nChunk {position} metadata:")
        for field, field_value in chunk.metadata.items():
            print(f"  {field}: {field_value} (type: {type(field_value).__name__})")

            if field_value is not None:
                continue
            print(f"    ❌ NULL VALUE DETECTED in '{field}' - this will cause Pinecone error!")


In [20]:
# Run this first to see what's wrong
print("🔍 Checking your chunks for metadata issues...")
debug_chunk_metadata(all_chunks)

🔍 Checking your chunks for metadata issues...
🔍 Debugging metadata for 3 sample all_chunks:

Chunk 1 metadata:
  source: SRO\05. SRO - Amendment of Sixth Schedule (09 October 2024)_complete_transcription.txt (type: str)
  doc_type: SRO (type: str)
  sro_no: ৩৪০-আইন/আয়কর-৪৮/২০২৪ (type: str)
  ministry: অর্থ মন্ত্রণালয় (type: str)
  issuer: জাতীয় রাজস্ব বোর্ড (type: str)
  authority_act: আয়কর আইন, ২০২৩ (type: str)
  authority_section: ধারা ৩৪১ (type: str)
  target_instrument: ষষ্ঠ তফসিল (type: str)
  target_path: অংশ ২ > দফা (১) > উপ-দফা (গ) পরবর্তী (type: str)
  amendment_action: insert (type: str)
  amendment_unit: উপ-দফা (type: str)
  amendment_text_snippet: উপ-দফা (গ) এর পর নিম্নরূপ নূতন উপ-দফা (ঘ) সন্নিবেশিত হইবে, যথা:— “(ঘ) আস-সুন্নাহ ফাউন্ডেশনে দানকৃত আয়;”। (type: str)
  compliance_refs: ['ধারা ৭৬(৫)', 'ধারা ৭৬(৬)'] (type: list)
  effectivity: অবিলম্বে কার্যকর (type: str)
  valid_upto: 2029-06-30 (type: str)
  issue_date_bn: ২৪ আশ্বিন, ১৪৩১ বঙ্গাব্দ/৯ অক্টোবর, ২০২৪ খ্রিষ্টাব্

In [23]:
# Then use the fixed function
add_chunks_to_vectorstore_fixed(vectorstore, all_chunks)

📤 Adding 3 LLM-generated all_chunks to vectorstore...
🧹 Sanitizing metadata for Pinecone compatibility...
🔒 ZERO LOSS POLICY: Every chunk will be uploaded with fixed metadata
  ✅ Prepared 3 all_chunks for upload (same as input: 3)
Processing final batch 1: 3 all_chunks, 10799 tokens
  ✅ Batch 1 successful (3 all_chunks)
🎉 Upload complete! Successfully added 3/3 all_chunks to vectorstore!
