In [None]:
!pip install docling

In [None]:
import os
from pathlib import Path
from docling.document_converter import DocumentConverter

# 1. Setup paths
# input_dir is where the source PDFs are read from; output_dir receives one
# Markdown file per PDF.
input_dir = Path("/content/reports")
output_dir = Path("extracted_md")
# parents=True keeps this working if output_dir is later pointed at a nested path.
output_dir.mkdir(parents=True, exist_ok=True)

# 2. Initialize Docling Converter
# A single converter instance is shared by every conversion in extract_pdfs().
converter = DocumentConverter()

def extract_pdfs():
    """Convert every ``*.pdf`` in ``input_dir`` to a Markdown file in ``output_dir``.

    Uses the module-level ``input_dir``, ``output_dir`` and ``converter``.
    Each PDF is converted independently: a failure is reported and the loop
    moves on to the next file. The closing message only claims success when
    every file was converted; otherwise it lists the files that failed.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    # Sorted so the processing order (and the log) is the same on every run.
    pdf_files = sorted(input_dir.glob("*.pdf"))

    if not pdf_files:
        print(f"No PDF files found in {input_dir}")
        return

    print(f"Found {len(pdf_files)} files. Starting extraction...")

    failed = []
    for pdf_path in pdf_files:
        try:
            print(f"Processing: {pdf_path.name}...")

            # Convert PDF to Docling's internal format
            result = converter.convert(pdf_path)

            # Export to Markdown (best for your chunking strategy)
            md_output = result.document.export_to_markdown()

            # Save next to the other outputs, named after the PDF's stem
            target = output_dir / (pdf_path.stem + ".md")
            target.write_text(md_output, encoding="utf-8")

        except Exception as e:
            # Best effort: one bad PDF must not stop the rest of the batch.
            failed.append(pdf_path.name)
            print(f"Error processing {pdf_path.name}: {e}")

    if failed:
        converted = len(pdf_files) - len(failed)
        print(f"\nExtracted {converted} of {len(pdf_files)} files into: {output_dir}")
        print(f"Failed: {', '.join(failed)}")
    else:
        print(f"\nSuccess! Extracted files are in: {output_dir}")

# Run the PDF -> Markdown conversion as soon as this cell executes.
extract_pdfs()

In [1]:
!pip install pyyaml



In [2]:
import re
import json
import hashlib
import yaml
from dataclasses import dataclass, asdict, field
from pathlib import Path
from typing import Generator, Optional
import tiktoken

@dataclass
class Chunk:
    """Represents a single document chunk with metadata.

    A plain data holder; ``to_dict`` turns it into a dictionary.
    """
    # Identifier of the chunk.
    id: str
    # Text of the chunk.
    content: str
    # Name of the file the chunk was produced from.
    source_file: str
    # Header titles leading to this chunk's section, outermost first.
    section_path: list[str]
    # Number of tokens counted for `content`.
    token_count: int
    # Whether a table was detected in the chunk.
    has_table: bool
    # Document-level metadata (e.g. doc_id, version, date) inherited by the chunk.
    # Uses default_factory so instances do not share one default dict.
    metadata: dict = field(default_factory=dict)

    def to_dict(self) -> dict:
        """Return the chunk as a dict via ``dataclasses.asdict``."""
        return asdict(self)

class MarkdownChunker:
    """Chunks markdown documents based on structural elements and preserves metadata.

    A document is split at its headers. A section that fits within
    ``max_tokens`` becomes one chunk; a larger section is split on blank-line
    separated paragraphs. YAML frontmatter, when present, is parsed once and
    attached to every chunk of the document.
    """

    # Pattern to separate YAML frontmatter from Markdown body
    FRONTMATTER_PATTERN = re.compile(r'^---\s*\n(.*?)\n---\s*\n', re.DOTALL)
    # A header line: 1-6 '#' characters, whitespace, then the title
    HEADER_PATTERN = re.compile(r'^(#{1,6})\s+(.+)$', re.MULTILINE)
    # Two or more consecutive lines that start and end with '|'
    TABLE_PATTERN = re.compile(r'^\|.+\|(?:\n^\|.+\|$)+', re.MULTILINE)

    def __init__(self, max_tokens: int = 1000, model: str = "cl100k_base"):
        """
        Args:
            max_tokens: Token budget for a single chunk.
            model: Name handed to ``tiktoken.get_encoding``.
        """
        self.max_tokens = max_tokens
        self.tokenizer = tiktoken.get_encoding(model)

    def count_tokens(self, text: str) -> int:
        """Return the number of tokens the tokenizer produces for ``text``."""
        return len(self.tokenizer.encode(text))

    def generate_chunk_id(self, source: str, content: str, index: int) -> str:
        """Return a 16-hex-character id derived from source, index and content.

        Only the first 100 characters of ``content`` take part in the hash.
        """
        # Including index ensures unique IDs even for identical content
        hash_input = f"{source}:{index}:{content[:100]}"
        return hashlib.sha256(hash_input.encode()).hexdigest()[:16]

    def _parse_frontmatter(self, text: str) -> tuple[dict, str]:
        """Extracts YAML frontmatter and returns (metadata_dict, remaining_content).

        The metadata is always a dict: an empty frontmatter block (which YAML
        loads as ``None``) or a block holding a scalar/list yields ``{}``.
        When the YAML cannot be parsed, ``text`` is returned unchanged.
        """
        match = self.FRONTMATTER_PATTERN.match(text)
        if not match:
            return {}, text

        try:
            metadata = yaml.safe_load(match.group(1))
        except yaml.YAMLError:
            # Fallback if YAML is malformed
            return {}, text

        # Callers use .get()/.items() on the metadata, so anything that is not
        # a mapping is treated as "no metadata".
        if not isinstance(metadata, dict):
            metadata = {}

        # Return metadata and the text strictly AFTER the --- block
        return metadata, text[match.end():]

    def _close_section(self, level: int, title: str, content_lines: list) -> Optional[dict]:
        """Build the section dict for the accumulated lines, or None if there is nothing to keep."""
        if not (content_lines or title):
            return None
        section_content = '\n'.join(content_lines)
        return {
            'level': level,
            'title': title,
            'content': section_content.strip(),
            'has_table': bool(self.TABLE_PATTERN.search(section_content)),
        }

    def parse_sections(self, content: str) -> list[dict]:
        """Split ``content`` into sections at header lines.

        Returns:
            A list of dicts with keys ``level`` (0 for text before the first
            header), ``title``, ``content`` (stripped body without the header
            line) and ``has_table``.
        """
        sections = []
        level, title, content_lines = 0, '', []

        for line in content.split('\n'):
            header_match = self.HEADER_PATTERN.match(line)
            if not header_match:
                content_lines.append(line)
                continue

            # A header closes the section collected so far...
            finished = self._close_section(level, title, content_lines)
            if finished is not None:
                sections.append(finished)

            # ...and opens a new one.
            level = len(header_match.group(1))
            title = header_match.group(2).strip()
            content_lines = []

        # Capture the last section
        finished = self._close_section(level, title, content_lines)
        if finished is not None:
            sections.append(finished)
        return sections

    def build_section_path(self, sections: list[dict], current_idx: int) -> list[str]:
        """Return the titles of the ancestors of section ``current_idx`` followed by its own title.

        Ancestors are found by walking backwards and picking each titled
        section whose level is lower than the lowest level seen so far; the
        walk stops once a level-1 header has been added.
        """
        current = sections[current_idx]
        path = []
        if current['title']:
            path.append(current['title'])

        current_level = current['level']
        for i in range(current_idx - 1, -1, -1):
            section = sections[i]
            if section['level'] < current_level and section['title']:
                path.insert(0, section['title'])
                current_level = section['level']
                if current_level == 1:
                    break
        return path

    def _build_header_prefix(self, section_path: list[str]) -> str:
        """Render ``section_path`` as nested headers ('#', '##', ...) followed by a blank line."""
        if not section_path:
            return ""
        lines = ['#' * depth + ' ' + title for depth, title in enumerate(section_path, start=1)]
        return '\n'.join(lines) + '\n\n'

    def _chunk_from_parts(self, parts: list, header_prefix: str, section_path: list[str],
                          source_file: str, index: int, doc_metadata: dict) -> "Chunk":
        """Join ``parts`` with blank lines, prepend the header prefix and wrap the result in a Chunk.

        ``has_table`` is always derived from the joined body text.
        """
        body = '\n\n'.join(parts)
        content = header_prefix + body
        return Chunk(
            self.generate_chunk_id(source_file, content, index),
            content, source_file, section_path, self.count_tokens(content),
            bool(self.TABLE_PATTERN.search(body)),
            doc_metadata
        )

    def split_large_section(self, text: str, section_path: list[str], source_file: str, start_index: int, doc_metadata: dict) -> Generator["Chunk", None, None]:
        """Split an oversized section body into chunks on paragraph boundaries.

        Every yielded chunk starts with the header prefix built from
        ``section_path``. Paragraphs are accumulated until the next one would
        exceed the budget left after the prefix. A table paragraph that alone
        exceeds that budget is emitted as its own chunk, unsplit.

        Args:
            text: Section body (without its header line).
            section_path: Header titles used to build the prefix.
            source_file: Name recorded on each chunk.
            start_index: Index used for the id of the first chunk; following
                chunks use consecutive indices.
            doc_metadata: Metadata attached to each chunk.

        Yields:
            Chunk objects in document order.
        """
        header_prefix = self._build_header_prefix(section_path)
        available_tokens = self.max_tokens - self.count_tokens(header_prefix)

        pending = []          # paragraphs collected for the chunk being built
        pending_tokens = 0
        chunk_index = start_index

        for para in re.split(r'\n\n+', text):
            para = para.strip()
            if not para:
                continue
            para_tokens = self.count_tokens(para)
            is_table = bool(self.TABLE_PATTERN.search(para))

            # Case 1: Table is too big or forces a break
            if is_table and para_tokens > available_tokens:
                if pending:
                    # has_table is computed from the pending text: it may
                    # contain a smaller table collected earlier.
                    yield self._chunk_from_parts(pending, header_prefix, section_path,
                                                 source_file, chunk_index, doc_metadata)
                    chunk_index += 1
                    pending, pending_tokens = [], 0

                yield self._chunk_from_parts([para], header_prefix, section_path,
                                             source_file, chunk_index, doc_metadata)
                chunk_index += 1
                continue

            # Case 2: Paragraph overflows current chunk
            if pending_tokens + para_tokens > available_tokens and pending:
                yield self._chunk_from_parts(pending, header_prefix, section_path,
                                             source_file, chunk_index, doc_metadata)
                chunk_index += 1
                pending, pending_tokens = [], 0

            pending.append(para)
            pending_tokens += para_tokens

        # Yield remaining parts
        if pending:
            yield self._chunk_from_parts(pending, header_prefix, section_path,
                                         source_file, chunk_index, doc_metadata)

    def chunk_document(self, content: str, source_file: str) -> Generator["Chunk", None, None]:
        """Yield the chunks of one markdown document.

        Args:
            content: Full file text, optionally starting with YAML frontmatter.
            source_file: Name recorded on each chunk and mixed into its id.

        Yields:
            Chunk objects in document order, each carrying the document's
            frontmatter metadata.
        """
        # 1. Extract Metadata first
        doc_metadata, body_content = self._parse_frontmatter(content)

        # 2. Parse sections from the body only
        sections = self.parse_sections(body_content)
        chunk_index = 0

        for i, section in enumerate(sections):
            section_path = self.build_section_path(sections, i)
            header = ('#' * section['level'] + ' ' + section['title']) if section['title'] else ''

            # Combine header and content
            if header:
                full_text = (header + '\n\n' + section['content']).strip()
            else:
                full_text = section['content'].strip()
            if not full_text:
                continue

            token_count = self.count_tokens(full_text)

            # 3. Create Chunks with Metadata
            if token_count <= self.max_tokens:
                yield Chunk(
                    self.generate_chunk_id(source_file, full_text, chunk_index),
                    full_text, source_file, section_path, token_count, section['has_table'],
                    doc_metadata
                )
                chunk_index += 1
            else:
                # Pass metadata into the splitter; keep our counter in step
                # with the indices the splitter uses for its ids.
                for chunk in self.split_large_section(section['content'], section_path, source_file, chunk_index, doc_metadata):
                    yield chunk
                    chunk_index += 1

In [None]:
!pip install chromadb sentence-transformers rank_bm25

In [4]:
import chromadb
from chromadb.config import Settings
from sentence_transformers import SentenceTransformer
from rank_bm25 import BM25Okapi
import numpy as np
import re
import datetime

class HybridIndex:
    """Dense (BGE-M3 + ChromaDB) and sparse (BM25) index over the same chunks.

    The dense side lives in a ChromaDB collection; the sparse side is an
    in-memory BM25 index whose row numbers are mapped back to chunk ids via
    ``chunk_map``.
    """

    def __init__(self, collection_name="rag_conflict_resolution"):
        """Load the embedding model and create a fresh, empty collection.

        Args:
            collection_name: Name of the ChromaDB collection. An existing
                collection with this name is deleted first.
        """
        # 1. Initialize Dense Model (BGE-M3)
        print("Loading BGE-M3 model... (this may take a moment)")
        self.dense_model = SentenceTransformer('BAAI/bge-m3')

        # 2. Initialize ChromaDB (Vector Store)
        self.chroma_client = chromadb.Client(Settings(anonymized_telemetry=False))

        # Delete if exists to avoid "doubling up" data during debugging.
        # Deliberately best effort: the collection usually does not exist yet.
        # `Exception` (not a bare except) so Ctrl-C still interrupts.
        try:
            self.chroma_client.delete_collection(collection_name)
        except Exception:
            pass

        self.collection = self.chroma_client.create_collection(
            name=collection_name,
            metadata={"hnsw:space": "cosine"}
        )

        # 3. Initialize Sparse Storage (In-memory for BM25)
        self.bm25 = None
        self.chunk_map = {}          # BM25 row number -> chunk id
        self.tokenized_corpus = []

    def _tokenize(self, text: str):
        """Simple regex tokenizer for BM25."""
        return re.findall(r'\w+', text.lower())

    def _sanitize_metadata(self, metadata: dict) -> dict:
        """Converts complex types (dates) in metadata to strings for ChromaDB.

        ``None`` (or any other falsy value) is treated as "no metadata" and
        yields an empty dict.
        """
        clean_meta = {}
        for k, v in (metadata or {}).items():
            # Convert date/datetime objects to ISO format strings
            if isinstance(v, (datetime.date, datetime.datetime)):
                clean_meta[k] = v.isoformat()
            # Pass through valid types
            elif isinstance(v, (str, int, float, bool, type(None))):
                clean_meta[k] = v
            # Fallback: Stringify anything else (like nested lists)
            else:
                clean_meta[k] = str(v)
        return clean_meta

    def add_documents(self, chunks: list):
        """Embed ``chunks``, store them in ChromaDB and rebuild the BM25 index.

        Args:
            chunks: Objects exposing ``id``, ``content`` and ``metadata``
                (and optionally ``source_file``). An empty list is a no-op.
        """
        if not chunks:
            # Nothing to embed, and BM25 cannot be built from an empty corpus.
            print("No chunks to index; nothing added.")
            return

        ids = [chunk.id for chunk in chunks]
        texts = [chunk.content for chunk in chunks]

        metadatas = []
        for chunk in chunks:
            clean_meta = self._sanitize_metadata(chunk.metadata)
            # Some ChromaDB releases reject an empty metadata dict, which is
            # what a document without frontmatter produces. Recording the
            # source file keeps the dict non-empty and the chunk traceable.
            clean_meta.setdefault("source_file", str(getattr(chunk, "source_file", "unknown")))
            metadatas.append(clean_meta)

        # --- DENSE PIPELINE (BGE-M3) ---
        print(f"Generating dense embeddings for {len(chunks)} chunks...")
        embeddings = self.dense_model.encode(texts, convert_to_numpy=True)

        # Add to ChromaDB
        self.collection.add(
            documents=texts,
            embeddings=embeddings.tolist(),
            metadatas=metadatas,
            ids=ids
        )

        # --- SPARSE PIPELINE (BM25) ---
        print("Building BM25 index...")
        self.tokenized_corpus.extend(self._tokenize(text) for text in texts)

        # New rows are appended after the ones already mapped.
        start_idx = len(self.chunk_map)
        for offset, chunk_id in enumerate(ids):
            self.chunk_map[start_idx + offset] = chunk_id

        # BM25Okapi is rebuilt over the whole corpus on every call.
        self.bm25 = BM25Okapi(self.tokenized_corpus)

        print(f"Indexing complete. {len(ids)} documents added.")

    def hybrid_search(self, query: str, top_k: int = 5):
        """Run the dense and the sparse search for ``query`` and print both result lists.

        The two result lists are returned side by side; they are not fused.

        Args:
            query: Search text.
            top_k: Number of hits requested from each retriever.

        Returns:
            ``(dense_results, top_n_sparse)``: the raw ChromaDB query result
            and an array of BM25 row numbers (keys of ``chunk_map``), best first.

        Raises:
            RuntimeError: If nothing has been indexed yet.
        """
        if self.bm25 is None:
            raise RuntimeError("The index is empty; call add_documents() before searching.")

        # 1. Dense Search (Chroma)
        query_embedding = self.dense_model.encode([query]).tolist()
        dense_results = self.collection.query(
            query_embeddings=query_embedding,
            n_results=top_k
        )

        # 2. Sparse Search (BM25)
        tokenized_query = self._tokenize(query)
        sparse_scores = self.bm25.get_scores(tokenized_query)
        top_n_sparse = np.argsort(sparse_scores)[::-1][:top_k]

        print("--- Dense Results ---")
        for i, doc in enumerate(dense_results['documents'][0]):
            print(f"Doc: {doc[:50]}... | ID: {dense_results['ids'][0][i]}")

        print("\n--- Sparse Results (BM25) ---")
        for idx in top_n_sparse:
            chunk_id = self.chunk_map.get(idx)
            print(f"Score: {sparse_scores[idx]:.4f} | ID: {chunk_id}")

        return dense_results, top_n_sparse

In [5]:
import os
from pathlib import Path

def run_indexing_pipeline(input_dir: str = "dataset"):
    """Chunk every ``*.md`` file in ``input_dir`` and index all chunks in one batch.

    Files are processed in sorted name order so that chunk order, and with
    it the BM25 row numbers, is the same on every run. A file that fails
    to read or chunk is reported and skipped.

    Args:
        input_dir: Directory containing the markdown files.

    Returns:
        The ``HybridIndex`` instance. It is returned even when no chunks were
        produced, in which case it is empty.
    """
    # 1. Initialize Components
    # We use a smaller chunk size (500) to create more granular conflict targets
    chunker = MarkdownChunker(max_tokens=500)

    # Initialize the Hybrid Index (BGE-M3 + BM25)
    # Note: HybridIndex deletes any existing collection with this name before
    # creating it, so re-running this cell starts from an empty collection.
    indexer = HybridIndex(collection_name="rag_conflict_resolution")

    input_path = Path(input_dir)
    all_chunks = []

    print(f"🚀 Starting Indexing Pipeline reading from: {input_path}")

    # 2. Process Each File
    for md_file in sorted(input_path.glob("*.md")):
        print(f"Processing {md_file.name}...")

        try:
            content = md_file.read_text(encoding="utf-8")

            # Generate chunks for this document
            # The chunker handles frontmatter extraction automatically now
            file_chunks = list(chunker.chunk_document(content, md_file.name))

            # Verify metadata extraction (sanity check)
            if file_chunks and not file_chunks[0].metadata:
                print(f"   ⚠️ Warning: No metadata found for {md_file.name}")
            elif file_chunks:
                print(f"   ✅ Extracted {len(file_chunks)} chunks. Metadata: {file_chunks[0].metadata.get('doc_id', 'Unknown')}")

            all_chunks.extend(file_chunks)

        except Exception as e:
            # Best effort: keep indexing the remaining files.
            print(f"   ❌ Error processing {md_file.name}: {e}")

    # 3. Batch Indexing
    if all_chunks:
        print(f"\n📦 Indexing {len(all_chunks)} total chunks into Hybrid Store...")
        indexer.add_documents(all_chunks)
        print("🎉 Indexing Complete!")
    else:
        print("❌ No chunks were generated. Check your input directory.")

    return indexer

# --- EXECUTE PIPELINE ---
# This runs the whole show and returns the active indexer object.
# `rag_index` is the module-level handle that the later cells search against.
rag_index = run_indexing_pipeline()

Loading BGE-M3 model... (this may take a moment)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/123 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/54.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/687 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/444 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

🚀 Starting Indexing Pipeline reading from: dataset
Processing Bihar_Budget_Analysis_2025-26.md...
   ✅ Extracted 26 chunks. Metadata: budgetReport25-26
Processing State Budget Analysis - Bihar 2019-20 English.md...
   ✅ Extracted 26 chunks. Metadata: budgetReport19-20
Processing Bihar_Budget_Analysis_2024-25.md...
   ✅ Extracted 26 chunks. Metadata: budgetReport24-25
Processing Bihar_Budget_Analysis_2023-24.md...
   ✅ Extracted 26 chunks. Metadata: budgetReport23-24
Processing Bihar Budget Analysis 2022-23.md...
   ✅ Extracted 25 chunks. Metadata: budgetReport22-23
Processing State Budget Analysis - Bihar 2020-21.md...
   ✅ Extracted 30 chunks. Metadata: budgetReport20-21
Processing Bihar Budget Analysis 2016-17.md...
   ✅ Extracted 12 chunks. Metadata: budgetReport16-17
Processing Bihar Budget Analysis 2017-18.md...
   ✅ Extracted 20 chunks. Metadata: budgetReport17-18
Processing State_Budget_Analysis_2021-22-Bihar.md...
   ✅ Extracted 32 chunks. Metadata: budgetReport21-22

📦 Indexin

In [6]:
# Test Query: smoke-test the index with a single question
test_query = "What was the finance minister in 2020?"

print(f"🔎 Testing Query: '{test_query}'\n")
dense_results, sparse_indices = rag_index.hybrid_search(test_query, top_k=3)

# hybrid_search prints, for this query:
# 1. The first 50 characters of each dense hit together with its chunk ID
# 2. The BM25 score and chunk ID of each sparse hit
# Dense similarity scores are not printed.

🔎 Testing Query: 'What was the finance minister in 2020?'

--- Dense Results ---
Doc: ## Bihar Budget Analysis 2020-21

The Finance Mini... | ID: 89ce7923510f0204
Doc: ## Bihar Budget Analysis 2019-20

The Finance Mini... | ID: 518cf4af85581b05
Doc: ## Bihar Budget Analysis 2021-22

The Finance Mini... | ID: ea5f578cd96bf4ef

--- Sparse Results (BM25) ---
Score: 13.1888 | ID: ea5f578cd96bf4ef
Score: 11.0754 | ID: 89ce7923510f0204
Score: 9.9880 | ID: 9c8a3426125fa63a


In [7]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

class ConflictEntryGate:
    """Cheap pre-check that decides whether retrieved chunks should enter the conflict pipeline.

    The gate triggers when the chunks come from more than one document and
    either opposing keywords occur in the retrieved text or two chunks from
    different documents are semantically very close.
    """

    def __init__(self, embedding_model):
        """
        Args:
            embedding_model: Object with an ``encode(list_of_texts)`` method,
                used for the pairwise similarity check.
        """
        self.embedding_model = embedding_model

        # Pairs of keywords that suggest conflict
        self.polarity_pairs = [
            ({"allowed", "may", "can"}, {"prohibited", "must not", "cannot", "banned"}),
            ({"required", "must", "shall"}, {"optional", "suggested", "recommended"}),
            ({"increase", "rise", "growth"}, {"decrease", "fall", "decline", "reduction"}),
            ({"surplus", "profit"}, {"deficit", "loss"})
        ]

    @staticmethod
    def _doc_id(chunk: dict):
        """Return the chunk's ``doc_id``, or None when the chunk has no (or empty) metadata."""
        return (chunk.get('metadata') or {}).get('doc_id')

    def _check_polarity(self, texts: list[str]) -> bool:
        """Checks if opposing keywords exist across the retrieved set.

        Keywords are matched as whole words (or whole phrases, e.g.
        "must not"), so "profit" does not fire on "nonprofit" and "can" does
        not fire on "significant".
        """
        combined_text = " ".join(texts).lower()

        # Turn everything that is not a letter or digit into a separator and
        # pad with spaces, so " word " can be searched as a whole token.
        words = "".join(ch if ch.isalnum() else " " for ch in combined_text).split()
        padded = " " + " ".join(words) + " "

        def contains_any(terms) -> bool:
            return any(f" {term} " in padded for term in terms)

        return any(
            contains_any(set_a) and contains_any(set_b)
            for set_a, set_b in self.polarity_pairs
        )

    def _check_semantic_overlap(self, chunks: list, threshold: float = 0.75) -> bool:
        """
        Calculates pairwise similarity. Returns True if two chunks from
        DIFFERENT documents have high semantic similarity.

        Args:
            chunks: Dicts with ``content`` and (optionally) ``metadata``.
            threshold: Cosine similarity that must be exceeded.
        """
        if len(chunks) < 2:
            return False

        # Re-embed top-k chunks to calculate pairwise similarity matrix
        # (Fast for small k)
        texts = [c['content'] for c in chunks]
        embeddings = self.embedding_model.encode(texts)
        sim_matrix = cosine_similarity(embeddings)

        # Check upper triangle of matrix
        for i in range(len(chunks)):
            for j in range(i + 1, len(chunks)):
                if sim_matrix[i][j] <= threshold:
                    continue
                # Highly similar, and from different documents
                if self._doc_id(chunks[i]) != self._doc_id(chunks[j]):
                    return True
        return False

    def evaluate(self, retrieved_chunks: list) -> dict:
        """
        Main Gate Logic.
        Input: list of dicts {'content': str, 'metadata': dict}

        Returns:
            ``{"trigger_conflict_pipeline": bool, "reasons": list[str]}``.
            With no chunks, or chunks from a single document, the gate does
            not trigger and ``reasons`` stays empty.
        """
        result = {
            "trigger_conflict_pipeline": False,
            "reasons": []
        }

        # Guardrail 1: Empty results
        if not retrieved_chunks:
            return result

        # Guardrail 2: Document Diversity (Must have >1 source to conflict)
        doc_ids = {self._doc_id(c) for c in retrieved_chunks}
        if len(doc_ids) < 2:
            return result # Normal RAG path

        result['reasons'].append(f"Multiple sources detected: {doc_ids}")

        # Signal 1: Polarity Keywords
        texts = [c['content'] for c in retrieved_chunks]
        if self._check_polarity(texts):
            result['trigger_conflict_pipeline'] = True
            result['reasons'].append("Opposing polarity keywords detected")

        # Signal 2: High Semantic Overlap between documents
        # (Run even if already triggered, to add evidence)
        if self._check_semantic_overlap(retrieved_chunks):
            result['trigger_conflict_pipeline'] = True
            result['reasons'].append("High semantic overlap between different documents")

        return result

In [8]:
# 1. Instantiate the Gate
# The gate is handed the embedding model held by the index (rag_index.dense_model).
gate = ConflictEntryGate(rag_index.dense_model)

# 2. Helper function to format search results for the Gate
def get_chunks_from_search(dense_res, sparse_indices):
    """Format the dense search results into a clean list of chunk dicts.

    Args:
        dense_res: Result of a ChromaDB query. Chroma returns lists of lists,
            one inner list per query; only the first query's results are used.
        sparse_indices: BM25 row numbers from ``hybrid_search``. Accepted so
            the call site stays the same, but the sparse hits are not merged
            in yet: turning them into chunks requires looking their text up
            by ID, which this prototype does not do.

    Returns:
        A list of dicts with ``content``, ``metadata`` (always a dict) and
        ``source_type`` (``'dense'``), plus ``id`` when the result carries
        IDs, so downstream code can trace a claim back to its chunk.
    """
    documents = (dense_res.get('documents') or [[]])[0]
    ids = (dense_res.get('ids') or [[]])[0]
    metadatas = (dense_res.get('metadatas') or [[]])[0]

    chunks = []
    for i, text in enumerate(documents):
        chunk = {
            'content': text,
            # A hit stored without metadata comes back as None; normalise it
            # so callers can use .get() unconditionally.
            'metadata': (metadatas[i] if i < len(metadatas) else None) or {},
            'source_type': 'dense'
        }
        if i < len(ids):
            chunk['id'] = ids[i]
        chunks.append(chunk)

    return chunks

# 3. Run a Test Case
print("--- TEST: Conflict Gate ---")
query = "What is the fiscal deficit limit?"

# Perform Search (hybrid_search also prints both result lists)
dense, sparse = rag_index.hybrid_search(query, top_k=5)
# Only the dense hits reach the gate: get_chunks_from_search does not add the sparse ones.
formatted_chunks = get_chunks_from_search(dense, sparse)

# Check Gate
decision = gate.evaluate(formatted_chunks)

# `decision` holds the trigger flag and the list of reasons behind it.
print("\n--- GATE DECISION ---")
print(f"Trigger Conflict Pipeline: {decision['trigger_conflict_pipeline']}")
print(f"Reasons: {decision['reasons']}")

--- TEST: Conflict Gate ---
--- Dense Results ---
Doc: # Deficits, Debts and FRBM Targets for 2017-18

Th... | ID: aedf89e5d3586f1a
Doc: # Deficits and Debt Targets for 2024-25

The Bihar... | ID: 57cec7748e55d98d
Doc: # Deficits, Debts and FRBM Targets for 20 19-20

T... | ID: ac029d0e2a90267c
Doc: ## Deficits, Debts and FRBM Targets for 2020-21

T... | ID: f5db1ab57b111635
Doc: # Deficits, Debts and FRBM Targets for 2016-17

- ... | ID: 65954d56ef01534b

--- Sparse Results (BM25) ---
Score: 11.1579 | ID: 43db0bda5149eff1
Score: 11.0540 | ID: bf339162c892f15c
Score: 11.0455 | ID: bb202ea0741a6e3c
Score: 10.9454 | ID: aedf89e5d3586f1a
Score: 10.9237 | ID: 57cec7748e55d98d

--- GATE DECISION ---
Trigger Conflict Pipeline: True
Reasons: ["Multiple sources detected: {'budgetReport20-21', 'budgetReport19-20', 'budgetReport16-17', 'budgetReport17-18', 'budgetReport24-25'}", 'Opposing polarity keywords detected', 'High semantic overlap between different documents']


In [9]:
!pip install nltk
import nltk
nltk.download('punkt_tab') # Updated for newer NLTK versions



[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [10]:
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.pairwise import cosine_similarity
import re
from nltk.tokenize import sent_tokenize
from dataclasses import dataclass

@dataclass
class ParsedClaim:
    """A single sentence treated as a claim, together with where it came from."""
    # The sentence itself.
    text: str
    # Value of the 'source' metadata key of the originating chunk.
    source: str
    # Value of the 'date' metadata key of the originating chunk.
    date: str
    # Value of the 'doc_id' metadata key of the originating chunk.
    doc_id: str
    # Embedding of `text`, used for clustering claims.
    vector: np.ndarray
    # ID of the chunk the sentence was taken from.
    original_chunk_id: str

class ConflictDetector:
    """Extracts claim sentences from retrieved chunks and reports conflicts between them.

    Pipeline: keep the sentences containing a claim marker, embed them,
    cluster them by similarity, then look for numeric and temporal conflicts
    inside each cluster.
    """

    # Value used throughout for a missing date / doc id / chunk id.
    _UNKNOWN = "unknown"

    def __init__(self, embedding_model):
        """
        Args:
            embedding_model: Object with an ``encode(list_of_texts)`` method.
        """
        self.embedding_model = embedding_model

        # Keywords that suggest a sentence is a "Claim"
        self.claim_markers = [
            "must", "shall", "may", "cannot", "must not", "allowed", "prohibited",
            "requires", "applies", "target", "estimated", "forecast", "goal", "limit"
        ]

    def _extract_fiscal_year(self, text: str) -> str:
        """
        Extracts fiscal year patterns like '2022-23' or '2023-24'.
        Returns the first match or 'unknown'.
        """
        match = re.search(r'\b(20\d{2}-\d{2})\b', text)
        return match.group(1) if match else self._UNKNOWN

    def _extract_numeric(self, text: str):
        """
        Parses numbers and the unit (``%`` or a word) directly following them.

        Fiscal-year spans such as '2022-23' are removed first, so neither
        half is read as a value. Bare numbers between 1990 and 2030 without
        a unit are treated as years and ignored.

        Returns:
            A list of ``{'val': float, 'unit': str or None}`` dicts in order
            of appearance; ``unit`` is lower-cased.
        """
        # Without this, the "23" of "2022-23" would be picked up as a number.
        cleaned = re.sub(r'\b20\d{2}-\d{2}\b', ' ', text)

        pattern = r'(\d+(?:,\d+)*(?:\.\d+)?)\s*([%a-zA-Z]+)?'
        results = []
        for num_str, unit in re.findall(pattern, cleaned):
            try:
                val = float(num_str.replace(',', ''))
            except ValueError:
                continue

            # Ignore likely years (1990-2030) unless they carry a unit.
            # findall yields '' (never None) for a missing unit.
            if 1990 <= val <= 2030 and not unit:
                continue

            results.append({'val': val, 'unit': unit.strip().lower() if unit else None})
        return results

    def _detect_numeric_conflict(self, claims: list["ParsedClaim"]) -> list:
        """Checks for value mismatches in same-unit claims.

        Every pair of claims is compared, except pairs that name two
        different fiscal years. Two same-unit numbers conflict when they
        differ by more than 5% of the larger magnitude.
        """
        conflicts = []
        for i in range(len(claims)):
            for j in range(i + 1, len(claims)):
                c1, c2 = claims[i], claims[j]

                # Scope check: if both mention a fiscal year and they are
                # DIFFERENT, it is just different data for different times.
                year_1 = self._extract_fiscal_year(c1.text)
                year_2 = self._extract_fiscal_year(c2.text)
                if self._UNKNOWN not in (year_1, year_2) and year_1 != year_2:
                    continue

                nums1 = self._extract_numeric(c1.text)
                nums2 = self._extract_numeric(c2.text)

                for n1 in nums1:
                    for n2 in nums2:
                        if n1['unit'] != n2['unit']:
                            continue
                        # Both zero: equal, and the base below would be 0.
                        if n1['val'] == 0 and n2['val'] == 0:
                            continue
                        diff = abs(n1['val'] - n2['val'])
                        base = max(abs(n1['val']), abs(n2['val']))

                        if diff / base > 0.05: # 5% tolerance
                            conflicts.append({
                                "type": "numeric_conflict",
                                "description": f"Value mismatch: {n1['val']} vs {n2['val']} ({n1['unit']})",
                                "claim_a": c1,
                                "claim_b": c2
                            })
        return conflicts

    def _detect_temporal_conflict(self, claims: list["ParsedClaim"]) -> list:
        """Same Doc ID, Different Dates/Versions -> Contradiction"""
        conflicts = []
        for i in range(len(claims)):
            for j in range(i + 1, len(claims)):
                c1, c2 = claims[i], claims[j]
                if c1.doc_id == c2.doc_id and c1.date != c2.date:
                    conflicts.append({
                        "type": "temporal_conflict",
                        "description": f"Version mismatch within {c1.doc_id}: {c1.date} vs {c2.date}",
                        "claim_a": c1,
                        "claim_b": c2
                    })
        return conflicts

    def _cluster_claims(self, claims: list["ParsedClaim"], threshold=0.95):
        """
        Groups claims by semantic similarity.

        Uses average-linkage agglomerative clustering on cosine distance with
        a distance cut-off of ``1 - threshold``. The default threshold is
        kept high (0.95) to reduce noise.

        Returns:
            A list of clusters, each a list of claims.
        """
        if len(claims) < 2:
            return [[c] for c in claims]

        vectors = np.array([c.vector for c in claims])
        clustering = AgglomerativeClustering(
            n_clusters=None,
            metric='cosine',
            linkage='average',
            distance_threshold=1 - threshold
        )
        labels = clustering.fit_predict(vectors)

        clusters = {}
        for claim, label in zip(claims, labels):
            clusters.setdefault(label, []).append(claim)
        return list(clusters.values())

    def _known_date(self, claim: dict) -> str:
        """Return the claim's date, or '' when it is missing or the 'unknown' placeholder."""
        date = claim.get('date') or ''
        return '' if date == self._UNKNOWN else date

    def resolve_temporal_conflicts(self, conflict_report: dict, chunks: list) -> tuple[dict, list]:
        """
        Auto-resolves conflicts where one claim is strictly newer than the other.

        Dates are compared as strings. A conflict is only resolved when both
        claims carry a real date; a missing date or the 'unknown' placeholder
        never wins or loses.

        Args:
            conflict_report: Report as produced by ``detect_conflicts``.
            chunks: The retrieved chunk dicts. Chunks without an ``'id'`` key
                are always kept.

        Returns:
            ``(report, chunks)``. When at least one conflict was resolved, a
            new report (with ``resolved_count``) and the chunks minus those
            that held the older claim; otherwise both inputs unchanged.
        """
        if not conflict_report['conflict_detected']:
            return conflict_report, chunks

        conflicts = conflict_report.get('conflicts', [])
        chunks_to_drop = set()
        resolved_indices = set()

        for i, conflict in enumerate(conflicts):
            c1 = conflict['claim_1']
            c2 = conflict['claim_2']

            d1 = self._known_date(c1)
            d2 = self._known_date(c2)

            # If both dates are known and differ, prefer the newer one
            if d1 and d2 and d1 != d2:
                older = c2 if d1 > d2 else c1
                chunks_to_drop.add(older['original_chunk_id'])
                resolved_indices.add(i)

        # If we resolved anything, filter the output
        if resolved_indices:
            clean_chunks = [c for c in chunks if c.get('id') not in chunks_to_drop]
            remaining_conflicts = [
                c for idx, c in enumerate(conflicts)
                if idx not in resolved_indices
            ]
            new_report = {
                "conflict_detected": len(remaining_conflicts) > 0,
                "count": len(remaining_conflicts),
                "conflicts": remaining_conflicts,
                "resolved_count": len(resolved_indices)
            }
            return new_report, clean_chunks

        return conflict_report, chunks

    def detect_conflicts(self, retrieved_chunks: list) -> dict:
        """Find numeric and temporal conflicts among the claims in ``retrieved_chunks``.

        Args:
            retrieved_chunks: Dicts with ``content`` and optionally
                ``metadata`` and ``id``.

        Returns:
            A dict with ``conflict_detected``, ``count`` and ``conflicts``
            (one entry per distinct pair of claim texts). When no claim
            sentence was found it also carries a ``reason``.
        """
        all_claims = []

        # 1. Extract Claims
        for chunk in retrieved_chunks:
            # Filter before embedding so only claim sentences are encoded.
            claim_sentences = [
                sentence for sentence in sent_tokenize(chunk['content'])
                if any(marker in sentence.lower() for marker in self.claim_markers)
            ]
            if not claim_sentences:
                continue

            metadata = chunk.get('metadata') or {}
            vectors = self.embedding_model.encode(claim_sentences)

            for text, vector in zip(claim_sentences, vectors):
                all_claims.append(ParsedClaim(
                    text=text,
                    source=metadata.get('source', self._UNKNOWN),
                    date=metadata.get('date', self._UNKNOWN),
                    doc_id=metadata.get('doc_id', self._UNKNOWN),
                    vector=vector,
                    original_chunk_id=chunk.get('id', self._UNKNOWN)
                ))

        if not all_claims:
            return {
                "conflict_detected": False,
                "reason": "No claims extracted",
                "count": 0,
                "conflicts": []
            }

        # 2. Cluster
        claim_clusters = self._cluster_claims(all_claims)
        detected_conflicts = []

        # 3. Type Conflicts
        for cluster_id, cluster in enumerate(claim_clusters):
            found = self._detect_numeric_conflict(cluster) + self._detect_temporal_conflict(cluster)
            for conflict in found:
                conflict['cluster_id'] = cluster_id
            detected_conflicts.extend(found)

        # 4. Deduplication
        unique_conflicts = []
        seen_signatures = set()

        for c in detected_conflicts:
            # Signature = sorted text of both claims
            sig = tuple(sorted([c['claim_a'].text, c['claim_b'].text]))
            if sig in seen_signatures:
                continue
            seen_signatures.add(sig)

            unique_conflicts.append({
                "type": c['type'],
                "cluster_id": c['cluster_id'],
                "description": c['description'],
                "claim_1": {
                    "text": c['claim_a'].text,
                    "source": c['claim_a'].source,
                    "date": c['claim_a'].date,
                    "original_chunk_id": c['claim_a'].original_chunk_id
                },
                "claim_2": {
                    "text": c['claim_b'].text,
                    "source": c['claim_b'].source,
                    "date": c['claim_b'].date,
                    "original_chunk_id": c['claim_b'].original_chunk_id
                }
            })

        return {
            "conflict_detected": bool(unique_conflicts),
            "count": len(unique_conflicts),
            "conflicts": unique_conflicts
        }

In [11]:
!pip install -q -U google-generativeai

In [12]:
import google.generativeai as genai
import json
import os
from google.colab import userdata

class LLMNarrator:
    """Builds prompts for, and collects text answers from, a Gemini model.

    The class offers two prompt builders (one for a structured conflict
    report, one for a plain retrieval-augmented question) and a single
    ``generate`` call that sends the resulting prompt pair to the model.
    """

    def __init__(self, api_key: str, model_name: str = "gemini-3-flash-preview"):
        """Configure the ``genai`` client and remember which model to use.

        Args:
            api_key: Key forwarded to ``genai.configure``.
            model_name: Model identifier passed to ``genai.GenerativeModel``
                every time ``generate`` is called.
        """
        genai.configure(api_key=api_key)
        self.model_name = model_name

        # System instruction attached to every model created in `generate`.
        self.system_instruction = """
        You are given different claims that come from retrieved document.
        Your job is to analyze those and produce an answer
        """

    def generate(self, context_prompt: str, user_content: str) -> str:
        """Send one prompt to the configured model and return its text.

        Args:
            context_prompt: Framing text placed first in the prompt.
            user_content: Payload (context, data, question) placed after it.

        Returns:
            ``response.text`` from the model. If anything raises while the
            model is created or called, the exception is not propagated;
            a string starting with "❌ Gemini Error:" is returned instead.
        """
        try:
            # A model object is created per call so it always reflects the
            # current `model_name` / `system_instruction` attributes.
            model = genai.GenerativeModel(
                model_name=self.model_name,
                system_instruction=self.system_instruction
            )

            # Both parts travel in one prompt, separated by a blank line.
            full_prompt = f"{context_prompt}\n\n{user_content}"

            response = model.generate_content(full_prompt)
            return response.text

        except Exception as e:
            return f"❌ Gemini Error: {str(e)}"

    def construct_conflict_prompt(self, conflict_data: dict) -> tuple[str, str]:
        """Format a conflict report as a ``(context_prompt, user_content)`` pair.

        Args:
            conflict_data: Conflict report to embed as indented JSON.

        Returns:
            The fixed conflict framing sentence and a body containing the
            JSON dump followed by the explanation task.
        """
        context_prompt = "CONFLICT DETECTED. The following JSON object describes a contradiction in the retrieved documents."

        # default=str keeps values json cannot encode natively (for example
        # date objects) from raising TypeError; they are embedded as str(value).
        # Output is unchanged for data that was already serializable.
        report_json = json.dumps(conflict_data, indent=2, default=str)

        user_content = f"""
        Structured Conflict Data:
        {report_json}

        Task:
        Explain this conflict to the user. Cite the 'source' and 'date' for every opposing claim.
        """
        return context_prompt, user_content

    def construct_normal_prompt(self, chunks: list, query: str) -> tuple[str, str]:
        """Build the standard RAG ``(context_prompt, user_content)`` pair.

        Args:
            chunks: Dicts with a 'content' entry and an optional 'metadata'
                dict; 'doc_id' and 'date' are read from the metadata.
            query: The user's question.

        Returns:
            The fixed instruction sentence and a body listing every chunk
            under a source/date header, followed by the question.
        """
        context_prompt = "Answer the user's question based ONLY on the context provided below."

        def _render(chunk: dict) -> str:
            # Metadata that is missing or None is treated as empty, so the
            # header shows "None" for doc_id/date instead of crashing.
            meta = chunk.get('metadata') or {}
            return f"--- Source: {meta.get('doc_id')} (Date: {meta.get('date')}) ---\n{chunk['content']}"

        context_text = "\n\n".join(_render(c) for c in chunks)

        user_content = f"""
        Context:
        {context_text}

        Question: {query}
        """
        return context_prompt, user_content


All support for the `google.generativeai` package has ended. It will no longer be receiving 
updates or bug fixes. Please switch to the `google.genai` package as soon as possible.
See README for more details:

https://github.com/google-gemini/deprecated-generative-ai-python/blob/main/README.md

  loader.exec_module(module)


In [13]:
class ConflictAwareRAG:
    """Pipeline: retrieve chunks, gate for conflicts, detect, then narrate.

    Every collaborator is injected; this class only sequences the calls
    and shapes the returned dictionary.
    """

    def __init__(self, indexer, gate, detector, llm_narrator):
        """Keep references to the four collaborating components.

        Args:
            indexer: Object providing ``hybrid_search(query, top_k=...)``.
            gate: Object providing ``evaluate(chunks)``.
            detector: Object providing ``detect_conflicts(chunks)``.
            llm_narrator: Object providing the prompt builders and ``generate``.
        """
        self.indexer = indexer
        self.gate = gate
        self.detector = detector
        self.llm = llm_narrator

    def _standard_answer(self, chunks, query):
        """Answer the query via the plain RAG prompt and wrap the result."""
        sys_prompt, user_msg = self.llm.construct_normal_prompt(chunks, query)
        answer = self.llm.generate(sys_prompt, user_msg)
        return {"type": "standard_answer", "content": answer}

    def process_query(self, query: str):
        """Run the full pipeline for one query.

        Returns:
            ``{"type": "standard_answer", "content": ...}`` when the gate does
            not trigger or the detector reports no conflict; otherwise
            ``{"type": "conflict_exposed", "structured_data": ...,
            "narrative_explanation": ...}``.
        """
        print(f"🔹 Processing Query: '{query}'")

        # Stage 1: hybrid retrieval. Only the dense results feed the rest of
        # the pipeline; the sparse half of the return value is not used here.
        dense_res, _sparse_indices = self.indexer.hybrid_search(query, top_k=5)

        # Flatten the first result list into uniform chunk dicts.
        retrieved_chunks = []
        if dense_res['documents']:
            retrieved_chunks = [
                {
                    'id': dense_res['ids'][0][pos],
                    'content': doc_text,
                    'metadata': dense_res['metadatas'][0][pos]
                }
                for pos, doc_text in enumerate(dense_res['documents'][0])
            ]

        # Stage 2: the entry gate decides whether conflict detection is needed.
        gate_decision = self.gate.evaluate(retrieved_chunks)
        if not gate_decision['trigger_conflict_pipeline']:
            print("✅ Gate Passed: No obvious conflict. Proceeding to Standard RAG.")
            return self._standard_answer(retrieved_chunks, query)

        # Stage 3: gate fired, so run the conflict detector.
        print(f"⚠️ Gate Triggered! Reasons: {gate_decision['reasons']}")
        print("   -> Running Conflict Detection Engine...")
        conflict_report = self.detector.detect_conflicts(retrieved_chunks)

        # Stage 4a: nothing concrete found, so fall back to the standard path.
        if not conflict_report['conflict_detected']:
            print("   ℹ️ False Alarm: Gate triggered, but deeper analysis found no specific claim conflicts.")
            return self._standard_answer(retrieved_chunks, query)

        # Stage 4b: conflict confirmed; the detector's report goes to the LLM
        # unchanged and is also returned alongside the narrative.
        print(f"   🚨 CONFLICT CONFIRMED. Found {conflict_report['count']} disagreements.")
        sys_prompt, user_msg = self.llm.construct_conflict_prompt(conflict_report)
        explanation = self.llm.generate(sys_prompt, user_msg)
        return {
            "type": "conflict_exposed",
            "structured_data": conflict_report,
            "narrative_explanation": explanation
        }

In [31]:
# Step 1: wire up the components. `rag_index` must already exist from the
# earlier indexing cells; the gate and the detector are each built from its
# dense model.
gate = ConflictEntryGate(rag_index.dense_model)
detector = ConflictDetector(rag_index.dense_model)
# Replace the placeholder with a real Gemini API key before running.
llm = LLMNarrator(api_key="Your API KEY")

# Step 2: assemble the end-to-end engine from those parts.
rag_engine = ConflictAwareRAG(
    indexer=rag_index,
    gate=gate,
    detector=detector,
    llm_narrator=llm,
)

# Step 3: test case 1 - a query where a conflict is expected if the indexed
# reports contain diverging figures.
print("\n--- TEST CASE 1: Numeric/Temporal Conflict ---")
result_1 = rag_engine.process_query("Money spent on Welfare of SC, ST, OBC and Minorities in 2023-24")
print("\n--- FINAL OUTPUT ---")
if result_1['type'] != 'conflict_exposed':
    # Standard answer: just the generated text.
    print(result_1['content'])
else:
    # Conflict result: structured report first, then the LLM narrative.
    print("JSON OUTPUT (Machine Readable):")
    print(json.dumps(result_1['structured_data'], indent=2))
    print("\nNARRATIVE OUTPUT (Human Readable):")
    print(result_1['narrative_explanation'])



--- TEST CASE 1: Numeric/Temporal Conflict ---
🔹 Processing Query: 'Money spent on Welfare of SC, ST, OBC and Minorities in 2023-24'
--- Dense Results ---
Doc: # Annexure 2:  Comparison of 2023-24 Budget Estima... | ID: 70b350813f59a0cf
Doc: # Expenditure in 2023-24

| Sectors               ... | ID: 4a7f9d31b2b81631
Doc: # Expenditure in 2024-25

| Sector                ... | ID: 40cf82521848f864
Doc: # Credibility of revised estimates

| Sectors     ... | ID: 8b9bb603d1a6cb46
Doc: # Expenditure in 2024-25

Committed expenditure: C... | ID: b89b5e85b07797f9

--- Sparse Results (BM25) ---
Score: 26.9336 | ID: 70b350813f59a0cf
Score: 24.5059 | ID: dceaf796b6f86b0e
Score: 24.0074 | ID: b35b3dbc1688ff77
Score: 23.9523 | ID: cb78be2347611e63
Score: 21.1693 | ID: 0b07b6b161535129
⚠️ Gate Triggered! Reasons: ["Multiple sources detected: {'budgetReport25-26', 'budgetReport24-25', 'budgetReport23-24'}", 'High semantic overlap between different documents']
   -> Running Conflict Detection Engi