In [None]:
!pip install docling

In [None]:
import os
from pathlib import Path
from docling.document_converter import DocumentConverter

# 1. Setup paths
input_dir = Path("/content/pdfs")
# Absolute path so the Markdown lands where the ingestion cell reads it from
# ('/content/extracted_md') regardless of the current working directory.
output_dir = Path("/content/extracted_md")
# parents=True so a missing parent directory does not abort the notebook.
output_dir.mkdir(parents=True, exist_ok=True)

# 2. Initialize Docling Converter (shared by extract_pdfs below)
converter = DocumentConverter()

def extract_pdfs():
    """Convert every ``*.pdf`` in ``input_dir`` to Markdown in ``output_dir``.

    Uses the module-level ``converter``. Each PDF is written as
    ``<stem>.md``. A failure on one file is reported and does not stop the
    remaining files; the final summary states how many files were actually
    converted instead of unconditionally claiming success.
    """
    # Sorted so the processing order (and the log) is reproducible.
    pdf_files = sorted(input_dir.glob("*.pdf"))

    if not pdf_files:
        print(f"No PDF files found in {input_dir}")
        return

    print(f"Found {len(pdf_files)} files. Starting extraction...")

    failed = []
    for pdf_path in pdf_files:
        try:
            print(f"Processing: {pdf_path.name}...")

            # Convert PDF to Docling's internal format
            result = converter.convert(pdf_path)

            # Export to Markdown (the format the chunker consumes)
            md_output = result.document.export_to_markdown()

            # Save the file
            (output_dir / f"{pdf_path.stem}.md").write_text(md_output, encoding="utf-8")

        except Exception as e:
            # Best-effort batch: remember the failure and keep going.
            failed.append(pdf_path.name)
            print(f"Error processing {pdf_path.name}: {e}")

    if failed:
        converted = len(pdf_files) - len(failed)
        print(
            f"\nExtracted {converted}/{len(pdf_files)} files to: {output_dir} "
            f"(failed: {', '.join(failed)})"
        )
    else:
        print(f"\nSuccess! Extracted files are in: {output_dir}")

# Run the PDF -> Markdown extraction when this cell executes.
extract_pdfs()


In [None]:
!pip install -q sentence-transformers chromadb tiktoken

In [3]:
!pip install -q rank_bm25

In [10]:
import os
import glob
import re
import hashlib
import chromadb
import tiktoken
import numpy as np
from typing import List, Dict
from dataclasses import dataclass, asdict
from sentence_transformers import SentenceTransformer
from rank_bm25 import BM25Okapi
from tqdm.notebook import tqdm

# --- 1. Chunking & Data Structures ---

@dataclass
class Chunk:
    """One retrievable piece of a Markdown document plus its metadata."""
    # Short hex identifier produced by MarkdownChunker.generate_chunk_id.
    id: str
    # Chunk text, including the reconstructed header prefix.
    content: str
    # Name/path of the document the chunk came from.
    source_file: str
    # Header titles from the outermost ancestor down to the chunk's own section.
    section_path: list[str]
    # Token count of `content` as measured by the chunker's tokenizer.
    token_count: int
    # Whether the chunk is flagged as containing a Markdown table.
    has_table: bool

class MarkdownChunker:
    """Split Markdown into header-aware chunks bounded by a token budget.

    The document is cut at ATX headers (``#`` .. ``######``). Each section
    becomes one chunk prefixed with its header path; sections larger than
    ``max_tokens`` are further split on blank-line paragraph boundaries.
    Header-looking lines inside fenced code blocks are treated as content.
    """
    HEADER_PATTERN = re.compile(r'^(#{1,6})\s+(.+)$', re.MULTILINE)
    TABLE_PATTERN = re.compile(r'^\|.+\|(?:\n^\|.+\|$)+', re.MULTILINE)
    # Opening/closing marker of a fenced code block (``` or ~~~).
    FENCE_PATTERN = re.compile(r'^\s{0,3}(`{3,}|~{3,})')

    def __init__(self, max_tokens: int = 512, model: str = "cl100k_base"):
        """Store the token budget and load the tiktoken encoding named ``model``."""
        self.max_tokens = max_tokens
        self.tokenizer = tiktoken.get_encoding(model)

    def count_tokens(self, text: str) -> int:
        """Return the number of tokens the tokenizer produces for ``text``."""
        return len(self.tokenizer.encode(text))

    def generate_chunk_id(self, source: str, content: str, index: int) -> str:
        """Return a 16-hex-char id derived from source, index and the first 100 chars."""
        hash_input = f"{source}:{index}:{content[:100]}"
        return hashlib.sha256(hash_input.encode()).hexdigest()[:16]

    def _finalize_section(self, level: int, title: str, content_lines: list) -> dict:
        """Build the section dict for the lines collected under one header."""
        section_content = '\n'.join(content_lines)
        return {
            'level': level,
            'title': title,
            'content': section_content.strip(),
            'has_table': bool(self.TABLE_PATTERN.search(section_content)),
        }

    def parse_sections(self, content: str) -> list[dict]:
        """Split ``content`` into sections at Markdown headers.

        Returns a list of dicts with keys ``level`` (0 for text before the
        first header), ``title``, ``content`` and ``has_table``.
        """
        sections = []
        level, title, content_lines = 0, '', []
        open_fence = None  # fence marker of the code block we are inside, if any

        for line in content.split('\n'):
            fence_match = self.FENCE_PATTERN.match(line)
            if fence_match:
                marker = fence_match.group(1)
                if open_fence is None:
                    open_fence = marker
                elif marker[0] == open_fence[0] and len(marker) >= len(open_fence):
                    open_fence = None
                content_lines.append(line)
                continue

            # Inside a code fence a leading '#' is a comment, not a header.
            header_match = self.HEADER_PATTERN.match(line) if open_fence is None else None
            if header_match is None:
                content_lines.append(line)
                continue

            # A new header closes the section collected so far.
            if content_lines or title:
                sections.append(self._finalize_section(level, title, content_lines))
            level = len(header_match.group(1))
            title = header_match.group(2).strip()
            content_lines = []

        if content_lines or title:
            sections.append(self._finalize_section(level, title, content_lines))
        return sections

    def build_section_path(self, sections: list[dict], current_idx: int) -> list[str]:
        """Return the titles of the section's ancestors followed by its own title."""
        current = sections[current_idx]
        path = []
        if current['title']:
            path.append(current['title'])
        current_level = current['level']
        # Walk backwards, picking up the nearest header of each shallower level.
        for i in range(current_idx - 1, -1, -1):
            section = sections[i]
            if section['level'] < current_level and section['title']:
                path.insert(0, section['title'])
                current_level = section['level']
                if current_level == 1:
                    break
        return path

    def _build_header_prefix(self, section_path: list[str]) -> str:
        """Render ``section_path`` as nested Markdown headers followed by a blank line."""
        if not section_path:
            return ""
        lines = ['#' * depth + ' ' + title for depth, title in enumerate(section_path, start=1)]
        return '\n'.join(lines) + '\n\n'

    def _make_chunk(self, header_prefix: str, parts: list, section_path: list[str],
                    source_file: str, index: int):
        """Assemble a Chunk from accumulated paragraphs of a split section."""
        content = header_prefix + '\n\n'.join(parts)
        return Chunk(
            id=self.generate_chunk_id(source_file, content, index),
            content=content,
            source_file=source_file,
            section_path=section_path,
            token_count=self.count_tokens(content),
            # Detect tables per chunk instead of always reporting False.
            has_table=bool(self.TABLE_PATTERN.search(content)),
        )

    def split_large_section(self, text: str, section_path: list[str], source_file: str, start_index: int):
        """Yield chunks for a section that exceeds the token budget.

        Paragraphs (separated by blank lines) are packed greedily so that the
        paragraph tokens plus the header prefix stay within ``max_tokens``.
        Chunk ids are numbered from ``start_index``.
        """
        header_prefix = self._build_header_prefix(section_path)
        available_tokens = self.max_tokens - self.count_tokens(header_prefix)
        pending, pending_tokens = [], 0
        chunk_index = start_index

        for para in re.split(r'\n\n+', text):
            para = para.strip()
            if not para:
                continue
            para_tokens = self.count_tokens(para)
            if pending and pending_tokens + para_tokens > available_tokens:
                yield self._make_chunk(header_prefix, pending, section_path, source_file, chunk_index)
                chunk_index += 1
                pending, pending_tokens = [], 0
            # NOTE(review): a single paragraph larger than the budget is still
            # emitted whole, so such a chunk can exceed max_tokens.
            pending.append(para)
            pending_tokens += para_tokens

        if pending:
            yield self._make_chunk(header_prefix, pending, section_path, source_file, chunk_index)

    def chunk_document(self, content: str, source_file: str):
        """Yield Chunk objects for a whole Markdown document."""
        sections = self.parse_sections(content)
        chunk_index = 0
        for i, section in enumerate(sections):
            section_path = self.build_section_path(sections, i)
            header_str = self._build_header_prefix(section_path)
            full_text = (header_str + section['content']).strip()
            token_count = self.count_tokens(full_text)

            if token_count <= self.max_tokens:
                yield Chunk(
                    id=self.generate_chunk_id(source_file, full_text, chunk_index),
                    content=full_text,
                    source_file=source_file,
                    section_path=section_path,
                    token_count=token_count,
                    has_table=section['has_table'],
                )
                chunk_index += 1
                continue

            # Keep the running index in step with the ids assigned by the splitter.
            for chunk in self.split_large_section(section['content'], section_path, source_file, chunk_index):
                yield chunk
                chunk_index += 1

# --- 2. Hybrid System Class ---

class HybridRAG:
    """Hybrid retriever combining dense (Chroma) and sparse (BM25) search.

    Dense vectors live in a persistent Chroma collection; BM25 works on an
    in-memory registry of chunk texts. The registry is reloaded from the
    collection on start-up so both sides cover the same chunks after a
    restart, and the BM25 index is invalidated whenever new chunks are stored.
    """

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """Load the embedding model, open the Chroma collection and restore the registry."""
        # Dense Setup
        print(f"üîÑ Loading Dense Model: {model_name}...")
        self.embedding_model = SentenceTransformer(model_name)
        self.chroma_client = chromadb.PersistentClient(path="./chroma_db")
        self.collection = self.chroma_client.get_or_create_collection(name="doc_chunks")

        # Sparse Setup (BM25)
        # In-memory maps of chunk id -> text / metadata; BM25 is built from the texts.
        self.chunk_registry: Dict[str, str] = {}
        self.chunk_metadata: Dict[str, dict] = {}
        self.bm25 = None
        self.bm25_ids: List[str] = []

        self.chunker = MarkdownChunker(max_tokens=512)

        # The collection persists across sessions but the registry does not.
        self._load_registry_from_store()

    def _load_registry_from_store(self):
        """Fill the in-memory registry from chunks already persisted in Chroma."""
        stored = self.collection.get(include=["documents", "metadatas"])
        ids = stored.get("ids") or []
        documents = stored.get("documents") or [None] * len(ids)
        metadatas = stored.get("metadatas") or [None] * len(ids)
        for doc_id, document, metadata in zip(ids, documents, metadatas):
            if document is None:
                continue
            self.chunk_registry[doc_id] = document
            self.chunk_metadata[doc_id] = dict(metadata or {})

    def _tokenize(self, text: str) -> List[str]:
        """Lower-case word tokenizer for BM25.

        Splits on non-word characters so punctuation does not stick to terms
        ("gateways?" and "gateways" become the same token).
        """
        return re.findall(r"\w+", text.lower())

    def embed_and_store(self, markdown_text: str, source_name: str):
        """
        1. Chunks document
        2. Uploads to Chroma (Dense)
        3. Updates local registry for BM25 (Sparse)
        """
        chunks = list(self.chunker.chunk_document(markdown_text, source_name))
        if not chunks:
            return

        ids = [c.id for c in chunks]
        documents = [c.content for c in chunks]
        metadatas = [{
            "source": c.source_file,
            "token_count": c.token_count,
            "has_table": c.has_table,
            "section_path": " > ".join(c.section_path)
        } for c in chunks]

        # 1. Update Chroma (Dense) first, so a failed upsert leaves the
        #    registry untouched and both sides stay in agreement.
        embeddings = self.embedding_model.encode(documents, convert_to_tensor=False).tolist()
        self.collection.upsert(ids=ids, embeddings=embeddings, documents=documents, metadatas=metadatas)

        # 2. Update In-Memory Registry (for BM25 later)
        for chunk_id, document, metadata in zip(ids, documents, metadatas):
            self.chunk_registry[chunk_id] = document
            self.chunk_metadata[chunk_id] = metadata

        # BM25 is not rebuilt per file (too slow); mark any existing index as
        # stale so hybrid_query()/build_bm25_index() rebuilds it.
        self.bm25 = None

    def build_bm25_index(self):
        """Build the BM25 index over every chunk in the registry.

        Call after all documents are ingested. With an empty registry no
        index is built and ``self.bm25`` stays ``None``.
        """
        print(f"üèóÔ∏è Building BM25 Index for {len(self.chunk_registry)} chunks...")

        # Create parallel lists of IDs and Tokenized Docs
        self.bm25_ids = list(self.chunk_registry)
        if not self.bm25_ids:
            # An empty corpus cannot be indexed (BM25Okapi averages document
            # length over the corpus size).
            self.bm25 = None
            print("No chunks ingested; BM25 index not built.")
            return

        corpus = [self._tokenize(self.chunk_registry[doc_id]) for doc_id in self.bm25_ids]
        self.bm25 = BM25Okapi(corpus)
        print("‚úÖ BM25 Index Ready.")

    @staticmethod
    def _first_row(query_result, key: str, size: int) -> list:
        """Return the first query's values for ``key``, padded with None to ``size``."""
        rows = query_result.get(key)
        row = rows[0] if rows else None
        row = list(row) if row else []
        return row + [None] * (size - len(row))

    def hybrid_query(self, query_text: str, top_k: int = 5, alpha: float = 0.5):
        """
        Performs Hybrid Search using Weighted Scoring.
        Alpha 1.0 = Pure Dense
        Alpha 0.0 = Pure Sparse

        Returns a list of up to ``top_k`` dicts sorted by descending fused
        score, each with ``id``, ``score``, ``content``, ``dense_score``,
        ``sparse_score``, ``source_file`` and ``metadata``.
        """
        if self.bm25 is None:
            print("‚ö†Ô∏è BM25 Index not built! Running build_bm25_index() now...")
            self.build_bm25_index()

        candidate_count = top_k * 2  # Fetch more candidates for re-ranking

        # 1. Dense Search (Chroma)
        query_embedding = self.embedding_model.encode([query_text]).tolist()
        chroma_res = self.collection.query(
            query_embeddings=query_embedding,
            n_results=candidate_count
        )

        dense_hits = {}
        dense_docs = {}
        dense_meta = {}
        if chroma_res['ids'] and chroma_res['ids'][0]:
            ids = chroma_res['ids'][0]
            distances = chroma_res['distances'][0]
            documents = self._first_row(chroma_res, 'documents', len(ids))
            metadatas = self._first_row(chroma_res, 'metadatas', len(ids))
            # Distances are "lower is better"; invert relative to the worst
            # candidate. NOTE(review): this is a relative normalization, so
            # the farthest candidate (including a lone hit) scores ~0.
            max_dist = max(distances) + 1e-6
            for doc_id, dist, document, metadata in zip(ids, distances, documents, metadatas):
                dense_hits[doc_id] = 1 - (dist / max_dist)
                dense_docs[doc_id] = document
                dense_meta[doc_id] = metadata

        # 2. Sparse Search (BM25) -- skipped when no index could be built
        sparse_hits = {}
        if self.bm25 is not None:
            bm25_scores = self.bm25.get_scores(self._tokenize(query_text))
            max_bm25 = max(bm25_scores) + 1e-6
            # Map integer indices back to string IDs, best first
            for idx in np.argsort(bm25_scores)[::-1][:candidate_count]:
                if bm25_scores[idx] <= 0:
                    # No lexical match from here on; do not pad the
                    # candidate set with unrelated chunks.
                    break
                sparse_hits[self.bm25_ids[idx]] = float(bm25_scores[idx] / max_bm25)

        # 3. Fuse Scores
        combined_scores = {}
        for doc_id in set(dense_hits) | set(sparse_hits):
            d_score = dense_hits.get(doc_id, 0.0)
            s_score = sparse_hits.get(doc_id, 0.0)
            combined_scores[doc_id] = (d_score * alpha) + (s_score * (1 - alpha))

        # 4. Sort and Return
        sorted_ids = sorted(combined_scores, key=combined_scores.get, reverse=True)[:top_k]

        results = []
        for doc_id in sorted_ids:
            # Prefer the in-memory copy; fall back to what Chroma returned.
            content = self.chunk_registry.get(doc_id)
            if content is None:
                content = dense_docs.get(doc_id) or "Error: Content missing"
            metadata = self.chunk_metadata.get(doc_id) or dense_meta.get(doc_id) or {}
            results.append({
                "id": doc_id,
                "score": combined_scores[doc_id],
                "content": content,
                "dense_score": dense_hits.get(doc_id, 0.0),
                "sparse_score": sparse_hits.get(doc_id, 0.0),
                # Lets downstream answer generation cite the source document.
                "source_file": metadata.get("source", "Unknown Source"),
                "metadata": metadata
            })

        return results

# --- 3. The Ingestion Loop ---

def ingest_directory(directory_path: str, rag_system):
    """Feed every ``*.md`` file under ``directory_path`` into ``rag_system``.

    The directory is searched recursively. Each non-blank file is handed to
    ``rag_system.embed_and_store`` together with its path relative to
    ``directory_path``; per-file errors are printed and skipped. After the
    loop ``rag_system.build_bm25_index()`` is called once. Returns ``None``
    (and builds nothing) when no Markdown file is found.
    """
    print(f"üìÇ Scanning '{directory_path}'...")
    search_pattern = os.path.join(directory_path, "**", "*.md")
    markdown_paths = glob.glob(search_pattern, recursive=True)

    if len(markdown_paths) == 0:
        print("‚ö†Ô∏è No markdown files found.")
        return

    print(f"üöÄ Processing {len(markdown_paths)} files...")

    for md_path in tqdm(markdown_paths):
        try:
            with open(md_path, "r", encoding="utf-8") as handle:
                text = handle.read()

            # Whitespace-only files carry nothing to index.
            if text.strip():
                source_name = os.path.relpath(md_path, directory_path)
                # One call covers both the dense upload and the sparse registry.
                rag_system.embed_and_store(text, source_name)

        except Exception as e:
            print(f"‚ùå Error in {md_path}: {e}")

    # The sparse index is finalized once, after every file has been stored.
    rag_system.build_bm25_index()

# --- 4. Execution ---



In [None]:
# 1. Initialize
rag = HybridRAG()

# 2. Ingest the extracted Markdown with ingest_directory() (defined above).
# It already builds the BM25 index once all files are loaded, and
# hybrid_query() builds it on demand if it is missing, so no separate
# build_bm25_index() call is needed here.
ingest_directory('/content/extracted_md', rag)

# 3. Test Hybrid Search
query = "retry logic for payment gateways"
results = rag.hybrid_query(query, top_k=3, alpha=0.5)

print(f"\nüîç Hybrid Results for: '{query}'\n")
for res in results:
    print(f"üìÑ [{res['id']}] Score: {res['score']:.4f} (Dense: {res['dense_score']:.2f}, Sparse: {res['sparse_score']:.2f})")
    print(f"   {res['content'][:100]}...")
    print("-" * 40)

In [None]:
# @title 1. Install & Load Spacy (for Entity Extraction)
!pip install -q spacy
!python -m spacy download en_core_web_sm

import spacy
import numpy as np
from typing import List, Dict, Any

# Load the small English model. `nlp` is a module-level pipeline that
# RetrievalJudge._extract_entities calls to get part-of-speech tags.
nlp = spacy.load("en_core_web_sm")

print("‚úÖ Spacy loaded for entity extraction.")

In [12]:
# @title 2. The Active Retrieval Control System

class RetrievalJudge:
    """Decides whether a retrieved result set is good enough to answer from.

    Three gates are applied in order: best score, average score, and overlap
    between the query's nouns and the top-ranked chunk.
    """

    def __init__(self,
                 min_max_score: float = 0.45,  # Threshold 1: At least one good hit
                 min_avg_score: float = 0.2,   # Threshold 2: Top-k shouldn't be garbage
                 min_entity_overlap: int = 2,  # Threshold 3: Must contain specific terms
                 overlap_bypass_score: float = 0.75):  # Best score that waives threshold 3
        self.min_max_score = min_max_score
        self.min_avg_score = min_avg_score
        self.min_entity_overlap = min_entity_overlap
        self.overlap_bypass_score = overlap_bypass_score

    def _extract_entities(self, text: str) -> set:
        """Extracts Nouns and Proper Nouns as 'Entities' using the module-level ``nlp``."""
        doc = nlp(text.lower())
        # Keep Nouns (NOUN) and Proper Nouns (PROPN) longer than 2 chars
        return {token.text for token in doc if token.pos_ in ['NOUN', 'PROPN'] and len(token.text) > 2}

    def evaluate(self, query: str, results: List[Dict]) -> Dict:
        """
        Judges the quality of the retrieved set.

        ``results`` is expected best-first; each item needs ``score`` and
        ``content``.
        Returns: { 'pass': bool, 'reason': str, 'metrics': dict }
        """
        if not results:
            return {"pass": False, "reason": "No results found", "metrics": {}}

        # 1. Calculate Score Metrics (plain floats, so metrics stay serializable)
        scores = [r['score'] for r in results]
        max_score = float(max(scores))
        avg_score = float(np.mean(scores))

        # 2. Calculate Entity Overlap against the top-ranked chunk only
        query_entities = self._extract_entities(query)
        top_chunk_text = results[0]['content'].lower()
        overlap_count = sum(1 for entity in query_entities if entity in top_chunk_text)

        # A query cannot match more entities than it contains, so cap the
        # requirement; otherwise a one-noun query could never pass this gate.
        required_overlap = min(self.min_entity_overlap, len(query_entities))

        metrics = {
            "max_score": round(max_score, 3),
            "avg_score": round(avg_score, 3),
            "overlap_count": overlap_count,
            "query_entities": sorted(query_entities)
        }

        # 3. Decision Logic (The "Gates")
        if max_score < self.min_max_score:
            return {"pass": False, "reason": f"Max similarity too low ({max_score} < {self.min_max_score})", "metrics": metrics}

        if avg_score < self.min_avg_score:
            return {"pass": False, "reason": f"Avg quality too low ({avg_score} < {self.min_avg_score})", "metrics": metrics}

        # Overlap is only enforced when the query had entities (required_overlap
        # is 0 otherwise) and is waived for a very confident top score.
        if overlap_count < required_overlap and max_score < self.overlap_bypass_score:
            return {"pass": False, "reason": f"Insufficient entity overlap ({overlap_count} < {required_overlap})", "metrics": metrics}

        return {"pass": True, "reason": "All checks passed", "metrics": metrics}


class QueryRewriter:
    """Stand-in query rewriter that simulates an LLM with fixed expansions."""

    def __init__(self, llm_client=None):
        # Kept for a future real LLM integration (e.g. a Gemini/OpenAI
        # client); the simulated logic below does not use it.
        self.llm_client = llm_client

    def generate_rewrite(self, original_query: str, fail_reason: str, history: List[str]) -> str:
        """
        Return a rewritten query chosen from the failure reason.

        The reason is matched case-insensitively: one mentioning "entity"
        appends " specifications definitions", one mentioning "similarity"
        prepends "detailed ", anything else appends " context".
        """

        # Prompt meant for a real LLM call. It is built here but not sent
        # anywhere yet (a real client call would consume it).
        system_prompt = f"""
        You are a Query Engineering AI. The user's original query failed retrieval.

        Original Query: "{original_query}"
        Failure Reason: "{fail_reason}"
        Previous Attempts: {history}

        Task: Generate a BETTER query to satisfy the retrieval system.
        - If failure was "Max similarity too low", try using broader keywords or synonyms.
        - If failure was "Entity overlap", include the specific missing technical terms explicitly.
        - If failure was "Avg quality too low", make the query more specific/targeted.

        Output ONLY the rewritten query text. Do not add explanations.
        """

        print(f"\nü§ñ [LLM] Rewriting query... (Reason: {fail_reason})")

        # Simulated rewrite for the demo: simplistic keyword expansions.
        reason = fail_reason.lower()
        if "entity" in reason:
            return " ".join([original_query, "specifications definitions"])
        if "similarity" in reason:
            return " ".join(["detailed", original_query])
        return " ".join([original_query, "context"])


class ActiveRetrievalLoop:
    """Retrieve -> judge -> rewrite loop that retries until quality is met."""

    def __init__(self, rag_system, judge: "RetrievalJudge", rewriter: "QueryRewriter", max_retries: int = 3):
        """
        rag_system: object exposing ``hybrid_query(query, top_k=...)``.
        judge: object exposing ``evaluate(query, results)``.
        rewriter: object exposing ``generate_rewrite(original, reason, history)``.
        max_retries: number of rewrites allowed after the first attempt.
        """
        self.rag = rag_system
        self.judge = judge
        self.rewriter = rewriter
        self.max_retries = max_retries

    def run(self, user_query: str):
        """Run the loop for ``user_query``.

        Always returns a dict with ``final_query`` (the last query actually
        executed), ``results``, ``attempts`` and ``status``; status is one of
        ``success``, ``failed_max_retries`` or ``failed_loop_error``.
        """
        current_query = user_query
        query_history = []
        results = []
        attempts = 0
        # At least one attempt is always made, even for a negative setting.
        last_attempt = max(self.max_retries, 0)

        print(f"üöÄ Starting Active Retrieval for: '{user_query}'")

        for attempt in range(last_attempt + 1):
            attempts = attempt + 1
            print(f"\n--- Attempt {attempts} ---")
            print(f"üîç Executing Query: '{current_query}'")

            # 1. Retrieve
            results = self.rag.hybrid_query(current_query, top_k=5)

            # 2. Judge
            evaluation = self.judge.evaluate(current_query, results)
            metrics = evaluation['metrics']

            print(f"üìä Metrics: MaxSim={metrics.get('max_score')} | Avg={metrics.get('avg_score')} | Overlap={metrics.get('overlap_count')}")

            # 3. Check Success
            if evaluation['pass']:
                print("‚úÖ Retrieval QUALITY MET! Proceeding to answer generation.")
                return {
                    "final_query": current_query,
                    "results": results,
                    "attempts": attempts,
                    "status": "success"
                }

            print(f"‚ùå Quality Check Failed: {evaluation['reason']}")

            # 4. Terminate if max retries reached
            if attempt == last_attempt:
                print("üõë Max retries reached. Returning best effort results.")
                return {
                    "final_query": current_query,
                    "results": results,
                    "attempts": attempts,
                    "status": "failed_max_retries"
                }

            # 5. Rewrite (always from the user's original query)
            query_history.append(current_query)
            rewritten = self.rewriter.generate_rewrite(user_query, evaluation['reason'], query_history)

            # Check for oscillating/duplicate queries. current_query is left
            # untouched so it keeps naming the query that produced `results`.
            if rewritten in query_history:
                print("‚ö†Ô∏è Loop Detected: Rewriter produced a duplicate query. Stopping.")
                break
            current_query = rewritten

        # Same shape as the other return paths so callers can read any key.
        return {
            "final_query": current_query,
            "results": results,
            "attempts": attempts,
            "status": "failed_loop_error"
        }

In [None]:
# @title 1. Install Google GenAI SDK
!pip install -q -U google-generativeai

import google.generativeai as genai
import os

# --- PROVIDE YOUR API KEY ---
# You can get one at https://aistudio.google.com/app/apikey
# The key is taken from the GOOGLE_API_KEY environment variable, or prompted
# for without echo, so it never has to be written into the notebook itself
# (where it would be saved and shared along with the file).
if not os.environ.get("GOOGLE_API_KEY"):
    from getpass import getpass
    os.environ["GOOGLE_API_KEY"] = getpass("Enter your Gemini API key: ")
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

In [None]:
# @title 2. Gemini Query Rewriter

class GeminiQueryRewriter:
    """Query rewriter backed by a Gemini model."""

    def __init__(self, model_name: str = "gemini-2.5-flash",
                 domain_description: str = "Terms and Conditions of the Indian Railways"):
        """
        model_name: Gemini model used for rewriting.
        domain_description: short description of the corpus, inserted into
            the prompt so rewrites use the right vocabulary.
        """
        self.model = genai.GenerativeModel(model_name)
        self.domain_description = domain_description

    @staticmethod
    def _clean_rewrite(raw_text: str) -> str:
        """Normalize the model output into a single-line query.

        Double quotes are dropped, whitespace (including newlines) is
        collapsed, and only *surrounding* single quotes/backticks are removed
        so apostrophes inside words ("passenger's") survive.
        """
        collapsed = " ".join(raw_text.replace('"', '').split())
        return collapsed.strip("'`").strip()

    def generate_rewrite(self, original_query: str, fail_reason: str, history: List[str]) -> str:
        """
        Calls Gemini to rewrite the query based on the failure diagnostic.

        Falls back to ``"<original_query> specification"`` when the API call
        fails or the model returns no usable text.
        """
        fallback_query = f"{original_query} specification"

        # Construct a prompt that treats the LLM as a control system component
        prompt = f"""
        You are a Query Refinement Engine for a RAG system. The dataset is about {self.domain_description}.
        The system failed to retrieve relevant documents for the user's query.

        --- DIAGNOSTICS ---
        Original Query: "{original_query}"
        Failure Reason: {fail_reason}
        Prior Failed Attempts: {history}

        --- INSTRUCTIONS ---
        Based on the failure reason, generate ONE single improved search query.

        1. If reason is "Max similarity too low":
           - The query is likely too specific or uses wrong terminology.
           - Strategy: Generalize terms, remove noisy adjectives, or use domain synonyms.

        2. If reason is "Insufficient entity overlap":
           - The query lacks specific nouns found in the corpus.
           - Strategy: Add specific technical nouns, identifiers, or standard terminology related to the topic.

        3. If reason is "Avg quality too low":
           - The query is ambiguous and fetching mixed results.
           - Strategy: Add context or constraints to narrow the scope.

        --- OUTPUT ---
        Output ONLY the rewritten query string. Do not output explanations or quotes.
        """

        try:
            # Call Gemini
            response = self.model.generate_content(prompt)
            rewritten_query = self._clean_rewrite(response.text or "")
        except Exception as e:
            print(f"‚ö†Ô∏è Gemini Error: {e}")
            # Fallback strategy if API fails: just append a generic term
            return fallback_query

        if not rewritten_query:
            # An empty rewrite would send an empty query to the retriever.
            return fallback_query

        print(f"ü§ñ [Gemini] Reason: '{fail_reason}' | Rewrite: '{rewritten_query}'")
        return rewritten_query

# --- Update the Loop Setup ---

# 1. Initialize Components
# Reuses the RetrievalJudge class and the `rag` pipeline created in earlier cells.
judge = RetrievalJudge(min_max_score=0.45, min_entity_overlap=1)

# 2. Initialize the Gemini-backed rewriter (used here in place of the simulated QueryRewriter)
gemini_rewriter = GeminiQueryRewriter()

# 3. Create the Loop
active_rag = ActiveRetrievalLoop(rag, judge, gemini_rewriter)

# 4. Run it
# Example: A query that might fail initially due to lack of specific terms
print("--- Starting Test Run ---")
final_result = active_rag.run("When are the charts prepared?")

In [None]:
# @title 3. Final Answer Generator (Grounded)

class AnswerGenerator:
    """Generates a final answer grounded in retrieved chunks via Gemini."""

    def __init__(self, model_name: str = "gemini-2.5-flash"):
        self.model = genai.GenerativeModel(model_name)

    @staticmethod
    def _resolve_source(res: Dict) -> str:
        """Return the best available source label for one retrieval result.

        Looks at ``source_file``, then ``metadata['source']``, then
        ``source``; a missing or ``None`` metadata entry is tolerated.
        """
        metadata = res.get('metadata') or {}
        return res.get('source_file') or metadata.get('source') or res.get('source') or 'Unknown Source'

    def _format_context(self, results: List[Dict]) -> str:
        """
        Formats retrieved chunks into a clean context block for the LLM.
        """
        blocks = []
        for i, res in enumerate(results):
            source = self._resolve_source(res)
            content = res.get('content', '')

            blocks.append(f"""
            --- CONTEXT BLOCK {i+1} ---
            Source: {source}
            Content:
            {content}
            ---------------------------
            """)
        # Joined once instead of growing a string inside the loop.
        return "".join(blocks)

    def generate_answer(self, query: str, results: List[Dict]) -> str:
        """
        Generates the final answer using strictly the provided context.

        Returns a fixed "not enough information" message when ``results`` is
        empty, and an ``"Error generating answer: ..."`` string if the model
        call raises.
        """
        if not results:
            return "I could not find enough information in the documents to answer your question."

        context_text = self._format_context(results)

        # --- SYSTEM PROMPT ---
        system_prompt = f"""
        You are a Technical Documentation Assistant.
        Answer the user's question using ONLY the context blocks provided below.

        --- STRICT RULES ---
        1. **Grounding:** Do not use outside knowledge. If the answer is not in the context, say "I cannot find this information in the provided documents."
        2. **Citation:** When you state a fact, mention the source file name (e.g., [Source: specs/v1.md]).
        3. **Tone:** Professional, concise, and technical.
        4. **Formatting:** Use Markdown lists or code blocks where appropriate.

        --- RETRIEVED CONTEXT ---
        {context_text}

        --- USER QUESTION ---
        {query}
        """

        try:
            response = self.model.generate_content(system_prompt)
            return response.text
        except Exception as e:
            return f"Error generating answer: {str(e)}"

# --- 4. Tying It All Together (The Full Pipeline) ---

# 1. Initialize Generator
answer_gen = AnswerGenerator()

# 2. Run the Active Loop (`active_rag` is created in an earlier cell)
# This handles the Query -> Judge -> Rewrite -> Retrieve cycle
user_query = "Quota for ticket booking"
loop_result = active_rag.run(user_query)

print(f"\n‚úÖ Loop Finished with status: {loop_result['status']}")

# 3. Generate Final Answer whenever the loop returned any results.
# NOTE: this also covers best-effort results from a run whose status is not
# "success"; only an empty result list skips answer generation.
if loop_result['results']:
    print("\nüìù Generating Final Answer...\n")
    final_answer = answer_gen.generate_answer(
        query=user_query,
        results=loop_result['results']
    )

    print("=" * 60)
    print(final_answer)
    print("=" * 60)
else:
    print("‚ùå Failed to retrieve valid documents. Cannot answer.")