In [None]:
from __future__ import annotations
import logging, os, re, sys, pickle, json, hashlib, uuid
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import Dict, Iterable, List, Optional, Tuple
import numpy as np
import faiss

try:
    from langchain_core.documents import Document
    from langchain_community.document_loaders import (
        PyMuPDFLoader,
        Docx2txtLoader,
        UnstructuredPowerPointLoader,
        TextLoader,
        UnstructuredMarkdownLoader,
    )
    from langchain_text_splitters import RecursiveCharacterTextSplitter
except Exception as e:
    print("This script requires langchain and compatible loaders.")
    raise

try:
    from sentence_transformers import SentenceTransformer
    _SBERT_OK = True
except Exception:
    _SBERT_OK = False

# [MODIFIED] Import for Reranker
try:
    from sentence_transformers import CrossEncoder
    _CROSS_ENCODER_OK = True
except Exception:
    _CROSS_ENCODER_OK = False


# --- Logging ---
LOG_DIR = Path("logs")
# The directory has to exist before the file handler below opens a file in it.
LOG_DIR.mkdir(parents=True, exist_ok=True)

# Root logging: INFO and above is appended to logs/ingest.log and echoed to stdout.
_LOG_FORMAT = "%(asctime)s [%(levelname)s] %(name)s: %(message)s"
_LOG_HANDLERS = [
    logging.FileHandler(LOG_DIR / "ingest.log", mode="a", encoding="utf-8"),
    logging.StreamHandler(sys.stdout),
]
logging.basicConfig(level=logging.INFO, format=_LOG_FORMAT, handlers=_LOG_HANDLERS)

# Logger used by the ingestion and retrieval code throughout this notebook.
logger = logging.getLogger("rag.ingest")

In [None]:
@dataclass
class IngestConfig:
    """Settings shared by document loading, hierarchical chunking and index building."""
    # Directory that DocumentProcessor.load_all walks recursively for input files.
    root_dir: Path
    # Label written to the "subject" metadata key of every loaded document.
    subject: str = "General"
    # One chunking level per entry; DocumentProcessor re-sorts these largest-first.
    chunk_sizes: Tuple[int, ...] = (2048, 512, 128)
    # Overlap used at each level is int(size * overlap_ratio), capped at max_overlap.
    overlap_ratio: float = 0.12
    max_overlap: int = 220
    # Embedding model names; RAGIndexer builds one store per (model, chunk size) pair.
    models: Tuple[str, ...] = ("all-mpnet-base-v2", "BAAI/bge-base-en-v1.5")
    # Cross-encoder model name read by the search cells; None disables reranking.
    reranker_model: Optional[str] = "BAAI/bge-reranker-base"


# Lower-cased file extension -> langchain loader class used to read that format.
# Files whose extension is not listed here are skipped by DocumentProcessor.load_all.
LOADER_MAP = {
    ".pdf": PyMuPDFLoader,
    ".docx": Docx2txtLoader,
    ".pptx": UnstructuredPowerPointLoader,
    ".txt": TextLoader,
    ".md": UnstructuredMarkdownLoader,
}

# English stopwords removed by simple_tokenize before Jaccard comparison.
# The literal is split on whitespace, so a repeated word is harmless in the set.
STOPWORDS = set("""
a an the and or but if then else when while at by for with about against between
into through during before after above below to from up down in out on off over under
again further then once here there all any both each few more most other some such
no nor not only own same so than too very s t can will just don should now is are
am was were be been being of this that these those it its as i you he she they we
me him her them my your their our
""".split())

# One or more whitespace characters (collapsed to a single space by normalize_whitespace).
_ws_re = re.compile(r"\s+")
# http(s):// URLs and bare www. hosts, matched case-insensitively.
_url_re = re.compile(r"https?://\S+|www\.\S+", re.IGNORECASE)
# A whole line made only of whitespace, '-', '*', digits, '.' and '|'
# (separator rules, bullets, bare numbers); remove_noninformative drops such lines.
_noninformative_re = re.compile(r"^[\s\-\*\d\.|]+$")

In [None]:

def normalize_whitespace(text: str) -> str:
    """Collapse every run of whitespace (newlines included) to one space and trim the ends."""
    collapsed = _ws_re.sub(" ", text)
    return collapsed.strip()

def simple_tokenize(text: str) -> List[str]:
    """Lower-case ``text``, drop URLs and punctuation, and return the non-stopword tokens.

    Only the characters a-z, 0-9 and whitespace survive; everything else is
    replaced by a space before splitting.
    """
    without_urls = _url_re.sub(" ", text.lower())
    alnum_only = re.sub(r"[^a-z0-9\s]", " ", without_urls)
    tokens: List[str] = []
    # str.split() with no argument never yields empty strings.
    for word in alnum_only.split():
        if word not in STOPWORDS:
            tokens.append(word)
    return tokens

def remove_noninformative(text: str) -> str:
    """Drop blank lines and lines that contain only separators, bullets or digits.

    Surviving lines are returned unmodified (not stripped), joined with "\\n".
    """
    def _is_informative(line: str) -> bool:
        stripped = line.strip()
        return bool(stripped) and _noninformative_re.match(stripped) is None

    return "\n".join(line for line in text.splitlines() if _is_informative(line))

def preprocess_text(raw: str) -> str:
    """Clean raw loader text: drop noise lines, lower-case, then collapse whitespace.

    The result is a single line, because the whitespace normalisation also
    turns every newline into a space.
    """
    informative = remove_noninformative(raw)
    lowered = informative.lower()
    return normalize_whitespace(lowered)


def jaccard(a: Iterable[str], b: Iterable[str]) -> float:
    """Return the Jaccard similarity |A ∩ B| / |A ∪ B| of two token collections.

    Two empty collections are defined as identical and score 1.0.
    """
    left, right = set(a), set(b)
    union = left | right
    if not union:
        # Both inputs were empty.
        return 1.0
    shared = left & right
    return len(shared) / len(union)

def dedup_documents(docs: List[Document], near_threshold: float = 0.92) -> List[Document]:
    """Remove exact and near-duplicate documents, keeping the first occurrence.

    A document is dropped when its ``page_content`` equals that of an already
    kept document, or when the Jaccard similarity between its token set (see
    ``simple_tokenize``) and the token set of any kept document is at least
    ``near_threshold``.

    Args:
        docs: Documents in priority order; earlier ones win over later duplicates.
        near_threshold: Jaccard similarity at or above which two documents are
            treated as near-duplicates.

    Returns:
        The surviving documents, in their original relative order.

    Notes:
        * Exact duplicates are detected by comparing the text itself, so a hash
          collision can never discard a distinct document.
        * Documents that tokenize to nothing (stopword-only or non-ASCII text)
          are only removed as exact duplicates. Treating two empty token sets as
          "identical" would collapse unrelated texts into one.
        * The comparison is still pairwise against every kept document, so it is
          quadratic in the number of survivors.
    """
    seen_texts = set()
    kept: List[Document] = []
    kept_token_sets = []  # one frozenset per kept document, built once

    def _is_near_duplicate(tokens, other) -> bool:
        if not other:
            return False
        # Jaccard can never exceed smaller/larger, so a size mismatch rules the
        # pair out without computing the intersection and union.
        smaller, larger = sorted((len(tokens), len(other)))
        if smaller / larger < near_threshold:
            return False
        return len(tokens & other) / len(tokens | other) >= near_threshold

    for doc in docs:
        txt = doc.page_content
        if txt in seen_texts:
            continue
        tokens = frozenset(simple_tokenize(txt))
        if tokens and any(_is_near_duplicate(tokens, other) for other in kept_token_sets):
            continue
        seen_texts.add(txt)
        kept.append(doc)
        kept_token_sets.append(tokens)
    return kept

In [None]:
class DocumentProcessor:
    """Loads files under ``cfg.root_dir`` and splits them into a chunk hierarchy.

    Every chunk carries ``chunk_id``, ``parent_chunk_id`` and ``parent_doc_id``
    metadata so that a chunk at one level can be linked to the chunk it was cut
    from at the next larger level.
    """

    # Separator preference handed to RecursiveCharacterTextSplitter at every level.
    _SEPARATORS = ("\n\n", "\n", ". ", " ", "")

    def __init__(self, cfg: IngestConfig):
        """Store the config and normalise ``chunk_sizes`` to largest-first order.

        NOTE: the sorted tuple is written back onto ``cfg`` itself, so the
        caller's config object is modified.
        """
        self.cfg = cfg
        self._docid_by_path: Dict[str, str] = {}
        self.cfg.chunk_sizes = tuple(sorted(cfg.chunk_sizes, reverse=True))

    @staticmethod
    def _stable_id(seed: str, prefix: str) -> str:
        """Return ``"<prefix>-<first 16 hex chars of sha1(seed)>"``."""
        h = hashlib.sha1(seed.encode("utf-8")).hexdigest()[:16]
        return f"{prefix}-{h}"

    def _loader_for(self, path: Path):
        """Return the loader class registered for ``path``'s extension.

        Raises:
            ValueError: if the extension is not a key of ``LOADER_MAP``.
        """
        ext = path.suffix.lower()
        if ext in LOADER_MAP:
            return LOADER_MAP[ext]
        raise ValueError(f"Unsupported file format: {ext}")

    def _augment_metadata(self, doc: Document, file_path: Path, seq_in_file: int):
        """Attach subject, source, ids and a timestamp to ``doc.metadata`` in place.

        ``doc_id`` is derived from the resolved file path, so every document
        loaded from the same file shares it; ``doc_seq`` is the document's
        position in the loader's output for that file.
        """
        file_key = str(file_path.resolve())
        if file_key not in self._docid_by_path:
            self._docid_by_path[file_key] = self._stable_id(file_key, "doc")
        doc_id = self._docid_by_path[file_key]

        doc.metadata.update({
            "subject": self.cfg.subject,
            "source": file_path.name,
            # First 200 characters of the text as it is when this method runs
            # (load_all calls it before preprocessing).
            "context": doc.page_content[:200],
            "timestamp": datetime.now().isoformat(),
            "file_path": file_key,
            "doc_id": doc_id,
            "doc_seq": seq_in_file,
        })

    def _preprocess_doc(self, doc: Document) -> Document:
        """Replace ``doc.page_content`` with its preprocessed form and return ``doc``."""
        doc.page_content = preprocess_text(doc.page_content)
        return doc

    def load_all(self) -> List[Document]:
        """Load, annotate, preprocess and de-duplicate every supported file.

        Directories and files are visited in sorted order, so ``doc_seq`` values
        and the choice of which duplicate survives do not depend on filesystem
        enumeration order. A file that fails to load is logged with its
        traceback and skipped.

        Returns:
            The de-duplicated documents.
        """
        all_docs: List[Document] = []
        for root, dirs, files in os.walk(self.cfg.root_dir):
            dirs.sort()  # in-place sort steers os.walk's descent order
            for fname in sorted(files):
                path = Path(root) / fname
                if path.suffix.lower() not in LOADER_MAP:
                    continue
                try:
                    loader = self._loader_for(path)(str(path))
                    docs = loader.load()
                    for i, d in enumerate(docs):
                        self._augment_metadata(d, path, i)
                        self._preprocess_doc(d)
                    all_docs.extend(docs)
                except Exception as e:
                    # Best effort: one unreadable file must not abort the ingest.
                    logger.exception(f"Error processing {path}: {e}")
        deduped = dedup_documents(all_docs)
        logger.info(f"Loaded {len(deduped)} unique documents (deduped from {len(all_docs)})")
        return deduped

    def _make_splitter(self, size: int) -> Tuple[RecursiveCharacterTextSplitter, int]:
        """Build the splitter for one level and return it with the overlap used."""
        overlap = min(int(size * self.cfg.overlap_ratio), self.cfg.max_overlap)
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=size,
            chunk_overlap=overlap,
            separators=list(self._SEPARATORS),
            add_start_index=True,
        )
        return splitter, overlap

    def _split_source_documents(self, documents: List[Document], size: int, splitter) -> List[Document]:
        """Cut the loaded documents into top-level chunks (``parent_chunk_id`` is None)."""
        chunks: List[Document] = []
        for d in documents:
            if "doc_id" in d.metadata:
                original_doc_id = d.metadata["doc_id"]
            else:
                # Only reached for documents that skipped _augment_metadata; without
                # a file_path the id is random and therefore not reproducible.
                original_doc_id = self._stable_id(
                    d.metadata.get("file_path", "") or uuid.uuid4().hex, "doc"
                )

            for i, ch in enumerate(splitter.split_documents([d])):
                start = ch.metadata.get("start_index", 0)
                seed = f"{original_doc_id}|{size}|{i}|{start}|{len(ch.page_content)}"
                ch.metadata.update({
                    "chunk_size": size,
                    "chunk_index": i,
                    "level": size,
                    "parent_doc_id": original_doc_id,
                    "chunk_id": self._stable_id(seed, "chunk"),
                    "parent_chunk_id": None,
                    # Carry over any source metadata the splitter did not copy.
                    **{k: v for k, v in d.metadata.items() if k not in ch.metadata},
                })
                chunks.append(ch)
        return chunks

    def _split_parent_chunks(self, parents: List[Document], size: int, splitter) -> List[Document]:
        """Cut each chunk of the previous level into child chunks of ``size``."""
        children: List[Document] = []
        for parent_chunk in parents:
            parent_chunk_id = parent_chunk.metadata["chunk_id"]
            original_doc_id = parent_chunk.metadata["parent_doc_id"]

            # Split a metadata-free copy so the splitter's own bookkeeping does not
            # mix with the parent's metadata.
            pieces = splitter.split_documents([Document(page_content=parent_chunk.page_content)])

            for i, ch in enumerate(pieces):
                # Offset reported by the splitter for this piece. Because only the
                # parent's text was split, it is presumably relative to the parent
                # chunk rather than to the source document.
                start = ch.metadata.get("start_index", 0)
                seed = f"{parent_chunk_id}|{size}|{i}|{start}|{len(ch.page_content)}"

                new_metadata = parent_chunk.metadata.copy()
                new_metadata.update({
                    "chunk_size": size,
                    "chunk_index": i,
                    "level": size,
                    "parent_doc_id": original_doc_id,
                    "chunk_id": self._stable_id(seed, "chunk"),
                    "parent_chunk_id": parent_chunk_id,
                    "start_index": start,
                })
                children.append(Document(page_content=ch.page_content, metadata=new_metadata))
        return children

    def hierarchical_chunks(self, documents: List[Document]) -> Dict[int, List[Document]]:
        """Split ``documents`` level by level, largest chunk size first.

        The first size is applied to the documents themselves; each following
        size is applied to the de-duplicated chunks of the previous level, so a
        chunk removed as a duplicate produces no children.

        Returns:
            Mapping of chunk size to that level's de-duplicated chunks, or an
            empty dict when no chunk sizes are configured.
        """
        results: Dict[int, List[Document]] = {}
        sizes = self.cfg.chunk_sizes
        if not sizes:
            return {}

        previous_level_chunks: List[Document] = []
        for depth, size in enumerate(sizes):
            splitter, overlap = self._make_splitter(size)
            if depth == 0:
                level_chunks = self._split_source_documents(documents, size, splitter)
            else:
                level_chunks = self._split_parent_chunks(previous_level_chunks, size, splitter)

            pruned = dedup_documents(level_chunks)
            logger.info(f"Chunk size={size}: {len(pruned)} unique chunks (overlap={overlap})")
            results[size] = pruned
            previous_level_chunks = pruned

        return results

In [None]:

class FaissVectorStore:
    """A flat L2 FAISS index plus the documents whose vectors it holds.

    ``documents[i]`` is the document for row ``i`` of the index; every method
    that adds data keeps the two in step.
    """

    def __init__(self, dimension: int):
        """Create an empty store for vectors of length ``dimension``."""
        self.dimension = int(dimension)
        self.index = faiss.IndexFlatL2(self.dimension)
        self.documents: List[Document] = []

    def add_documents(self, documents: List[Document], embeddings: np.ndarray):
        """Append ``documents`` and their ``embeddings`` (one row per document).

        Nothing is added unless the shapes are consistent, so the index rows and
        the document list cannot drift apart.

        Raises:
            ValueError: if the number of rows differs from the number of
                documents, or the vector length differs from ``self.dimension``.
        """
        vectors = np.asarray(embeddings, dtype=np.float32)
        if not documents and vectors.size == 0:
            return
        if vectors.ndim == 1:
            # A single vector passed without its batch axis.
            vectors = vectors.reshape(1, -1)
        if vectors.ndim != 2 or vectors.shape[0] != len(documents):
            raise ValueError(
                f"Expected {len(documents)} embedding rows, got array of shape {vectors.shape}"
            )
        if vectors.shape[1] != self.dimension:
            raise ValueError(
                f"Embedding dimension {vectors.shape[1]} does not match index dimension {self.dimension}"
            )
        self.index.add(np.ascontiguousarray(vectors))
        self.documents.extend(documents)

    def save(self, path: Path):
        """Write ``index.faiss``, ``docs.pkl`` and a readable ``manifest.json`` under ``path``."""
        path.mkdir(parents=True, exist_ok=True)
        faiss.write_index(self.index, str(path / "index.faiss"))
        with open(path / "docs.pkl", "wb") as f:
            pickle.dump(self.documents, f)

        manifest_keys = (
            "chunk_id", "parent_chunk_id", "parent_doc_id", "doc_id", "source",
            "chunk_size", "chunk_index", "file_path", "subject",
        )
        manifest = [{key: d.metadata.get(key) for key in manifest_keys} for d in self.documents]
        with open(path / "manifest.json", "w", encoding="utf-8") as f:
            json.dump(manifest, f, ensure_ascii=False, indent=2)

    @classmethod
    def load(cls, path: Path) -> "FaissVectorStore":
        """Rebuild a store from the files written by ``save``.

        SECURITY: ``docs.pkl`` is read with ``pickle.load``, which can execute
        arbitrary code. Only load stores that this notebook (or another trusted
        source) produced.
        """
        index = faiss.read_index(str(path / "index.faiss"))
        with open(path / "docs.pkl", "rb") as f:
            docs = pickle.load(f)
        obj = cls(index.d)
        obj.index, obj.documents = index, docs
        return obj

    def similarity_search(self, query_embedding: np.ndarray, k: int = 4):
        """Return the ``k`` nearest documents and their distances as reported by the index.

        Args:
            query_embedding: One query vector, with or without a leading batch
                axis; only the first row of a batch is used.
            k: Maximum number of hits. It is clamped to the index size, and a
                non-positive value yields no hits.

        Returns:
            ``(documents, distances)`` as two parallel lists, nearest first.
        """
        query = np.asarray(query_embedding, dtype=np.float32)
        if query.ndim == 1:
            query = query.reshape(1, -1)
        k = min(k, self.index.ntotal)
        if k <= 0:
            return [], []
        D, I = self.index.search(np.ascontiguousarray(query), k)
        # A negative label means "no result"; indexing with it would silently
        # return a document from the end of the list.
        hits = [(self.documents[i], float(d)) for i, d in zip(I[0], D[0]) if i >= 0]
        return [doc for doc, _ in hits], [dist for _, dist in hits]


class EmbeddingManager:
    """Holds one loaded SentenceTransformer per model name and encodes text with them."""

    def __init__(self, model_names: Iterable[str]):
        """Load every model in ``model_names``.

        Raises:
            RuntimeError: if sentence-transformers could not be imported.
        """
        if not _SBERT_OK:
            raise RuntimeError("sentence-transformers is required")
        loaded = {}
        for name in model_names:
            loaded[name] = SentenceTransformer(name)
        self.models = loaded

    def encode(self, model: str, texts: List[str]) -> np.ndarray:
        """Encode ``texts`` with the named model and return the result as an array."""
        encoder = self.models[model]
        raw = encoder.encode(texts, batch_size=32, show_progress_bar=False)
        return np.array(raw)

    def dim(self, model: str) -> int:
        """Return the embedding length of ``model``, found by encoding an empty string."""
        probe = self.encode(model, [""])
        return int(probe.shape[1])


class Reranker:
    """Optional cross-encoder re-ranking stage.

    ``self.model`` is ``None`` whenever re-ranking is unavailable (the
    ``CrossEncoder`` import failed or the model could not be loaded); ``rerank``
    then returns the input order, truncated.
    """

    def __init__(self, model_name: str, batch_size: int = 8):
        """Try to load ``model_name``; failures are logged instead of raised."""
        self.model_name = model_name
        # Set unconditionally so the attribute exists even when loading fails.
        self.batch_size = batch_size
        self.model = None

        if not _CROSS_ENCODER_OK:
            logger.error("CrossEncoder not found. Reranking will be disabled.")
            return

        try:
            self.model = CrossEncoder(model_name)
            logger.info(f"Loaded Reranker model: {model_name}")
        except Exception as e:
            logger.error(f"Failed to load Reranker model {model_name}. Reranking disabled. Error: {e}")
            self.model = None

    def rerank(self, query: str, documents: List[Document], top_k: int = 5) -> List[Document]:
        """Return the ``top_k`` documents with the highest cross-encoder score.

        Args:
            query: The search query paired with every document's text.
            documents: Candidates to score.
            top_k: Number of documents to return.

        Returns:
            Documents ordered by descending score; ties keep their input order.
            When no model is loaded or ``documents`` is empty, the first
            ``top_k`` documents are returned unscored.
        """
        # Compare with None explicitly: a model object may define __len__ or
        # __bool__, and its truthiness says nothing about whether it was loaded.
        if self.model is None or not documents:
            return documents[:top_k]

        pairs = [(query, doc.page_content) for doc in documents]
        scores = self.model.predict(pairs, batch_size=self.batch_size, show_progress_bar=False)

        ranked = sorted(zip(scores, documents), key=lambda pair: pair[0], reverse=True)
        return [doc for _, doc in ranked[:top_k]]


class RAGIndexer:
    """Embeds chunk collections and keeps one FAISS store per (model, chunk size)."""

    def __init__(self, cfg: IngestConfig):
        """Load the embedding models named in ``cfg.models``; no index exists yet."""
        self.cfg = cfg
        self.embed_mgr = EmbeddingManager(cfg.models)
        self.vectorstores: Dict[Tuple[str, int], FaissVectorStore] = {}

    def build_indexes(self, chunks_by_size: Dict[int, List[Document]], save_dir: Path):
        """Embed every non-empty chunk level with every model, then save and register it.

        Each store is written to ``save_dir / "<model>_<size>"``, where
        ``<model>`` is the model name with every character other than letters,
        digits, ``_`` and ``-`` removed, and is also kept in
        ``self.vectorstores[(model_name, size)]``.
        """
        save_dir.mkdir(parents=True, exist_ok=True)
        for model_name in self.cfg.models:
            vector_len = self.embed_mgr.dim(model_name)
            # The directory prefix depends only on the model, not on the level.
            safe_model_name = re.sub(r'[^a-zA-Z0-9_-]', '', model_name)

            for size, level_docs in chunks_by_size.items():
                if not level_docs:
                    continue
                logger.info(f"Embedding {len(level_docs)} chunks (size={size}) with {model_name}")
                texts = [d.page_content for d in level_docs]
                vectors = self.embed_mgr.encode(model_name, texts)

                store = FaissVectorStore(dimension=vector_len)
                store.add_documents(level_docs, vectors)

                out_path = save_dir / f"{safe_model_name}_{size}"
                store.save(out_path)
                self.vectorstores[(model_name, size)] = store
                logger.info(f"Saved FAISS index -> {out_path}")

In [None]:
def _children_of(store, parent_ids) -> List[Document]:
    """Return the documents in ``store`` whose ``parent_chunk_id`` is in ``parent_ids``."""
    return [d for d in store.documents if d.metadata.get("parent_chunk_id") in parent_ids]


def _rank_by_vector_distance(
    candidates: List[Document],
    query_vec: np.ndarray,
    embed_mgr: EmbeddingManager,
    model_name: str,
    k: int,
) -> List[Document]:
    """Return up to ``k`` candidates ordered by squared L2 distance to ``query_vec``.

    The candidates are embedded with ``model_name`` and compared directly; the
    candidate set is small, so no temporary index is built.
    """
    vectors = embed_mgr.encode(model_name, [d.page_content for d in candidates]).astype(np.float32)
    deltas = vectors - query_vec.reshape(1, -1).astype(np.float32)
    distances = np.einsum("ij,ij->i", deltas, deltas)
    keep = max(0, min(k, len(candidates)))
    order = np.argsort(distances, kind="stable")[:keep]
    return [candidates[i] for i in order]


def hierarchical_search(
    query: str,
    indexer: RAGIndexer,
    embed_mgr: EmbeddingManager,
    model_name: str = "all-mpnet-base-v2",
    reranker: Optional[Reranker] = None,
    top_k_parent: int = 3,
    top_k_intermediate: int = 5,
    top_k_final: int = 5,
) -> List[Document]:
    """Top-down retrieval over the three largest configured chunk levels.

    1. Search the coarsest index and keep ``top_k_parent`` chunks.
    2. Collect the next level's chunks that are children of those hits and keep
       the ``top_k_intermediate`` closest to the query by vector distance.
    3. Collect the third level's chunks that are children of step 2's hits and
       keep ``top_k_final`` of them, ranked by the cross-encoder when a
       reranker with a loaded model is supplied and by vector distance otherwise.

    With fewer than three levels, or when a level yields no candidates (or the
    third-level index is missing), the hits of the last completed step are
    returned. Levels beyond the third are not used.

    Args:
        query: Natural-language query text.
        indexer: Provides ``cfg.chunk_sizes`` and the built ``vectorstores``.
        embed_mgr: Encoder used for the query and for candidate re-ranking.
        model_name: Embedding model, and the key into ``indexer.vectorstores``.
        reranker: Optional cross-encoder stage for the final level.
        top_k_parent: Hits kept at the coarsest level.
        top_k_intermediate: Hits kept at the second level.
        top_k_final: Hits returned from the third level.

    Raises:
        ValueError: if no chunk sizes are configured, or the index for the
            first or second level has not been built for ``model_name``.
    """
    sizes = sorted(indexer.cfg.chunk_sizes, reverse=True)
    if not sizes:
        raise ValueError("No chunk sizes configured; nothing to search.")

    q_emb = embed_mgr.encode(model_name, [query])[0]

    # --- Level 1: coarse search against the stored index ---
    coarse_size = sizes[0]
    coarse_vs = indexer.vectorstores.get((model_name, coarse_size))
    if coarse_vs is None:
        raise ValueError(f"No {coarse_size}-level index found.")

    coarse_hits, _ = coarse_vs.similarity_search(q_emb, k=top_k_parent)
    parent_chunk_ids = {h.metadata["chunk_id"] for h in coarse_hits}

    if len(sizes) < 2:
        return coarse_hits

    # --- Level 2: children of the coarse hits, ranked by vector distance ---
    medium_size = sizes[1]
    medium_vs = indexer.vectorstores.get((model_name, medium_size))
    if medium_vs is None:
        raise ValueError(f"No {medium_size}-level index found.")

    medium_candidates = _children_of(medium_vs, parent_chunk_ids)
    if not medium_candidates:
        logger.warning(f"No medium-level ({medium_size}) candidates found. Returning coarse hits.")
        return coarse_hits

    medium_hits = _rank_by_vector_distance(
        medium_candidates, q_emb, embed_mgr, model_name, top_k_intermediate
    )
    child_chunk_ids = {h.metadata["chunk_id"] for h in medium_hits}

    if len(sizes) < 3:
        return medium_hits

    # --- Level 3: children of the medium hits, cross-encoder or vector ranked ---
    fine_size = sizes[2]
    fine_vs = indexer.vectorstores.get((model_name, fine_size))
    if fine_vs is None:
        logger.info(f"No fine-level ({fine_size}) index found. Returning medium hits.")
        return medium_hits

    fine_candidates = _children_of(fine_vs, child_chunk_ids)
    if not fine_candidates:
        logger.warning(f"No fine-level ({fine_size}) candidates found. Returning medium hits.")
        return medium_hits

    # Explicit None checks: object truthiness is not a reliable "is loaded" signal.
    if reranker is not None and reranker.model is not None:
        logger.info(f"Reranking {len(fine_candidates)} fine candidates with {reranker.model_name}...")
        return reranker.rerank(query, fine_candidates, top_k=top_k_final)

    if reranker is not None:
        logger.warning("Reranker provided but model not loaded. Falling back to vector re-rank.")
    else:
        logger.info(f"No reranker. Using vector similarity for final re-rank.")

    return _rank_by_vector_distance(fine_candidates, q_emb, embed_mgr, model_name, top_k_final)

In [None]:
# Pipeline configuration for the run with cross-encoder reranking enabled.
cfg = IngestConfig(
    # Corpus location. Set the RAG_DOCS_DIR environment variable to run the
    # notebook on another machine; the author's local folder is the fallback.
    root_dir=Path(os.environ.get(
        "RAG_DOCS_DIR",
        r"C:\Users\vsai2\Documents\LAMA\Major Project\Vals\Docs",
    )),
    subject="Geography - Natural Resources",
    models=("all-mpnet-base-v2", "BAAI/bge-base-en-v1.5"),
    chunk_sizes=(2048, 512, 128),
    reranker_model="BAAI/bge-reranker-base",
)

In [None]:
# Load every supported file under cfg.root_dir, preprocessed and de-duplicated.
docproc = DocumentProcessor(cfg)
docs = docproc.load_all()

# Split into one chunk list per configured size, keyed by chunk size.
chunks_by_size = docproc.hierarchical_chunks(docs)

# Embed each level with each model in cfg.models and write the stores to disk.
# `indexer` also keeps the built stores in memory; the search cells below read them.
indexer = RAGIndexer(cfg)
save_dir = Path("faiss_store_nested") 
indexer.build_indexes(chunks_by_size, save_dir)
logger.info("✅ Vector stores built and saved.")

2025-10-25 20:59:31,209 [INFO] rag.ingest: Loaded 440 unique documents (deduped from 440)
2025-10-25 20:59:34,172 [INFO] rag.ingest: Chunk size=2048: 730 unique chunks (overlap=220)
2025-10-25 20:59:48,188 [INFO] rag.ingest: Chunk size=512: 2476 unique chunks (overlap=61)
2025-10-25 21:02:11,705 [INFO] rag.ingest: Chunk size=128: 9752 unique chunks (overlap=15)
2025-10-25 21:02:11,708 [INFO] sentence_transformers.SentenceTransformer: Use pytorch device_name: cpu
2025-10-25 21:02:11,708 [INFO] sentence_transformers.SentenceTransformer: Load pretrained SentenceTransformer: all-mpnet-base-v2
2025-10-25 21:02:15,422 [INFO] sentence_transformers.SentenceTransformer: Use pytorch device_name: cpu
2025-10-25 21:02:15,422 [INFO] sentence_transformers.SentenceTransformer: Load pretrained SentenceTransformer: BAAI/bge-base-en-v1.5
2025-10-25 21:02:19,385 [INFO] rag.ingest: Embedding 730 chunks (size=2048) with all-mpnet-base-v2
2025-10-25 21:03:16,018 [INFO] rag.ingest: Saved FAISS index -> faiss

In [None]:
model_name = cfg.models[0]
# Reuse the encoder the indexer already loaded instead of loading the same
# model from disk again; load it only when the indexer does not hold it.
embed_mgr = (
    indexer.embed_mgr
    if model_name in indexer.embed_mgr.models
    else EmbeddingManager([model_name])
)
query = "Impact of mining on the environment"

# Build the optional cross-encoder stage; stay on None when it is unavailable.
reranker = None
if cfg.reranker_model:
    if not _CROSS_ENCODER_OK:
        logger.warning("CrossEncoder not found. Reranking will be disabled.")
    else:
        reranker = Reranker(cfg.reranker_model)
        if reranker.model is None:
            reranker = None

try:
    hits = hierarchical_search(
        query=query,
        indexer=indexer,
        embed_mgr=embed_mgr,
        model_name=model_name,
        reranker=reranker,
        top_k_parent=3,
        top_k_intermediate=5,
        top_k_final=5,
    )

    print("\n--- TOP-DOWN HIERARCHICAL SEARCH (2048->512->128) ---")
    if reranker:
        print(f"--- (Reranked final 128-level with {cfg.reranker_model}) ---")
    else:
        print("--- (No Reranker Used - Fallback to Vector Search) ---")

    for i, d in enumerate(hits):
        print(f"[{i+1}] {d.metadata.get('source', '?')} | chunk_size={d.metadata.get('chunk_size')}")
        print(f"  chunk_id={d.metadata.get('chunk_id')}")
        print(f"  parent_chunk={d.metadata.get('parent_chunk_id')}")
        print(f"  parent_doc={d.metadata.get('parent_doc_id')}")
        print("  " + d.page_content[:250].replace("\n", " "), "...\n")

except ValueError as e:
    # Expected failure mode: a required index level has not been built.
    logger.error(f"Search failed: {e}")
except Exception as e:
    # Keep the notebook running, but record the traceback for diagnosis.
    logger.exception(f"An unexpected error occurred during search: {e}")

2025-10-25 21:11:43,797 [INFO] sentence_transformers.SentenceTransformer: Use pytorch device_name: cpu
2025-10-25 21:11:43,798 [INFO] sentence_transformers.SentenceTransformer: Load pretrained SentenceTransformer: all-mpnet-base-v2
2025-10-25 21:11:49,260 [INFO] sentence_transformers.cross_encoder.CrossEncoder: Use pytorch device: cpu
2025-10-25 21:11:49,514 [INFO] rag.ingest: Loaded Reranker model: BAAI/bge-reranker-base
2025-10-25 21:11:49,903 [INFO] rag.ingest: Reranking 23 fine candidates with BAAI/bge-reranker-base...

--- TOP-DOWN HIERARCHICAL SEARCH (2048->512->128) ---
--- (Reranked final 128-level with BAAI/bge-reranker-base) ---
[1] Geography of Natural Resource and Management course code GeEs1019.pdf | chunk_size=128
  chunk_id=chunk-cbc9641e7d6a10cd
  parent_chunk=chunk-068d2c0b45e01b6d
  parent_doc=doc-a38890677bc460cd
  . physical changes and pollution of land, soil, water and air associated with mining operations, directly or indirectly affects ...

[2] Geography of Natu

In [None]:
model_name = cfg.models[1]
# Reuse the encoder the indexer already loaded instead of loading the same
# model from disk again; load it only when the indexer does not hold it.
embed_mgr = (
    indexer.embed_mgr
    if model_name in indexer.embed_mgr.models
    else EmbeddingManager([model_name])
)
query = "Impact of mining on the environment"

# Build the optional cross-encoder stage; stay on None when it is unavailable.
reranker = None
if cfg.reranker_model:
    if not _CROSS_ENCODER_OK:
        logger.warning("CrossEncoder not found. Reranking will be disabled.")
    else:
        reranker = Reranker(cfg.reranker_model)
        if reranker.model is None:
            reranker = None

try:
    hits = hierarchical_search(
        query=query,
        indexer=indexer,
        embed_mgr=embed_mgr,
        model_name=model_name,
        reranker=reranker,
        top_k_parent=3,
        top_k_intermediate=5,
        top_k_final=5,
    )

    print("\n--- TOP-DOWN HIERARCHICAL SEARCH (2048->512->128) ---")
    if reranker:
        print(f"--- (Reranked final 128-level with {cfg.reranker_model}) ---")
    else:
        print("--- (No Reranker Used - Fallback to Vector Search) ---")

    for i, d in enumerate(hits):
        print(f"[{i+1}] {d.metadata.get('source', '?')} | chunk_size={d.metadata.get('chunk_size')}")
        print(f"  chunk_id={d.metadata.get('chunk_id')}")
        print(f"  parent_chunk={d.metadata.get('parent_chunk_id')}")
        print(f"  parent_doc={d.metadata.get('parent_doc_id')}")
        print("  " + d.page_content[:250].replace("\n", " "), "...\n")

except ValueError as e:
    # Expected failure mode: a required index level has not been built.
    logger.error(f"Search failed: {e}")
except Exception as e:
    # Keep the notebook running, but record the traceback for diagnosis.
    logger.exception(f"An unexpected error occurred during search: {e}")

2025-10-25 21:12:05,694 [INFO] sentence_transformers.SentenceTransformer: Use pytorch device_name: cpu
2025-10-25 21:12:05,695 [INFO] sentence_transformers.SentenceTransformer: Load pretrained SentenceTransformer: BAAI/bge-base-en-v1.5
2025-10-25 21:12:11,077 [INFO] sentence_transformers.cross_encoder.CrossEncoder: Use pytorch device: cpu
2025-10-25 21:12:11,331 [INFO] rag.ingest: Loaded Reranker model: BAAI/bge-reranker-base
2025-10-25 21:12:11,725 [INFO] rag.ingest: Reranking 20 fine candidates with BAAI/bge-reranker-base...

--- TOP-DOWN HIERARCHICAL SEARCH (2048->512->128) ---
--- (Reranked final 128-level with BAAI/bge-reranker-base) ---
[1] 1587401677_BA(H)-Psc-Eco-Eng-BA(P)-II-Natural_Resource.pdf | chunk_size=128
  chunk_id=chunk-63666a816fd7af7f
  parent_chunk=chunk-8aabd4e5f06bcf4e
  parent_doc=doc-7b887870a0f03587
  . mining major effects of mining operations on forest and tribal people are: • mining from shallow deposits is done by surface ...

[2] Geography of Natural Reso

In [None]:
# Same pipeline configuration with reranking switched off (reranker_model=None),
# so the search cells that follow rank the final level by vector distance.
cfg = IngestConfig(
    # Corpus location. Set the RAG_DOCS_DIR environment variable to run the
    # notebook on another machine; the author's local folder is the fallback.
    root_dir=Path(os.environ.get(
        "RAG_DOCS_DIR",
        r"C:\Users\vsai2\Documents\LAMA\Major Project\Vals\Docs",
    )),
    subject="Geography - Natural Resources",
    models=("all-mpnet-base-v2", "BAAI/bge-base-en-v1.5"),
    chunk_sizes=(2048, 512, 128),
    reranker_model=None,
)

In [None]:
model_name = cfg.models[0]
# Reuse the encoder the indexer already loaded instead of loading the same
# model from disk again; load it only when the indexer does not hold it.
embed_mgr = (
    indexer.embed_mgr
    if model_name in indexer.embed_mgr.models
    else EmbeddingManager([model_name])
)
query = "Impact of mining on the environment"

# Build the optional cross-encoder stage; stay on None when it is unavailable
# or when cfg.reranker_model is unset.
reranker = None
if cfg.reranker_model:
    if not _CROSS_ENCODER_OK:
        logger.warning("CrossEncoder not found. Reranking will be disabled.")
    else:
        reranker = Reranker(cfg.reranker_model)
        if reranker.model is None:
            reranker = None

try:
    hits = hierarchical_search(
        query=query,
        indexer=indexer,
        embed_mgr=embed_mgr,
        model_name=model_name,
        reranker=reranker,
        top_k_parent=3,
        top_k_intermediate=5,
        top_k_final=5,
    )

    print("\n--- TOP-DOWN HIERARCHICAL SEARCH (2048->512->128) ---")
    if reranker:
        print(f"--- (Reranked final 128-level with {cfg.reranker_model}) ---")
    else:
        print("--- (No Reranker Used - Fallback to Vector Search) ---")

    for i, d in enumerate(hits):
        print(f"[{i+1}] {d.metadata.get('source', '?')} | chunk_size={d.metadata.get('chunk_size')}")
        print(f"  chunk_id={d.metadata.get('chunk_id')}")
        print(f"  parent_chunk={d.metadata.get('parent_chunk_id')}")
        print(f"  parent_doc={d.metadata.get('parent_doc_id')}")
        print("  " + d.page_content[:250].replace("\n", " "), "...\n")

except ValueError as e:
    # Expected failure mode: a required index level has not been built.
    logger.error(f"Search failed: {e}")
except Exception as e:
    # Keep the notebook running, but record the traceback for diagnosis.
    logger.exception(f"An unexpected error occurred during search: {e}")

2025-10-25 21:12:22,736 [INFO] sentence_transformers.SentenceTransformer: Use pytorch device_name: cpu
2025-10-25 21:12:22,737 [INFO] sentence_transformers.SentenceTransformer: Load pretrained SentenceTransformer: all-mpnet-base-v2
2025-10-25 21:12:27,375 [INFO] rag.ingest: No reranker. Using vector similarity for final re-rank.

--- TOP-DOWN HIERARCHICAL SEARCH (2048->512->128) ---
--- (No Reranker Used - Fallback to Vector Search) ---
[1] Geography of Natural Resource and Management course code GeEs1019.pdf | chunk_size=128
  chunk_id=chunk-cbc9641e7d6a10cd
  parent_chunk=chunk-068d2c0b45e01b6d
  parent_doc=doc-a38890677bc460cd
  . physical changes and pollution of land, soil, water and air associated with mining operations, directly or indirectly affects ...

[2] Geography of Natural Resource and Management course code GeEs1019.pdf | chunk_size=128
  chunk_id=chunk-9a6d12bf11b9688b
  parent_chunk=chunk-07f74009d1e0d31c
  parent_doc=doc-a38890677bc460cd
  . d. social impacts associat

In [None]:
model_name = cfg.models[1]
# Reuse the encoder the indexer already loaded instead of loading the same
# model from disk again; load it only when the indexer does not hold it.
embed_mgr = (
    indexer.embed_mgr
    if model_name in indexer.embed_mgr.models
    else EmbeddingManager([model_name])
)
query = "Impact of mining on the environment"

# Build the optional cross-encoder stage; stay on None when it is unavailable
# or when cfg.reranker_model is unset.
reranker = None
if cfg.reranker_model:
    if not _CROSS_ENCODER_OK:
        logger.warning("CrossEncoder not found. Reranking will be disabled.")
    else:
        reranker = Reranker(cfg.reranker_model)
        if reranker.model is None:
            reranker = None

try:
    hits = hierarchical_search(
        query=query,
        indexer=indexer,
        embed_mgr=embed_mgr,
        model_name=model_name,
        reranker=reranker,
        top_k_parent=3,
        top_k_intermediate=5,
        top_k_final=5,
    )

    print("\n--- TOP-DOWN HIERARCHICAL SEARCH (2048->512->128) ---")
    if reranker:
        print(f"--- (Reranked final 128-level with {cfg.reranker_model}) ---")
    else:
        print("--- (No Reranker Used - Fallback to Vector Search) ---")

    for i, d in enumerate(hits):
        print(f"[{i+1}] {d.metadata.get('source', '?')} | chunk_size={d.metadata.get('chunk_size')}")
        print(f"  chunk_id={d.metadata.get('chunk_id')}")
        print(f"  parent_chunk={d.metadata.get('parent_chunk_id')}")
        print(f"  parent_doc={d.metadata.get('parent_doc_id')}")
        print("  " + d.page_content[:250].replace("\n", " "), "...\n")

except ValueError as e:
    # Expected failure mode: a required index level has not been built.
    logger.error(f"Search failed: {e}")
except Exception as e:
    # Keep the notebook running, but record the traceback for diagnosis.
    logger.exception(f"An unexpected error occurred during search: {e}")

2025-10-25 21:12:38,029 [INFO] sentence_transformers.SentenceTransformer: Use pytorch device_name: cpu
2025-10-25 21:12:38,030 [INFO] sentence_transformers.SentenceTransformer: Load pretrained SentenceTransformer: BAAI/bge-base-en-v1.5
2025-10-25 21:12:42,294 [INFO] rag.ingest: No reranker. Using vector similarity for final re-rank.

--- TOP-DOWN HIERARCHICAL SEARCH (2048->512->128) ---
--- (No Reranker Used - Fallback to Vector Search) ---
[1] 1587401677_BA(H)-Psc-Eco-Eng-BA(P)-II-Natural_Resource.pdf | chunk_size=128
  chunk_id=chunk-63666a816fd7af7f
  parent_chunk=chunk-8aabd4e5f06bcf4e
  parent_doc=doc-7b887870a0f03587
  . mining major effects of mining operations on forest and tribal people are: • mining from shallow deposits is done by surface ...

[2] Geography of Natural Resource and Management course code GeEs1019.pdf | chunk_size=128
  chunk_id=chunk-df1407a7c097d0d0
  parent_chunk=chunk-f9b6e5a220e8e44c
  parent_doc=doc-a38890677bc460cd
  . for example mining activity causes