# Data Ingestion

In [1]:
import re
import os
import numpy as np
import chromadb
from pathlib import Path
from typing import List, Dict, Any, Tuple

from langchain_community.document_loaders import TextLoader
from langchain_core.documents import Document
from sentence_transformers import SentenceTransformer

from sklearn.metrics.pairwise import cosine_similarity

import warnings 
warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
loader=TextLoader("../data/whole_bible.txt",encoding="utf-8")
document=loader.load()
print(document[0].page_content[:250])

Book 01 Genesis
001:001 In the beginning God{After "God," the Hebrew has the two letters
        "Aleph Tav" (the first and last letters of the Hebrew alphabet)
        as a grammatical marker.} created the heavens and the earth.
001:002 Now the eart


### loading text file and adding metadata to docs

In [3]:
file_path = "../data/whole_bible.txt"

In [4]:
documents = []
current_book = None
current_verse_text = ""
current_chapter = None
current_verse = None

with open(file_path, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue

        # Detect book line
        if line.startswith("Book"):
            current_book = line.split(" ", 2)[2]  # 'Genesis'
            continue

        # Detect cghapter and verse line
        match = re.match(r"(\d{3}):(\d{3})\s+(.*)", line)
        if match:
            # Save previous verse if exists
            if current_verse_text:
                doc = Document(
                    page_content=current_verse_text,
                    metadata={
                        "book": current_book,
                        "chapter": current_chapter,
                        "verse": current_verse,
                        "source": f"{current_book}_{current_chapter}_{current_verse}"
                    }
                )
                documents.append(doc)
                current_verse_text = ""

            # New verse
            current_chapter = int(match.group(1))
            current_verse = int(match.group(2))
            current_verse_text = match.group(3)
        else:
            # Continuation of previous verse (multi-line)
            current_verse_text += " " + line

# Add last verse
if current_verse_text:
    doc = Document(
        page_content=current_verse_text,
        metadata={
            "book": current_book,
            "chapter": current_chapter,
            "verse": current_verse,
            "source": f"{current_book}_{current_chapter}_{current_verse}"
        }
    )
    documents.append(doc)
print(f"{len(documents)} verses loaded as Document objects with metadata.")

31102 verses loaded as Document objects with metadata.


In [5]:
for doc in documents[1000::10000]:
    print(doc)

page_content='Hamor and Shechem, his son, came to the gate of their city, and talked with the men of their city, saying,' metadata={'book': 'Genesis', 'chapter': 34, 'verse': 20, 'source': 'Genesis_34_20'}
page_content='The sons of Eliezer were:  Rehabiah the chief; and Eliezer had no other sons; but the sons of Rehabiah were very many.' metadata={'book': '1 Chronicles', 'chapter': 23, 'verse': 17, 'source': '1 Chronicles_23_17'}
page_content='Son of man, tell her, You are a land that is not cleansed, nor rained on in the day of indignation.' metadata={'book': 'Ezekiel', 'chapter': 22, 'verse': 24, 'source': 'Ezekiel_22_24'}
page_content='However much she glorified herself, and grew wanton, so much give her of torment and mourning.  For she says in her heart, 'I sit a queen, and am no widow, and will in no way see mourning.'' metadata={'book': 'Revelation', 'chapter': 18, 'verse': 7, 'source': 'Revelation_18_7'}


## Embedding and Vectorstore

In [6]:
class EmbeddingManager:
    """Handles document embedding generation using SentenceTransformer"""
    
    def __init__(self, model_name: str = "all-mpnet-base-v2"):
        """
        Initialize the embedding manager
        
        Args:
            model_name: HuggingFace model name for sentence embeddings
        """
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        """Load the SentenceTransformer model"""
        try:
            print(f"Loading embedding model: {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model loaded successfully. Embedding dimension: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error loading model {self.model_name}: {e}")
            raise

    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        """
        Generate embeddings for a list of texts
        
        Args:
            texts: List of text strings to embed
            
        Returns:
            numpy array of embeddings with shape (len(texts), embedding_dim)
        """
        if not self.model:
            raise ValueError("Model not loaded")
        
        print(f"Generating embeddings for {len(texts)} texts...")
        embeddings = self.model.encode(texts)
        print(f"Generated embeddings with shape: {embeddings.shape}")
        return embeddings

## Handling duplicated bible verses

In [7]:
def deduplicate_documents(documents: List[Document]) -> List[Document]:
    seen = {}
    unique_docs = []
    
    for doc in documents:
        verse_id = doc.metadata['source']
        if verse_id not in seen:
            seen[verse_id] = True
            unique_docs.append(doc)
    
    print(f"Removed {len(documents) - len(unique_docs)} duplicates")
    return unique_docs

documents = deduplicate_documents(documents)

Removed 16 duplicates


In [8]:
embedding_manager=EmbeddingManager()
texts = [doc.page_content for doc in documents]
embeddings = embedding_manager.generate_embeddings(texts)

Loading embedding model: all-mpnet-base-v2
Model loaded successfully. Embedding dimension: 768
Generating embeddings for 31086 texts...
Generated embeddings with shape: (31086, 768)


### Create VectorStore

In [10]:
class VectorStore:
    """
    Manages Bible verse embeddings in a ChromaDB vector store
    """
    def __init__(
        self,
        collection_name: str = "bible_web_nt",
        persist_directory: str = "../data/vector_store"
    ):
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store()
    
    def _initialize_store(self):
        """Initialize ChromaDB client and collection"""
        os.makedirs(self.persist_directory, exist_ok=True)
        self.client = chromadb.PersistentClient(
            path=self.persist_directory
        )
        self.collection = self.client.get_or_create_collection(
            name=self.collection_name,
            metadata={
                "description": "World English Bible verses for Bible study chatbot",
                "translation": "WEB",
                "granularity": "verse"
            }
        )
        print(f"Vector store ready: {self.collection_name}")
        print(f"Existing verses: {self.collection.count()}")
    
    def _make_id(self, doc: Document) -> str:
        """
        Create a deterministic ID so verses don't duplicate on re-runs
        Example: Genesis_1_1
        """
        meta = doc.metadata
        return f"{meta['book']}_{meta['chapter']}_{meta['verse']}"
    
    def add_documents(
        self,
        documents: List[Document],
        embeddings: np.ndarray,
        batch_size: int = 5000
    ):
        """
        Add documents to the vector store in batches
        
        Args:
            documents: List of Document objects
            embeddings: Numpy array of embeddings
            batch_size: Number of documents to process at once (default: 5000)
        """
        if len(documents) != len(embeddings):
            raise ValueError("Documents and embeddings count must match")
        
        total_docs = len(documents)
        print(f"Processing {total_docs} documents in batches of {batch_size}...")
        
        # Process in batches
        for i in range(0, total_docs, batch_size):
            batch_end = min(i + batch_size, total_docs)
            batch_docs = documents[i:batch_end]
            batch_embeddings = embeddings[i:batch_end]
            
            print(f"\nBatch {i//batch_size + 1}: Processing verses {i+1} to {batch_end}")
            
            ids = []
            metadatas = []
            texts = []
            vectors = []
            
            for doc, embedding in zip(batch_docs, batch_embeddings):
                verse_id = self._make_id(doc)
                ids.append(verse_id)
                texts.append(doc.page_content)
                vectors.append(embedding.tolist())
                metadatas.append({
                    "book": doc.metadata.get("book"),
                    "chapter": doc.metadata.get("chapter"),
                    "verse": doc.metadata.get("verse"),
                    "source": doc.metadata.get("source", verse_id),
                    "translation": "WEB",
                    "content_length": len(doc.page_content)
                })
            
            try:
                self.collection.upsert(
                    ids=ids,
                    documents=texts,
                    embeddings=vectors,
                    metadatas=metadatas
                )
                print(f"Added {len(ids)} verses to store")
                
            except Exception as e:
                print(f"Error adding batch {i//batch_size + 1}")
                raise e
        
        final_count = self.collection.count()
        print(f"\n{'='*50}")
        print(f"Complete! Total verses in store: {final_count}")
        print(f"{'='*50}")
    
    def search(
        self,
        query_embedding: np.ndarray,
        n_results: int = 5,
        filter_dict: dict = None
    ):
        """
        Search for similar verses
        
        Args:
            query_embedding: Embedding vector for the query
            n_results: Number of results to return
            filter_dict: Optional metadata filter (e.g., {"book": "Genesis"})
        
        Returns:
            Dictionary with ids, documents, metadatas, and distances
        """
        results = self.collection.query(
            query_embeddings=[query_embedding.tolist()],
            n_results=n_results,
            where=filter_dict
        )
        return results

In [11]:
vectorstore=VectorStore()
vectorstore.add_documents(documents, embeddings)

Vector store ready: bible_web_nt
Existing verses: 0
Processing 31086 documents in batches of 5000...

Batch 1: Processing verses 1 to 5000
Added 5000 verses to store

Batch 2: Processing verses 5001 to 10000
Added 5000 verses to store

Batch 3: Processing verses 10001 to 15000
Added 5000 verses to store

Batch 4: Processing verses 15001 to 20000
Added 5000 verses to store

Batch 5: Processing verses 20001 to 25000
Added 5000 verses to store

Batch 6: Processing verses 25001 to 30000
Added 5000 verses to store

Batch 7: Processing verses 30001 to 31086
Added 1086 verses to store

Complete! Total verses in store: 31086


# Retriever pipeline from Vector Store

In [18]:
class RAGRetriever:
    """Handles query-based retrieval from the vector store"""
    
    def __init__(self, vector_store, embedding_manager):
        self.vector_store = vector_store
        self.embedding_manager = embedding_manager
    
    def retrieve(self, query: str, top_k: int = 5, score_threshold: float = 0.0) -> List[Dict[str, Any]]:
        """Semantic search for topical queries"""
        print(f"ðŸ“– Semantic search for: '{query}'")
        
        # Generate query embedding
        query_embedding = self.embedding_manager.generate_embeddings([query])[0]
        
        try:
            results = self.vector_store.collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=top_k
            )
            
            retrieved_docs = []
            if results['documents'] and results['documents'][0]:
                documents = results['documents'][0]
                metadatas = results['metadatas'][0]
                distances = results['distances'][0]
                ids = results['ids'][0]
                
                for i, (doc_id, document, metadata, distance) in enumerate(
                    zip(ids, documents, metadatas, distances)
                ):
                    similarity_score = 1 - distance
                    if similarity_score >= score_threshold:
                        retrieved_docs.append({
                            'id': doc_id,
                            'content': document,
                            'metadata': metadata,
                            'similarity_score': similarity_score,
                            'distance': distance,
                            'rank': i + 1
                        })
                
                print(f"âœ“ Found {len(retrieved_docs)} relevant verses")
            else:
                print("âœ— No documents found")
            
            return retrieved_docs
            
        except Exception as e:
            print(f"âœ— Error during retrieval: {e}")
            return []
    
    def retrieve_with_filter(
        self,
        query: str,
        book: str = None,
        chapter: int = None,
        top_k: int = 5
    ) -> List[Dict[str, Any]]:
        """
        Semantic search with metadata filtering (e.g., search 'love' only in John)
        """
        print(f"ðŸ“– Semantic search for '{query}' in {book}")
        
        # Generate query embedding
        query_embedding = self.embedding_manager.generate_embeddings([query])[0]
        
        try:
            # Build where filter
            where_conditions = []
            if book:
                where_conditions.append({"book": {"$eq": book}})
            if chapter:
                where_conditions.append({"chapter": {"$eq": chapter}})
            
            where_filter = None
            if len(where_conditions) == 1:
                where_filter = where_conditions[0]
            elif len(where_conditions) > 1:
                where_filter = {"$and": where_conditions}
            
            results = self.vector_store.collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=top_k,
                where=where_filter
            )
            
            retrieved_docs = []
            if results['documents'] and results['documents'][0]:
                documents = results['documents'][0]
                metadatas = results['metadatas'][0]
                distances = results['distances'][0]
                ids = results['ids'][0]
                
                for i, (doc_id, document, metadata, distance) in enumerate(
                    zip(ids, documents, metadatas, distances)
                ):
                    similarity_score = 1 - distance
                    retrieved_docs.append({
                        'id': doc_id,
                        'content': document,
                        'metadata': metadata,
                        'similarity_score': similarity_score,
                        'distance': distance,
                        'rank': i + 1
                    })
                
                print(f"âœ“ Found {len(retrieved_docs)} relevant verses")
            else:
                print("âœ— No documents found")
            
            return retrieved_docs
            
        except Exception as e:
            print(f"âœ— Error during filtered retrieval: {e}")
            return []
    
    def retrieve_by_reference(
        self, 
        book: str, 
        chapter: int = None, 
        verse: int = None,
        top_k: int = 50
    ) -> List[Dict[str, Any]]:
        """
        Retrieve verses by biblical reference (Issue 2 solution)
        
        Args:
            book: Book name (e.g., "Genesis")
            chapter: Chapter number (optional)
            verse: Verse number (optional)
            top_k: Maximum results to return
        """
        ref_str = f"{book}"
        if chapter: ref_str += f" {chapter}"
        if verse: ref_str += f":{verse}"
        print(f"Looking up reference: {ref_str}")
        
        try:
            # Build ChromaDB where filter with proper $and operator
            where_conditions = [{"book": {"$eq": book}}]
            
            if chapter is not None:
                where_conditions.append({"chapter": {"$eq": chapter}})
            if verse is not None:
                where_conditions.append({"verse": {"$eq": verse}})
            
            # Use $and if multiple conditions
            if len(where_conditions) == 1:
                where_filter = where_conditions[0]
            else:
                where_filter = {"$and": where_conditions}
            
            results = self.vector_store.collection.get(
                where=where_filter,
                limit=top_k
            )
            
            retrieved_docs = []
            if results['documents']:
                for i, (doc_id, document, metadata) in enumerate(
                    zip(results['ids'], results['documents'], results['metadatas'])
                ):
                    retrieved_docs.append({
                        'id': doc_id,
                        'content': document,
                        'metadata': metadata,
                        'rank': i + 1
                    })
                
                print(f"Found {len(retrieved_docs)} verse(s)")
            else:
                print("No verses found for that reference")
            
            return retrieved_docs
            
        except Exception as e:
            print(f"Error during reference retrieval: {e}")
            return []


class SmartBibleRetriever:
    """
    Smart retrieval combining reference parsing with semantic search (Issue 3 solution)
    This is your main interface!
    """
    
    def __init__(self, rag_retriever: RAGRetriever):
        self.rag_retriever = rag_retriever
        
        # Common Bible book variations
        self.book_mappings = {
            'genesis': 'Genesis', 'gen': 'Genesis',
            'exodus': 'Exodus', 'ex': 'Exodus', 'exod': 'Exodus',
            'matthew': 'Matthew', 'matt': 'Matthew', 'mt': 'Matthew',
            'john': 'John', 'jn': 'John',
            '1 john': '1 John', '2 john': '2 John', '3 john': '3 John',
            'revelation': 'Revelation', 'rev': 'Revelation',
            'psalms': 'Psalms', 'psalm': 'Psalms', 'ps': 'Psalms',
            'romans': 'Romans', 'rom': 'Romans',
            'acts': 'Acts',
            # Add more as needed
        }
    
    def retrieve(self, query: str, top_k: int = 5) -> List[Dict[str, Any]]:
        """
        Smart retrieval that automatically detects query type
        
        Args:
            query: User's search query
            top_k: Number of results to return
        
        Returns:
            List of relevant verses
        """
        # Check for book-specific topical query (e.g., "love in John")
        book_filter = self._extract_book_filter(query)
        
        # Try to parse as biblical reference first
        ref = self._parse_reference(query)
        
        if ref:
            # Use metadata filtering for specific references
            return self.rag_retriever.retrieve_by_reference(
                book=ref['book'],
                chapter=ref.get('chapter'),
                verse=ref.get('verse'),
                top_k=top_k
            )
        elif book_filter:
            # Topical search within a specific book
            return self.rag_retriever.retrieve_with_filter(
                query=query,
                book=book_filter,
                top_k=top_k
            )
        else:
            # Use semantic search for general topical queries
            return self.rag_retriever.retrieve(query, top_k=top_k)
    
    def _extract_book_filter(self, query: str) -> str:
        """
        Extract book name from queries like 'love in John' or 'faith from Romans'
        """
        query_lower = query.lower()
        
        # Patterns like "from the book of john", "in john", "from john"
        patterns = [
            r'from\s+(?:the\s+book\s+of\s+)?(\d?\s?[a-z]+)',
            r'in\s+(?:the\s+book\s+of\s+)?(\d?\s?[a-z]+)',
        ]
        
        for pattern in patterns:
            match = re.search(pattern, query_lower)
            if match:
                book_raw = match.group(1).strip()
                book = self._normalize_book_name(book_raw)
                if book:
                    return book
        
        return None
    
    def _parse_reference(self, query: str) -> Dict[str, Any]:
        """
        Parse biblical references from natural language
        
        Handles patterns like:
        - "Genesis 1:1"
        - "John 3:16"
        - "1 John 2:3"
        - "Genesis chapter 1 verse 1"
        - "What does Genesis 1:1 say?"
        
        Returns:
            {'book': str, 'chapter': int, 'verse': int} or None
        """
        query_lower = query.lower()
        
        # Pattern 1: "genesis 1:1" or "1 john 3:16"
        pattern1 = r'(\d?\s?[a-z]+)\s+(\d+)[:\s]+(\d+)'
        match = re.search(pattern1, query_lower)
        
        if match:
            book_raw = match.group(1).strip()
            book = self._normalize_book_name(book_raw)
            if book:
                return {
                    'book': book,
                    'chapter': int(match.group(2)),
                    'verse': int(match.group(3))
                }
        
        # Pattern 2: "genesis chapter 1 verse 1"
        pattern2 = r'(\d?\s?[a-z]+)\s+chapter\s+(\d+)\s+verse\s+(\d+)'
        match2 = re.search(pattern2, query_lower)
        
        if match2:
            book_raw = match2.group(1).strip()
            book = self._normalize_book_name(book_raw)
            if book:
                return {
                    'book': book,
                    'chapter': int(match2.group(2)),
                    'verse': int(match2.group(3))
                }
        
        # Pattern 3: Just book and chapter "genesis 1" or "john 3"
        pattern3 = r'(\d?\s?[a-z]+)\s+(\d+)(?:\D|$)'
        match3 = re.search(pattern3, query_lower)
        
        if match3:
            book_raw = match3.group(1).strip()
            book = self._normalize_book_name(book_raw)
            if book:
                return {
                    'book': book,
                    'chapter': int(match3.group(2))
                }
        
        return None
    
    def _normalize_book_name(self, book_raw: str) -> str:
        """Convert various book name formats to standard form"""
        book_lower = book_raw.lower().strip()
        
        # Check mappings
        if book_lower in self.book_mappings:
            return self.book_mappings[book_lower]
        
        # Try title case as fallback
        return book_raw.strip().title()
    
    def format_results(self, results: List[Dict[str, Any]]) -> str:
        """Pretty print results for display"""
        if not results:
            return "No verses found."
        
        output = []
        for result in results:
            meta = result['metadata']
            reference = f"{meta['book']} {meta['chapter']}:{meta['verse']}"
            content = result['content']
            
            if 'similarity_score' in result:
                score = result['similarity_score']
                output.append(f"{reference} (similarity: {score:.2f})\n{content}\n")
            else:
                output.append(f"{reference}\n{content}\n")
        
        return "\n".join(output)

In [20]:
rag_retriever = RAGRetriever(vectorstore, embedding_manager)
smart_retriever = SmartBibleRetriever(rag_retriever)

# Example queries
print("=" * 60)

# Reference-based queries (uses metadata filtering)
results = smart_retriever.retrieve("What does Genesis 1:1 say?")
print(smart_retriever.format_results(results))

print("=" * 60)

results = smart_retriever.retrieve("John 3 16")
print(smart_retriever.format_results(results))

print("=" * 60)

# Topical queries (uses semantic search)
results = smart_retriever.retrieve("verses about faith")
print(smart_retriever.format_results(results))

Looking up reference: Genesis 1:1
Found 1 verse(s)
Genesis 1:1
In the beginning God{After "God," the Hebrew has the two letters "Aleph Tav" (the first and last letters of the Hebrew alphabet) as a grammatical marker.} created the heavens and the earth.

Looking up reference: John 3:16
Found 1 verse(s)
John 3:16
For God so loved the world, that he gave his one and only Son, that whoever believes in him should not perish, but have eternal life.

ðŸ“– Semantic search for: 'verses about faith'
Generating embeddings for 1 texts...
Generated embeddings with shape: (1, 768)
âœ“ Found 5 relevant verses
James 2:22 (similarity: 0.37)
You see that faith worked with his works, and by works faith was perfected;

Romans 10:17 (similarity: 0.29)
So faith comes by hearing, and hearing by the word of God.

Romans 1:17 (similarity: 0.28)
For in it is revealed God's righteousness from faith to faith. As it is written, "But the righteous shall live by faith."{Habakkuk 2:4}

Luke 17:5 (similarity: 0.27)
Th