# All 23 Books - Vector Database

**INSTRUCTIONS: Run ALL cells in order from top to bottom**
Runtime: ~30 minutes | Cost: ~$1.50

In [1]:
# Imports
import os
import json
import re
import time
from typing import List, Dict

import openai
import tiktoken
import chromadb
from tqdm import tqdm
from dotenv import load_dotenv

print("✅ Step 1: Imports loaded")

✅ Step 1: Imports loaded


In [2]:
# Configuration
BASE_DIR = r"C:\Users\DELL\Documents\gesha_la_rag"
EXTRACTED_TEXT_DIR = os.path.join(BASE_DIR, "extracted_text")
EMBEDDINGS_DIR = os.path.join(BASE_DIR, "embeddings")
VECTORDB_DIR = os.path.join(BASE_DIR, "vector_db")

os.makedirs(EMBEDDINGS_DIR, exist_ok=True)
os.makedirs(VECTORDB_DIR, exist_ok=True)

# Use unique collection name
COLLECTION_NAME = f"all_books_{int(time.time())}"

print(f"✅ Step 2: Configuration set")
print(f"   Collection: {COLLECTION_NAME}")

✅ Step 2: Configuration set
   Collection: all_books_1766864466


In [3]:
# Load API key
load_dotenv(os.path.join(BASE_DIR, ".env"))
client = openai.OpenAI(api_key=os.getenv('OPENAI_API_KEY'))
encoding = tiktoken.get_encoding("cl100k_base")

print("✅ Step 3: API key loaded")

✅ Step 3: API key loaded


In [4]:
# Chunking function
def chunk_text(text: str, max_tokens: int = 4000, overlap_ratio: float = 0.33) -> List[str]:
    """Split text with 1/3 overlap, handling long paragraphs."""
    overlap_tokens = int(max_tokens * overlap_ratio)
    paragraphs = re.split(r'\n\s*\n', text)
    
    chunks = []
    current = []
    current_tokens = 0
    
    for para in paragraphs:
        if not para.strip():
            continue
        
        para_tokens = len(encoding.encode(para))
        
        # Split long paragraphs at sentences
        if para_tokens > max_tokens:
            sentences = re.split(r'(?<=[.!?])\s+', para)
            for sent in sentences:
                sent_tokens = len(encoding.encode(sent))
                
                if current_tokens + sent_tokens > max_tokens and current_tokens > 0:
                    chunks.append("\n\n".join(current))
                    
                    # Create overlap
                    overlap = []
                    overlap_count = 0
                    for p in reversed(current):
                        p_tok = len(encoding.encode(p))
                        if overlap_count + p_tok <= overlap_tokens:
                            overlap.insert(0, p)
                            overlap_count += p_tok
                        else:
                            break
                    
                    current = overlap
                    current_tokens = overlap_count
                
                current.append(sent)
                current_tokens += sent_tokens
        else:
            if current_tokens + para_tokens > max_tokens and current_tokens > 0:
                chunks.append("\n\n".join(current))
                
                # Create overlap
                overlap = []
                overlap_count = 0
                for p in reversed(current):
                    p_tok = len(encoding.encode(p))
                    if overlap_count + p_tok <= overlap_tokens:
                        overlap.insert(0, p)
                        overlap_count += p_tok
                    else:
                        break
                
                current = overlap
                current_tokens = overlap_count
            
            current.append(para)
            current_tokens += para_tokens
    
    if current:
        chunks.append("\n\n".join(current))
    
    return chunks

print("✅ Step 4: Chunking function ready")

✅ Step 4: Chunking function ready


In [5]:
# Find all JSON files
import glob

json_files = glob.glob(os.path.join(EXTRACTED_TEXT_DIR, "*.json"))

print(f"✅ Step 5: Found {len(json_files)} books")
for f in json_files[:5]:
    print(f"   - {os.path.basename(f)}")
if len(json_files) > 5:
    print(f"   ... and {len(json_files) - 5} more")

✅ Step 5: Found 26 books
   - Clear_Light_of_Bliss.json
   - Essence-of-Vajrayana.json
   - Great-Treasury-of-Merit.json
   - Guide_to_Bodhisattva_s_Way_of_Life_2020.json
   - Heart_Jewel.json
   ... and 21 more


In [6]:
# Process all books to chunks
print("\nProcessing all books...")
all_chunks = []

for json_path in tqdm(json_files, desc="Loading books"):
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    book_title = data.get('book_title', 'Unknown')
    
    for chapter in data.get('chapters', []):
        text = chapter.get('content', '')
        if not text.strip():
            continue
        
        text_chunks = chunk_text(text, max_tokens=4000, overlap_ratio=0.33)
        
        position_to_page = data.get('position_to_page', {})
        start_page = position_to_page.get(str(chapter.get('start_position', 0)), 1)
        
        for idx, content in enumerate(text_chunks):
            all_chunks.append({
                "text": content,
                "metadata": {
                    "book_title": book_title,
                    "creator": data.get('creator', 'Geshe Kelsang Gyatso'),
                    "chapter_title": chapter.get('chapter_title') or 'Untitled',
                    "start_page": start_page,
                    "chunk_index": idx
                }
            })

print(f"\n✅ Step 6: Created {len(all_chunks)} chunks from {len(json_files)} books")

# Verify chunk sizes
chunk_sizes = [len(encoding.encode(c["text"])) for c in all_chunks]
print(f"   Avg size: {sum(chunk_sizes)/len(chunk_sizes):.0f} tokens")
print(f"   Max size: {max(chunk_sizes)} tokens")

over_max = [s for s in chunk_sizes if s > 4000]
if over_max:
    print(f"   ⚠️ {len(over_max)} chunks over 4000 tokens (max: {max(over_max)})")


Processing all books...


Loading books: 100%|██████████| 26/26 [00:10<00:00,  2.50it/s]



✅ Step 6: Created 2119 chunks from 26 books
   Avg size: 1543 tokens
   Max size: 7230 tokens
   ⚠️ 3 chunks over 4000 tokens (max: 7230)


In [7]:
# Create embeddings (THIS TAKES ~20-25 MINUTES)
print("\nCreating embeddings...")
print(f"Processing {len(all_chunks)} chunks in batches of 10")
print("This will take approximately 20-25 minutes...")

chunks_with_embeddings = []
failed_count = 0

for i in tqdm(range(0, len(all_chunks), 10), desc="Embedding"):
    batch = all_chunks[i:i + 10]
    batch_texts = [c["text"] for c in batch]
    
    try:
        response = client.embeddings.create(
            model="text-embedding-3-small",
            input=batch_texts
        )
        
        for j, chunk in enumerate(batch):
            chunk_copy = chunk.copy()
            chunk_copy["embedding"] = response.data[j].embedding
            chunks_with_embeddings.append(chunk_copy)
        
        time.sleep(0.5)  # Rate limiting
        
    except Exception as e:
        print(f"\nError on batch {i//10}: {e}")
        failed_count += 1
        for chunk in batch:
            chunk_copy = chunk.copy()
            chunk_copy["embedding"] = None
            chunks_with_embeddings.append(chunk_copy)

successful = sum(1 for c in chunks_with_embeddings if c["embedding"] is not None)
print(f"\n✅ Step 7: Created {successful}/{len(all_chunks)} embeddings")
if failed_count > 0:
    print(f"   Failed batches: {failed_count}")


Creating embeddings...
Processing 2119 chunks in batches of 10
This will take approximately 20-25 minutes...


Embedding: 100%|██████████| 212/212 [03:15<00:00,  1.08it/s]


✅ Step 7: Created 2119/2119 embeddings





In [8]:
# Save embeddings
embeddings_path = os.path.join(EMBEDDINGS_DIR, "all_books_embeddings.json")

print(f"Saving embeddings to {embeddings_path}...")
with open(embeddings_path, 'w') as f:
    json.dump(chunks_with_embeddings, f)

print(f"✅ Step 8: Saved embeddings ({os.path.getsize(embeddings_path) / 1024 / 1024:.1f} MB)")

Saving embeddings to C:\Users\DELL\Documents\gesha_la_rag\embeddings\all_books_embeddings.json...
✅ Step 8: Saved embeddings (84.1 MB)


In [9]:
# Create ChromaDB collection
print(f"\nCreating collection: {COLLECTION_NAME}")

chroma_client = chromadb.PersistentClient(path=VECTORDB_DIR)
collection = chroma_client.create_collection(name=COLLECTION_NAME)

ids = []
documents = []
embeddings = []
metadatas = []

for i, chunk in enumerate(chunks_with_embeddings):
    if chunk["embedding"] is None:
        continue
    
    ids.append(f"chunk_{i}")
    documents.append(chunk["text"])
    embeddings.append(chunk["embedding"])
    
    # Clean metadata
    meta = chunk["metadata"].copy()
    for key, value in meta.items():
        if value is None:
            meta[key] = ""
    metadatas.append(meta)

# Add to database in batches
for i in tqdm(range(0, len(ids), 100), desc="Adding to ChromaDB"):
    end = min(i + 100, len(ids))
    collection.add(
        ids=ids[i:end],
        documents=documents[i:end],
        embeddings=embeddings[i:end],
        metadatas=metadatas[i:end]
    )

print(f"✅ Step 9: Added {len(ids)} chunks to database")


Creating collection: all_books_1766864466


Adding to ChromaDB: 100%|██████████| 22/22 [00:19<00:00,  1.12it/s]

✅ Step 9: Added 2119 chunks to database





In [10]:
# Test query 1 - Clear Light content
print("\nTest Query 1: Clear Light visualization")
print("="*70)

query_embedding = client.embeddings.create(
    model="text-embedding-3-small",
    input=["visualize clear light at heart center"]
).data[0].embedding

results = collection.query(
    query_embeddings=[query_embedding],
    n_results=5
)

for i, (doc, meta) in enumerate(zip(results['documents'][0], results['metadatas'][0]), 1):
    print(f"\n[{i}] {meta['book_title']} - {meta['chapter_title']}")
    print(f"    Page {meta['start_page']}")
    print(f"    {doc[:150]}...")


Test Query 1: Clear Light visualization

[1] Clear Light of Bliss - Untitled
    Page 1
    Clear Light of Bliss...

[2] Mahamudra Tantra - Untitled
    Page 32
    page break When the subtle wind mounted by the mind of white appearance dissolves, the mind of red increase arises. This mind and its mounted wind are...

[3] The Oral Instructions of Mahamudra - Untitled
    Page 90
    One-pronged vajra HOW TO MEDITATE ON THE STAGE OF THE VAJRA OF VARIOUS QUALITIES WITH SEED We begin by visualizing our central channel clearly, and im...

[4] Modern Buddhism 2: Tantra - Untitled
    Page 13
    When the subtle wind mounted by the mind of white appearance dissolves, the mind of red increase arises.

This mind and its mounted wind are more subt...

[5] Modern Buddhism 2: Tantra - Untitled
    Page 13
    The very subtle wind is our own body, or continuously residing body.

The very subtle mind, or indestructible mind, is our own mind, or continuously r...


In [11]:
# Test query 2 - Cross-book search
print("\nTest Query 2: Lamrim stages")
print("="*70)

query_embedding = client.embeddings.create(
    model="text-embedding-3-small",
    input=["stages of the path to enlightenment lamrim"]
).data[0].embedding

results = collection.query(
    query_embeddings=[query_embedding],
    n_results=5
)

for i, (doc, meta) in enumerate(zip(results['documents'][0], results['metadatas'][0]), 1):
    print(f"\n[{i}] {meta['book_title']} - {meta['chapter_title']}")
    print(f"    Page {meta['start_page']}")
    print(f"    {doc[:150]}...")


Test Query 2: Lamrim stages

[1] Joyful Path of Good Fortune - Untitled
    Page 5
    The Stages of the Path The great Buddhist monastic universities of Nalanda and Vikramashila each developed their own discourse style. According to the...

[2] New Meditation Handbook, The - Untitled
    Page 3
    Preface Buddha, the founder of Buddhism, appeared in this world in 624 bc. Just as doctors give different medicine for people with different illnesses...

[3]  - Untitled
    Page 252
    Introduction Developing the realizations of the stages of the path to enlightenment depends upon four things: accumulating merit, purifying negativiti...

[4] Joyful Path of Good Fortune - Untitled
    Page 523
    Introduction Developing the realizations of the stages of the path to enlightenment depends upon four things: accumulating merit, purifying negativiti...

[5] Joyful Path of Good Fortune - Untitled
    Page 3
    Preface Although there are countless living beings, humans and non-humans, all are

In [12]:
# Test query 3 - Compassion
print("\nTest Query 3: Compassion practice")
print("="*70)

query_embedding = client.embeddings.create(
    model="text-embedding-3-small",
    input=["how to develop compassion and bodhicitta"]
).data[0].embedding

results = collection.query(
    query_embeddings=[query_embedding],
    n_results=5
)

for i, (doc, meta) in enumerate(zip(results['documents'][0], results['metadatas'][0]), 1):
    print(f"\n[{i}] {meta['book_title']} - {meta['chapter_title']}")
    print(f"    Page {meta['start_page']}")
    print(f"    {doc[:150]}...")


Test Query 3: Compassion practice

[1] Introduction to Buddhism - Untitled
    Page 49
    Becoming a Bodhisattva As already explained, the most meaningful use to which we can put our precious human life is not to attain liberation from suff...

[2] Living Meaningfully, Dying Joyfully - Untitled
    Page 94
    How then does it exist?

To understand this we need to study books such as Modern Buddhism, The New Meditation Handbook and The New Heart of Wisdom, w...

[3] How to Transform Your Life - Untitled
    Page 1
    We should not be in a hurry to see results, but instead practice patiently and sincerely.

Expecting quick results is itself based on self-cherishing ...

[4] New Meditation Handbook, The - Untitled
    Page 60
    18. Bodhichitta Bodhichitta literally means “mind of enlightenment”—bodhi is the Sanskrit word for “enlightenment” and chitta is the word for “mind.” ...

[5] The New Eight Steps to Happiness - Untitled
    Page 94
    Great Compassion Whenever I see unfortun

In [13]:
# Summary
print("\n" + "="*70)
print("COMPLETE - ALL 23 BOOKS!")
print("="*70)
print(f"Collection: {COLLECTION_NAME}")
print(f"Books processed: {len(json_files)}")
print(f"Total chunks: {len(all_chunks)}")
print(f"Embeddings created: {successful}")
print(f"Location: {VECTORDB_DIR}")
print("="*70)
print("\n✅ Full corpus ready for production use")
print("\nYou now have:")
print(f"  - clear_light_1766863876 ({101} chunks) - for Phase 2/3 work")
print(f"  - {COLLECTION_NAME} ({successful} chunks) - for full research")


COMPLETE - ALL 23 BOOKS!
Collection: all_books_1766864466
Books processed: 26
Total chunks: 2119
Embeddings created: 2119
Location: C:\Users\DELL\Documents\gesha_la_rag\vector_db

✅ Full corpus ready for production use

You now have:
  - clear_light_1766863876 (101 chunks) - for Phase 2/3 work
  - all_books_1766864466 (2119 chunks) - for full research
