 # Geshe Kelsang Gyatso Teachings Processing Script



 This script processes EPUB files of Geshe Kelsang Gyatso's teachings:

 1. Extracts text from EPUB files

 2. Splits text into chunks with metadata

 3. Creates embeddings using OpenAI's API

 4. Builds a vector database for semantic search



 Run this script once to prepare the data for use with the Explorer interface.

 ## Setup

 Install required packages and mount Google Drive

In [None]:
# SETUP - PACKAGES AND DIRECTORIES
# ===============================
# Announce that the setup cell has started.
print("Setting up the Geshe Kelsang Gyatso Teachings Processing Script...")

# Install required packages.
# NOTE: the leading `!` is an IPython/notebook shell escape, so this line only
# runs inside a notebook environment, not as a plain .py script. `-q` keeps
# pip's output quiet.
!pip install openai chromadb ebooklib beautifulsoup4 tiktoken tqdm python-dotenv numpy -q

# Import necessary libraries
import os
import json
import glob
import re
from pathlib import Path
import ebooklib
from ebooklib import epub
from bs4 import BeautifulSoup
import openai
import tiktoken
import time
import logging
import numpy as np
from tqdm.notebook import tqdm
from google.colab import drive
from dotenv import load_dotenv

# Mount Google Drive to access files
print("Mounting Google Drive...")
drive.mount('/content/drive')

# Define directory structure (everything lives under one Drive folder)
BASE_DIR = "/content/drive/MyDrive/master_rag"
EPUB_DIR = f"{BASE_DIR}/epub_directory"  # Input: EPUB files to process
TEXT_DIR = f"{BASE_DIR}/extracted_text"  # Output: extracted text as JSON
EMBEDDINGS_DIR = f"{BASE_DIR}/embeddings"  # Output: chunks with embeddings
VECTORDB_DIR = f"{BASE_DIR}/vector_db"  # Output: persistent Chroma database
LOG_DIR = f"{BASE_DIR}/logs"  # Output: processing.log

# Create every directory the pipeline reads from or writes to. EPUB_DIR is
# included so that the folder the user is asked to drop EPUB files into exists.
for dir_path in [BASE_DIR, EPUB_DIR, TEXT_DIR, EMBEDDINGS_DIR, VECTORDB_DIR, LOG_DIR]:
    os.makedirs(dir_path, exist_ok=True)

print(f"✅ Using EPUB directory: {EPUB_DIR}")

# Configure logging to both the console and a log file.
# force=True replaces handlers already attached to the root logger; without it
# basicConfig() silently does nothing when handlers exist (e.g. when this cell
# is re-run, or when the notebook runtime pre-configured logging).
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(),  # Console handler
        logging.FileHandler(f"{LOG_DIR}/processing.log")  # File handler
    ],
    force=True,
)
logger = logging.getLogger(__name__)


 ## API Key Setup

 Configure the OpenAI API key for embeddings

In [None]:
# API KEY SETUP
# ============

# Create .env file template if it doesn't exist
env_file_path = f"{BASE_DIR}/.env"
# Placeholder written into the template; it must never be treated as a real key.
_OPENAI_KEY_PLACEHOLDER = "your_openai_api_key_here"
if not os.path.exists(env_file_path):
    with open(env_file_path, 'w') as f:
        f.write(
            "# API Keys for Geshe Kelsang Gyatso Teachings Explorer\n"
            f"OPENAI_API_KEY={_OPENAI_KEY_PLACEHOLDER}\n"
        )
    print(f"✅ Created .env template at {env_file_path}")
    print("Please edit this file to add your actual OpenAI API key before proceeding.")
else:
    print(f"✅ Found existing .env file at {env_file_path}")

# Load API key from .env file
load_dotenv(env_file_path)

# Setup OpenAI client. It stays None unless a usable key is available, which is
# what create_embeddings_batch() checks before calling the API.
openai_client = None
openai_api_key = os.getenv('OPENAI_API_KEY')
if openai_api_key:
    openai_api_key = openai_api_key.strip()

# An unedited template still contains the placeholder, which is a non-empty
# string; treat it the same as a missing key instead of building a client
# that can only fail later.
if not openai_api_key or openai_api_key == _OPENAI_KEY_PLACEHOLDER:
    print("\n⚠️ OpenAI API key not found. Please edit the .env file in this folder:")
    print(f"{BASE_DIR}")
    print("Add your OpenAI API key to the file, replacing the placeholder text.")
else:
    print("✅ OpenAI API key found")
    openai_client = openai.OpenAI(api_key=openai_api_key)

# Check if there are EPUB files in the directory
epub_files = glob.glob(f"{EPUB_DIR}/*.epub")
if not epub_files:
    print(f"\n⚠️ No EPUB files found in {EPUB_DIR}")
    print("Please add EPUB files of Geshe Kelsang Gyatso's teachings to this folder.")
else:
    print(f"✅ Found {len(epub_files)} EPUB files in {EPUB_DIR}")


 ## Text Processing Functions

 Functions for extracting and chunking text from EPUB files, creating embeddings, and building the vector database

In [None]:
# TEXT PROCESSING FUNCTIONS
# =======================

def improved_chunking(text, max_tokens=4000, overlap=200):
    """Split text into chunks at natural boundaries where possible.

    The text is split into paragraphs (blank-line separated). Paragraphs are
    packed greedily into chunks of roughly ``max_tokens`` tokens. Trailing
    paragraphs of a finished chunk are carried into the next one as long as
    they total at most ``overlap`` tokens, so a chunk may reach
    ``max_tokens + overlap`` tokens.

    A paragraph that alone exceeds ``max_tokens`` is broken up at sentence
    boundaries, and a sentence that is still too long is cut into token
    windows. Without this, text that contains no blank lines would be
    returned as a single oversized chunk.

    Args:
        text: The text to split.
        max_tokens: Target token budget per chunk (expected to be positive).
        overlap: Maximum number of tokens repeated from the previous chunk.

    Returns:
        A list of chunk strings. Empty or whitespace-only text yields ``[]``.
    """
    encoding = tiktoken.get_encoding("cl100k_base")

    def split_oversized(paragraph):
        """Yield (text, token_count) pieces of *paragraph* within max_tokens."""
        window_size = max(1, max_tokens)  # guards range() against a zero step
        for sentence in re.split(r'(?<=[.!?])\s+', paragraph):
            if not sentence:
                continue
            sentence_tokens = encoding.encode(sentence)
            if len(sentence_tokens) <= max_tokens:
                yield sentence, len(sentence_tokens)
                continue
            # Last resort: cut by token windows. A window edge can fall inside
            # a multi-byte character, which decodes to a replacement character.
            for start in range(0, len(sentence_tokens), window_size):
                window = sentence_tokens[start:start + window_size]
                yield encoding.decode(window), len(window)

    # Build packing units as (joiner, text, token_count). The joiner is the
    # separator placed *before* the unit when it is not first in a chunk:
    # a blank line between paragraphs, a space between sentences that were
    # split out of the same paragraph.
    units = []
    for para in re.split(r'\n\s*\n', text):
        if not para.strip():
            continue  # nothing to embed
        para_token_count = len(encoding.encode(para))
        if para_token_count <= max_tokens:
            units.append(("\n\n", para, para_token_count))
            continue
        joiner = "\n\n"
        for piece, piece_token_count in split_oversized(para):
            units.append((joiner, piece, piece_token_count))
            joiner = " "

    def render(parts):
        """Join units into chunk text, dropping the first unit's joiner."""
        pieces = []
        for position, (joiner, piece, _) in enumerate(parts):
            if position:
                pieces.append(joiner)
            pieces.append(piece)
        return "".join(pieces)

    chunks = []
    current_chunk = []
    current_tokens = 0

    for unit in units:
        unit_tokens = unit[2]

        # Adding this unit would overflow the budget: close the current chunk.
        if current_chunk and current_tokens + unit_tokens > max_tokens:
            chunks.append(render(current_chunk))

            # Seed the next chunk with trailing units that fit in the overlap
            # budget; token counts are cached, so nothing is re-encoded.
            carried = []
            carried_tokens = 0
            for previous in reversed(current_chunk):
                if carried_tokens + previous[2] > overlap:
                    break
                carried.insert(0, previous)
                carried_tokens += previous[2]

            current_chunk = carried
            current_tokens = carried_tokens

        current_chunk.append(unit)
        current_tokens += unit_tokens

    # Add the last chunk if there's anything left
    if current_chunk:
        chunks.append(render(current_chunk))

    return chunks

def extract_text_with_metadata(epub_path):
    """Extract text from an EPUB while preserving source and structure metadata.

    Every document item in the book is parsed as HTML, reduced to plain text
    with whitespace collapsed to single spaces, and stored as one "chapter".
    Page numbers are estimated at a fixed number of characters per page.

    Args:
        epub_path: Path to the ``.epub`` file.

    Returns:
        A dict with keys ``book_id``, ``book_title``, ``creator``,
        ``chapters`` (list of dicts with ``content``, ``chapter_title`` and
        ``start_position``), ``position_to_page`` (character offset ->
        estimated page number) and ``total_length``; or ``None`` if the EPUB
        could not be processed. ``chapter_title`` is ``None`` when the item
        has no h1/h2/h3 heading.
    """
    file_name = os.path.basename(epub_path)
    logger.info(f"Extracting text from {file_name}")
    print(f"Extracting text from {file_name}...")

    try:
        book = epub.read_epub(epub_path)

        # Title: keep the fallback unless the metadata holds a non-empty value.
        book_title = "Unknown Title"
        try:
            title_data = book.get_metadata('DC', 'title')
            if title_data and title_data[0] and title_data[0][0]:
                book_title = title_data[0][0]
        except Exception as e:
            logger.warning(f"Could not extract title: {e}")

        # Strip only the extension; str.replace would also remove ".epub"
        # occurring elsewhere in the file name.
        book_id = os.path.splitext(file_name)[0]

        # Extract creator if available
        creator = "Geshe Kelsang Gyatso"
        try:
            creator_data = book.get_metadata('DC', 'creator')
            if creator_data and creator_data[0] and creator_data[0][0]:
                creator = creator_data[0][0]
        except Exception as e:
            logger.warning(f"Could not extract creator: {e}")

        chapters = []
        # Track current position for page number estimation
        current_position = 0
        position_to_page = {}  # Map character positions to estimated page numbers
        chars_per_page = 2000  # Approximate characters per page

        # Materialize once so the count and the loop share a single pass and
        # the progress bar knows its total.
        items = list(book.get_items())

        print(f"Processing {len(items)} items in {book_title}...")

        for item in tqdm(items, desc="Book items", leave=False):
            if item.get_type() != ebooklib.ITEM_DOCUMENT:
                continue  # skip images, stylesheets, etc.

            try:
                content = item.get_content().decode('utf-8', errors='replace')
                soup = BeautifulSoup(content, 'html.parser')

                # Try to extract chapter/section title
                chapter_title = None
                heading = soup.find(['h1', 'h2', 'h3'])
                if heading:
                    chapter_title = heading.get_text().strip()

                # Extract text content, collapsing all whitespace runs
                text = re.sub(r'\s+', ' ', soup.get_text()).strip()
                if not text:
                    continue

                # Calculate page numbers (estimation): one marker every
                # chars_per_page characters, keyed by absolute book offset.
                for i in range(0, len(text), chars_per_page):
                    absolute = current_position + i
                    position_to_page[absolute] = absolute // chars_per_page + 1

                # Add chapter with metadata
                chapters.append({
                    "content": text,
                    "chapter_title": chapter_title,
                    "start_position": current_position,
                })

                current_position += len(text)
            except Exception as e:
                # Best effort: one broken item must not abort the whole book.
                logger.error(f"Error processing item in {book_title}: {e}")
                print(f"Error processing item in {book_title}: {str(e)[:100]}...")

        logger.info(f"Completed extraction for {book_title}: {len(chapters)} chapters, {current_position} characters")
        print(f"✅ Completed extraction for {book_title}: {len(chapters)} chapters")

        return {
            "book_id": book_id,
            "book_title": book_title,
            "creator": creator,
            "chapters": chapters,
            "position_to_page": position_to_page,
            "total_length": current_position
        }
    except Exception as e:
        logger.error(f"Error processing EPUB {epub_path}: {e}")
        print(f"❌ Error processing EPUB {epub_path}: {str(e)}")
        return None

def split_into_chunks_with_metadata(book_data, max_tokens=4000, overlap=200):
    """Split book text into chunks while preserving metadata.

    Each chapter is chunked with ``improved_chunking``; every chunk gets the
    book/chapter identifiers, an estimated page range and a running index.

    Args:
        book_data: Dict as returned by ``extract_text_with_metadata``.
        max_tokens: Token budget per chunk, passed to ``improved_chunking``.
        overlap: Token overlap between chunks, passed to ``improved_chunking``.

    Returns:
        A list of dicts with keys ``text`` and ``metadata`` (``book_id``,
        ``book_title``, ``chapter_title``, ``start_page``, ``end_page``,
        ``chunk_index``).
    """
    import bisect

    logger.info(f"Chunking text for {book_data['book_title']}")
    print(f"Chunking text for {book_data['book_title']}...")

    # Sorted page markers, built once. int() also accepts the string keys the
    # mapping has after a JSON round trip.
    page_marks = sorted(
        (int(position), page)
        for position, page in book_data["position_to_page"].items()
    )
    mark_positions = [position for position, _ in page_marks]

    def page_at(position):
        """Return the page of the last marker at or before *position*."""
        idx = bisect.bisect_right(mark_positions, position) - 1
        return page_marks[idx][1] if idx >= 0 else 1

    chunks = []

    for chapter in tqdm(book_data["chapters"], desc="Chapters", leave=False):
        text = chapter["content"]
        start_position = chapter["start_position"]
        # Never store None in metadata: some Chroma versions reject None values.
        chapter_title = chapter["chapter_title"] or ""

        # Use improved chunking
        text_chunks = improved_chunking(text, max_tokens, overlap)

        # Locate each chunk in the chapter by searching for its opening text.
        # (The overlap is a token count, so it cannot be subtracted from
        # character lengths to derive offsets.) Overlapping chunks can start
        # at the previous chunk's start, so the search resumes from there.
        search_from = 0
        previous_end = 0

        for chunk_text in text_chunks:
            probe = chunk_text.split("\n\n", 1)[0][:80]
            offset = text.find(probe, search_from) if probe else -1
            if offset == -1:
                # Separators may differ from the source text; fall back to
                # where the previous chunk ended.
                offset = min(previous_end, len(text))
            search_from = offset
            previous_end = offset + len(chunk_text)

            # Calculate chunk position in the book
            chunk_start_pos = start_position + offset
            chunk_end_pos = chunk_start_pos + len(chunk_text)

            # Create chunk with metadata (page numbers are estimates)
            chunks.append({
                "text": chunk_text,
                "metadata": {
                    "book_id": book_data["book_id"],
                    "book_title": book_data["book_title"],
                    "chapter_title": chapter_title,
                    "start_page": page_at(chunk_start_pos),
                    "end_page": page_at(chunk_end_pos),
                    "chunk_index": len(chunks)
                }
            })

    logger.info(f"Created {len(chunks)} chunks for {book_data['book_title']}")
    print(f"✅ Created {len(chunks)} chunks for {book_data['book_title']}")
    return chunks

def create_embeddings_batch(chunks, batch_size=50):
    """Create embeddings for text chunks in batches with progress tracking.

    Each batch is sent to the OpenAI embeddings endpoint
    (``text-embedding-3-small``) and retried up to five times. A batch that
    still fails is skipped, so the result may hold fewer items than the input.

    Args:
        chunks: List of dicts, each with at least a ``text`` key.
        batch_size: Number of chunks sent per API request.

    Returns:
        A list of shallow copies of the input chunks, each with an added
        ``embedding`` key. Returns ``[]`` if the module-level
        ``openai_client`` is not configured.
    """
    if not openai_client:
        logger.error("OpenAI client not configured")
        print("❌ OpenAI client not configured. Please check your API key.")
        return []

    all_embeddings = []
    total_chunks = len(chunks)
    max_retries = 5

    print(f"Creating embeddings for {total_chunks} chunks:")
    progress_bar = tqdm(total=total_chunks, desc="Embedding progress", unit="chunk")

    # Process in batches
    for batch_count, i in enumerate(range(0, total_chunks, batch_size), start=1):
        end_idx = min(i + batch_size, total_chunks)
        batch = chunks[i:end_idx]
        batch_size_actual = len(batch)

        logger.info(f"Processing batch {batch_count}: chunks {i+1}-{end_idx} of {total_chunks}")

        # Extract just the text for embedding
        texts = [chunk["text"] for chunk in batch]
        response = None

        for retry_count in range(1, max_retries + 1):
            try:
                response = openai_client.embeddings.create(
                    model="text-embedding-3-small",
                    input=texts
                )
                break
            except Exception as e:
                logger.error(f"Error creating embeddings for batch {batch_count} (attempt {retry_count}/{max_retries}): {e}")
                print(f"⚠️ Error in batch {batch_count}: {str(e)[:100]}... Retrying ({retry_count}/{max_retries})")

                # No attempt follows the last failure, so waiting would only
                # delay the remaining batches.
                if retry_count == max_retries:
                    break

                # Wait longer if we hit rate limits. The message check is kept
                # as a fallback alongside the typed exception.
                if isinstance(e, openai.RateLimitError) or "rate limit" in str(e).lower():
                    wait_time = 60 * retry_count  # Increase wait time with each retry
                    logger.info(f"Rate limit hit, waiting {wait_time} seconds...")
                    print(f"Rate limit hit, waiting {wait_time} seconds...")
                    time.sleep(wait_time)
                else:
                    # Other errors
                    time.sleep(5)

        if response is None:
            logger.error(f"Failed to process batch {batch_count} after {max_retries} attempts")
            print(f"❌ Failed to process batch {batch_count} after {max_retries} attempts. Continuing with next batch.")
            continue

        # Attach each embedding to a copy of its chunk (inputs stay untouched)
        for j, embedding_data in enumerate(response.data):
            chunk_with_embedding = batch[j].copy()
            chunk_with_embedding["embedding"] = embedding_data.embedding
            all_embeddings.append(chunk_with_embedding)

        progress_bar.update(batch_size_actual)
        logger.info(f"Successfully embedded batch {batch_count} ({batch_size_actual} chunks)")

        # Add delay to respect rate limits
        if end_idx < total_chunks:
            time.sleep(0.5)

    progress_bar.close()
    logger.info(f"Embedding complete: {len(all_embeddings)}/{total_chunks} chunks embedded successfully")
    print(f"Embedding complete: {len(all_embeddings)}/{total_chunks} chunks embedded successfully")

    return all_embeddings

def create_vector_database(chunks_with_embeddings, collection_name="geshe_kelsang_gyatso"):
    """Create a Chroma vector database from chunks with embeddings.

    Any existing collection with the same name is deleted first, so each run
    produces a clean collection. Items are inserted in batches of 100; a
    failing batch is logged and skipped.

    Args:
        chunks_with_embeddings: List of dicts with ``text``, ``embedding``
            and ``metadata`` (which must contain ``book_id`` and
            ``chunk_index``).
        collection_name: Name of the Chroma collection to (re)create.

    Returns:
        The Chroma collection object.

    Raises:
        Exception: Whatever ``create_collection`` raises, e.g. if an existing
            collection could not be deleted.
    """
    import chromadb

    logger.info("Creating vector database")
    print("Creating vector database...")

    # Initialize ChromaDB
    chroma_client = chromadb.PersistentClient(path=VECTORDB_DIR)

    # Remove any previous collection to ensure a clean state. Only the delete
    # is guarded: the exception raised for a missing collection differs
    # between chromadb versions, and a creation failure must reach the caller
    # instead of being retried blindly.
    try:
        chroma_client.delete_collection(name=collection_name)
    except Exception as e:
        logger.info(f"No existing collection deleted ({collection_name}): {e}")
        collection = chroma_client.create_collection(name=collection_name)
        logger.info(f"Created new collection: {collection_name}")
    else:
        logger.info(f"Deleted existing collection to recreate with new data")
        collection = chroma_client.create_collection(name=collection_name)
        logger.info(f"Recreated collection: {collection_name}")

    # Prepare data for insertion
    total_chunks = len(chunks_with_embeddings)
    logger.info(f"Preparing {total_chunks} chunks for database insertion")

    ids = [f"chunk_{chunk['metadata']['book_id']}_{chunk['metadata']['chunk_index']}" for chunk in chunks_with_embeddings]
    documents = [chunk["text"] for chunk in chunks_with_embeddings]
    embeddings = [chunk["embedding"] for chunk in chunks_with_embeddings]
    # Replace None values (e.g. a missing chapter title) with "": depending on
    # the chromadb version a None metadata value can be rejected, which would
    # fail the whole batch. Keys are kept so readers can still index them.
    metadatas = [
        {key: ("" if value is None else value) for key, value in chunk["metadata"].items()}
        for chunk in chunks_with_embeddings
    ]

    # Add to collection in batches
    batch_size = 100
    total_batches = (total_chunks + batch_size - 1) // batch_size
    failed_batches = 0

    progress_bar = tqdm(total=total_batches, desc="Database insertion", unit="batch")

    for i in range(0, total_chunks, batch_size):
        batch_num = i // batch_size + 1
        end_idx = min(i + batch_size, total_chunks)

        logger.info(f"Adding batch {batch_num}/{total_batches} to vector database ({i+1}-{end_idx} of {total_chunks} chunks)")

        try:
            collection.add(
                ids=ids[i:end_idx],
                documents=documents[i:end_idx],
                embeddings=embeddings[i:end_idx],
                metadatas=metadatas[i:end_idx]
            )
            progress_bar.update(1)

        except Exception as e:
            failed_batches += 1
            logger.error(f"Error adding batch {batch_num} to database: {e}")
            print(f"❌ Error adding batch {batch_num} to database: {str(e)[:100]}...")

    progress_bar.close()
    logger.info(f"Vector database creation complete. Collection: {collection_name}")
    if failed_batches:
        # Do not claim success when part of the data is missing.
        logger.warning(f"{failed_batches} of {total_batches} batches could not be added to the database")
        print(f"⚠️ Vector database created, but {failed_batches} of {total_batches} batches failed. See the log for details.")
    else:
        print(f"✅ Vector database creation complete!")

    return collection


 ## Main Processing Function

 The main function that processes all EPUB files

In [None]:
# MAIN PROCESSING FUNCTION
# =======================

def process_all_epubs():
    """Process all EPUB files in the directory.

    Pipeline: extract text from every ``*.epub`` in ``EPUB_DIR`` (saving each
    book as JSON in ``TEXT_DIR``), split it into chunks, embed the chunks,
    save them to ``EMBEDDINGS_DIR/all_embeddings.json`` and build the vector
    database. Failures of a single book are logged and that book is skipped.

    Returns:
        The Chroma collection on success, or a ``str`` error message when
        there is nothing to process, embedding fails, or the database cannot
        be created.
    """
    logger.info("Starting EPUB processing")

    # Get list of EPUB files; sorted so processing order is the same on every
    # run (glob order is not guaranteed).
    epub_files = sorted(glob.glob(f"{EPUB_DIR}/*.epub"))

    if not epub_files:
        msg = f"No EPUB files found in {EPUB_DIR}. Please add some EPUB files before processing."
        logger.error(msg)
        print(msg)
        return msg

    print(f"Found {len(epub_files)} EPUB files to process:")
    for epub_file in epub_files:
        print(f"  - {os.path.basename(epub_file)}")

    all_chunks = []

    # Process each EPUB file with progress bar
    for i, epub_file in enumerate(tqdm(epub_files, desc="Processing books", unit="book")):
        book_name = os.path.splitext(os.path.basename(epub_file))[0]
        logger.info(f"Processing book {i+1}/{len(epub_files)}: {book_name}")
        print(f"\nProcessing book {i+1}/{len(epub_files)}: {book_name}")

        # Extract text with metadata
        book_data = extract_text_with_metadata(epub_file)

        if not book_data:
            logger.error(f"Failed to extract text from {book_name}. Skipping.")
            print(f"❌ Failed to extract text from {book_name}. Skipping.")
            continue

        # Save extracted text (best effort: chunking continues if this fails)
        text_path = f"{TEXT_DIR}/{book_name}.json"
        try:
            with open(text_path, 'w', encoding='utf-8') as f:
                json.dump(book_data, f)
            logger.info(f"Saved extracted text to {text_path}")
            print(f"Saved extracted text to {text_path}")
        except Exception as e:
            logger.error(f"Error saving extracted text: {e}")
            print(f"⚠️ Error saving extracted text: {str(e)[:100]}...")

        # Split into chunks
        try:
            chunks = split_into_chunks_with_metadata(book_data)
            all_chunks.extend(chunks)
            logger.info(f"Created {len(chunks)} chunks from {book_name}")
            print(f"Created {len(chunks)} chunks from {book_name}")
        except Exception as e:
            logger.error(f"Error creating chunks: {e}")
            print(f"❌ Error creating chunks: {str(e)[:100]}...")

    # Without chunks the embedding step returns nothing, which would otherwise
    # be reported as an API key problem.
    if not all_chunks:
        msg = "No text chunks were produced from the EPUB files. Check the extraction errors above."
        logger.error(msg)
        print(msg)
        return msg

    # Create embeddings
    print(f"\n{'='*50}")
    logger.info(f"Starting embedding generation for {len(all_chunks)} chunks")
    print(f"Starting embedding generation for {len(all_chunks)} chunks")
    print(f"{'='*50}\n")

    chunks_with_embeddings = create_embeddings_batch(all_chunks)

    if not chunks_with_embeddings:
        logger.error("No embeddings were created. Check API key and connection.")
        return "Failed to create embeddings. Check your OpenAI API key and internet connection."

    # Save embeddings
    logger.info(f"Saving {len(chunks_with_embeddings)} embeddings")
    print(f"Saving {len(chunks_with_embeddings)} embeddings...")

    try:
        embeddings_path = f"{EMBEDDINGS_DIR}/all_embeddings.json"
        # Build the payload before opening the file, so a conversion error
        # does not leave a truncated file behind.
        serializable_chunks = []
        for chunk in chunks_with_embeddings:
            chunk_copy = chunk.copy()
            embedding = chunk_copy["embedding"]
            # Convert numpy arrays to lists for JSON serialization
            chunk_copy["embedding"] = embedding if isinstance(embedding, list) else embedding.tolist()
            serializable_chunks.append(chunk_copy)
        with open(embeddings_path, 'w', encoding='utf-8') as f:
            json.dump(serializable_chunks, f)
        logger.info(f"Saved embeddings to {embeddings_path}")
        print(f"✅ Saved embeddings to {embeddings_path}")
    except Exception as e:
        # Best effort: the vector database can still be built from memory.
        logger.error(f"Error saving embeddings: {e}")
        print(f"⚠️ Error saving embeddings: {str(e)[:100]}...")

    # Create vector database
    print(f"\n{'='*50}")
    print("Creating vector database...")
    print(f"{'='*50}\n")

    try:
        collection = create_vector_database(chunks_with_embeddings)
        logger.info("Processing complete!")
        print("\n✅ Processing complete!")
        return collection
    except Exception as e:
        logger.error(f"Error creating vector database: {e}")
        print(f"❌ Error creating vector database: {str(e)[:100]}...")
        return f"Error creating vector database: {str(e)}"


 ## Execute Processing

 Run this cell to start processing all EPUB files

In [None]:
# RUN PROCESSING
# =============

# Check if there's already a vector database
import os
import glob
import chromadb

def check_existing_database():
    """Check if a vector database already exists.

    Returns:
        A tuple ``(exists, count)``: ``(True, n)`` when the
        ``geshe_kelsang_gyatso`` collection holds ``n > 0`` entries,
        otherwise ``(False, 0)``. Any error while opening the database or
        the collection is treated as "no database".
    """
    try:
        client = chromadb.PersistentClient(path=VECTORDB_DIR)
        collection = client.get_collection(name="geshe_kelsang_gyatso")
        count = collection.count()
    except Exception:
        # Not a bare `except:` so KeyboardInterrupt/SystemExit still propagate.
        return False, 0
    if count > 0:
        return True, count
    return False, 0

has_db, count = check_existing_database()

# Ask before replacing an existing database. A flag is used instead of exit():
# in a notebook, exit() asks the kernel to shut down instead of just skipping
# the rest of the cell.
should_process = True
if has_db:
    print(f"⚠️ An existing vector database was found with {count} entries.")
    print("Processing again will replace this database.")
    proceed = input("Do you want to proceed with processing? (y/n): ")
    if proceed.strip().lower() != 'y':
        print("Processing cancelled.")
        should_process = False

result = None
if should_process:
    # Run the processing
    print("\n" + "="*80)
    print("STARTING EPUB PROCESSING")
    print("="*80 + "\n")

    result = process_all_epubs()

    print("\n" + "="*80)
    # process_all_epubs() returns an error message (str) on failure and the
    # collection on success; only announce completion in the latter case.
    if isinstance(result, str):
        print("PROCESSING DID NOT COMPLETE")
        print("="*80)
        print(f"\n❌ {result}")
    else:
        print("PROCESSING COMPLETE")
        print("="*80)
        print("\nYou can now use the Explorer interface to interact with the teachings.")