In [1]:
import os
from pprint import pprint

from dotenv import load_dotenv

from google import genai
from google.genai import types
from google.api_core import retry
genai.__version__

import chromadb
#from chromadb import Documents, EmbeddingFunction, Embeddings , Client

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.schema import Document

from IPython.display import Markdown, display
from ebooklib import epub
from bs4 import BeautifulSoup
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load variables from a .env file (if one is found) into the process environment.
load_dotenv()

# The Gemini API key comes from the environment; it is None when the variable is unset.
google_api_key = os.environ.get("GOOGLE_API_KEY")
client = genai.Client(api_key=google_api_key)

In [3]:
# Print the name of every model that supports the "embedContent" action.
for m in client.models.list():
    # `supported_actions` is an optional field and may be None; treat that as
    # "no actions" instead of failing with "argument of type 'NoneType' is not iterable".
    if "embedContent" in (m.supported_actions or ()):
        print(m.name)

models/embedding-001
models/text-embedding-004
models/gemini-embedding-exp-03-07
models/gemini-embedding-exp
models/gemini-embedding-001


In [13]:
def Chapter_Book(body_soup):
    """Return the text of all ``<p class="Chapter_Book">`` elements, concatenated.

    Each paragraph's text is whitespace-stripped; paragraphs that end up empty
    are dropped. The surviving pieces are joined with no separator.

    Args:
        body_soup: Parsed element exposing ``find_all`` (e.g. a BeautifulSoup tag).

    Returns:
        str: The joined paragraph text, or ``""`` when nothing matches.
    """
    stripped_texts = (
        paragraph.get_text(strip=True)
        for paragraph in body_soup.find_all('p', class_='Chapter_Book')
    )
    return "".join(piece for piece in stripped_texts if piece)

# def Chapter_Quote(body_soup):
#     """Extract text from paragraphs with Chapter_Quote class"""
#     all_paras = body_soup.find_all('p')
#     chapter_quote_paras = []
    
#     for para in all_paras:
#         para_classes = para.get('class', [])
#         # Check if any class contains "Quote"
#         if any('Quote' in cls for cls in para_classes):
#             chapter_quote_paras.append(para)
#     texts = []
#     for para in chapter_quote_paras:
#         # print(chapter_quote_paras)
#         text = para.get_text(strip=True)
#         if text:  # Only add non-empty text
#             texts.append(text)
#             # print(f"Chapter_Quote: {text}\n")
#     return " ".join(texts)

def Subtitle(body_soup):
    """Return the text of all ``<p class="Subtitle">`` elements, concatenated.

    Each paragraph's text is whitespace-stripped; paragraphs that end up empty
    are dropped. The surviving pieces are joined with no separator.

    Args:
        body_soup: Parsed element exposing ``find_all`` (e.g. a BeautifulSoup tag).

    Returns:
        str: The joined paragraph text, or ``""`` when nothing matches.
    """
    stripped_texts = (
        paragraph.get_text(strip=True)
        for paragraph in body_soup.find_all('p', class_='Subtitle')
    )
    return "".join(piece for piece in stripped_texts if piece)

In [14]:
def epub_to_documents(epub_path):
    """Convert the content files of an EPUB into ``Document`` objects.

    Only items whose file name contains "content" (case-insensitive) are
    processed. Each one is parsed as HTML, the text of its ``<body>`` is
    extracted and whitespace-normalised, and metadata is attached:

    * ``Type='Chapter'`` plus ``Chapter_Name`` / ``Chapter_Subtitle`` when the
      body contains a ``<p class="Chapter_Book">``;
    * ``Type='Quote'`` when it contains a ``<p class="Chapter_Quote">`` instead;
    * an empty dict otherwise.

    Items without a ``<body>`` element, or whose cleaned text is empty, are
    skipped.

    Args:
        epub_path: Path of the .epub file to read.

    Returns:
        list: One ``Document`` per retained item, in ``book.get_items()`` order.
    """
    book = epub.read_epub(epub_path)
    documents = []
    for item in book.get_items():
        # NOTE(review): assumes chapter files carry "content" in their name,
        # as in the book this was written for — confirm for other EPUBs.
        if "content" not in item.get_name().lower():
            continue

        chapter_content = item.content.decode('utf-8')
        soup = BeautifulSoup(chapter_content, 'html.parser')
        body_soup = soup.body
        if body_soup is None:
            # No <body> element to read from; previously this raised
            # AttributeError on the get_text call below.
            continue

        body_text = body_soup.get_text(separator='\n', strip=True)
        metadata = {}

        if body_soup.find_all('p', class_='Chapter_Book'):
            metadata['Type'] = 'Chapter'
            metadata['Chapter_Name'] = Chapter_Book(body_soup)
            metadata['Chapter_Subtitle'] = Subtitle(body_soup)
        elif body_soup.find_all('p', class_='Chapter_Quote'):
            metadata['Type'] = 'Quote'

        # Normalise whitespace: trim each line, break on double spaces,
        # and drop fragments that end up empty.
        lines = (line.strip() for line in body_text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        cleaned_text = '\n'.join(chunk for chunk in chunks if chunk)

        if cleaned_text:
            documents.append(
                Document(
                    page_content=cleaned_text,
                    metadata=metadata
                )
            )
    return documents

In [None]:
# Deliberate stop point: execution (e.g. "Run All") halts at this cell.
# `sys` is not imported in this notebook, so `sys.exit()` failed with NameError;
# raising SystemExit directly is what `sys.exit()` does.
raise SystemExit

In [15]:
persist_directory = "My_Black_Library"

# Walk the library folder and take the first .epub encountered.
# The earlier nested loop only broke out of the inner `for`, so a later
# directory could silently overwrite `documents_from_epub`, and the code that
# followed the `break` could never run.
file_path = next(
    (
        os.path.join(root, file)
        for root, dirs, files in os.walk(persist_directory)
        for file in files
        if file.lower().endswith(".epub")
    ),
    None,
)

if file_path is None:
    # Fail here with a clear message rather than with a NameError in a later cell.
    raise FileNotFoundError(f"No .epub file found under '{persist_directory}'.")

print(f"Processing file: {file_path}")
documents_from_epub = epub_to_documents(file_path)
print(f"Processed {len(documents_from_epub)} chapters/documents from the book.")


Processing file: My_Black_Library/Leviathan-eBook-Eng-2023.epub
Processed 48 chapters/documents from the book.


  for root_file in tree.findall('//xmlns:rootfile[@media-type]', namespaces={'xmlns': NAMESPACES['CONTAINERNS']}):


In [16]:
# Split each document into pieces of at most 1000 units with 150 units of
# overlap between neighbours; each piece keeps its source document's metadata.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=150, chunk_size=1000)
chunks = text_splitter.split_documents(documents_from_epub)
print(f"Split the book into {len(chunks)} chunks with metadata.")

Split the book into 765 chunks with metadata.


In [17]:
# Record each chunk's 1-based position as "Chunk_<n>_of_<total>" in its metadata.
total_chunks = len(chunks)
for position, chunk in enumerate(chunks, start=1):
    chunk.metadata["chunk_location"] = f"Chunk_{position}_of_{total_chunks}"

In [18]:
# Progress message for the embedding/indexing stage. Note that this cell only
# constructs the embedding model; the vector store itself is built in a later cell.
print("Generating embeddings and creating ChromaDB store...")
# Sentence-transformers embedding wrapper using the "all-MiniLM-L6-v2" model.
embedding_model = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

Generating embeddings and creating ChromaDB store...


In [19]:
# Build the Chroma vector store from the chunks using `embedding_model`,
# persisting it under `persist_directory` (the same folder the .epub files are read from).
# NOTE(review): re-running this cell presumably adds the same chunks to the
# persisted collection again — confirm, and clear the folder or pass stable ids if so.
db = Chroma.from_documents(
    documents=chunks,
    embedding=embedding_model,
    persist_directory=persist_directory,
)
print(f"Vector store created and saved to '{persist_directory}'.")

Vector store created and saved to 'My_Black_Library'.


In [20]:
# Reuse the in-memory store created above. To reopen the persisted store in a
# fresh session instead, use:
# loaded_db = Chroma(
#     persist_directory=persist_directory,
#     embedding_function=embedding_model
# )
loaded_db = db

# Retrieve the 3 chunks most similar to the query.
query = "What is the main character's motivation?"
results = loaded_db.similarity_search(query=query, k=3)

In [21]:
# Show the first 250 characters of each hit, numbered from 1.
# Metadata set in earlier cells (e.g. 'Chapter_Name', 'chunk_location') is
# available through doc.metadata.get(...) if it should be printed as well.
print(f"\nTop results for query: '{query}'")
for rank, doc in enumerate(results, start=1):
    preview = doc.page_content[:250]
    print(f"\n--- Result {rank} ---")
    print(f"**Content**: {preview}...")


Top results for query: 'What is the main character's motivation?'

--- Result 1 ---
**Content**: ‘This is different. You know it is. We’re not just facing an angry mob this time.’ He rested his forehead on hers. Then he loosed her hand and walked over to the table. ‘Why in the name of the Emperor did they choose me as governor? I’ve never sought...

--- Result 2 ---
**Content**: greatest
strength? He thought back over everything he had read, from Guilliman’s masterwork, the Codex Astartes, to other military texts and obscure meditations on the vagaries of the warp. He realised that, for once, he could not easily answer. Seve...

--- Result 3 ---
**Content**: Abarim relaxed as his mind settled on the correct answer. ‘My greatest strength is reason. The power to make a choice. My body could be broken. My etheric powers could be nulled. Tactics can fail. But whatever befalls me, I will always have the power...


In [None]:
# Deliberate stop point: execution (e.g. "Run All") halts at this cell, so the
# cells below only run when executed manually.
# `sys` is not imported in this notebook, so `sys.exit()` failed with NameError;
# raising SystemExit directly is what `sys.exit()` does.
raise SystemExit

In [None]:
# Define a helper to retry when per-minute quota is reached.
def is_retriable(e):
    """Return True when ``e`` is a ``genai.errors.APIError`` with code 429 or 503."""
    if not isinstance(e, genai.errors.APIError):
        return False
    return e.code in {429, 503}


class GeminiEmbeddingFunction(chromadb.EmbeddingFunction):
    """Chroma embedding function that embeds text through the Gemini API.

    Uses the notebook-level ``client`` to call ``client.models.embed_content``.
    The task type sent with the request is ``"retrieval_document"`` when
    ``document_mode`` is true and ``"retrieval_query"`` otherwise, so set
    ``document_mode = False`` on the instance before embedding search queries.
    Calls are retried when ``is_retriable`` accepts the raised exception.
    """

    # Specify whether to generate embeddings for documents, or queries
    document_mode = True

    # Embedding model passed to the API; override on the class or an instance
    # to use a different one.
    model_name = "models/text-embedding-004"

    @retry.Retry(predicate=is_retriable)
    def __call__(self, input: chromadb.Documents) -> chromadb.Embeddings:
        """Embed ``input`` and return one vector (list of values) per entry."""
        embedding_task = "retrieval_document" if self.document_mode else "retrieval_query"

        response = client.models.embed_content(
            model=self.model_name,
            contents=input,
            config=types.EmbedContentConfig(
                task_type=embedding_task,
            ),
        )
        return [e.values for e in response.embeddings]

In [None]:
DB_NAME = "Apoorv_40k_find_reference_db"

# Index-time embeddings: use the document task type.
embed_fn = GeminiEmbeddingFunction()
embed_fn.document_mode = True

chroma_client = chromadb.Client()
# NOTE(review): this rebinds `db`, which an earlier cell assigns to the
# LangChain Chroma store; re-running that cell's dependants afterwards will
# see this collection instead.
db = chroma_client.get_or_create_collection(name=DB_NAME, embedding_function=embed_fn)

# `documents` was never defined at notebook level (NameError); index the text
# of the split chunks instead.
# NOTE(review): all texts are passed in a single add() call — confirm the
# embedding endpoint accepts that many inputs per request, and batch if not.
chunk_texts = [chunk.page_content for chunk in chunks]
db.add(documents=chunk_texts, ids=[str(i) for i in range(len(chunk_texts))])