In [18]:
import os
from dotenv import load_dotenv 
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from tavily import TavilyClient

In [19]:
from ebooklib import epub 

# Load secrets from the .env file. load_dotenv() already exports every variable it
# finds into os.environ, so the keys only need to be validated here. The previous
# `os.environ[k] = os.getenv(k)` round-trip was a no-op when the key existed and
# raised an opaque "TypeError: str expected, not NoneType" when it did not.
load_dotenv(dotenv_path="/app/.env")

REQUIRED_API_KEYS = ("OPENAI_API_KEY", "TAVILY_API_KEY", "PINECONE_API_KEY")
missing_api_keys = [key for key in REQUIRED_API_KEYS if not os.getenv(key)]
if missing_api_keys:
    raise RuntimeError(
        "Missing required environment variables: " + ", ".join(missing_api_keys)
    )

# Initialize the chat model shared by every prompt in this notebook.
llm = ChatOpenAI(
    model="gpt-5-nano"
)

In [20]:
# Shared Tavily search client used by get_search().
# NOTE(review): no api_key is passed, so the client presumably reads TAVILY_API_KEY
# from the environment — confirm against the Tavily SDK.
tavily_client = TavilyClient()

def get_search(topic: str) -> dict:
    """Run a web search for ``topic`` through the shared Tavily client.

    Args:
        topic: Free-text query forwarded unchanged to ``tavily_client.search``.

    Returns:
        The response from ``tavily_client.search``, unmodified. The Tavily SDK
        documents this as a dict, so the earlier ``-> str`` annotation was
        misleading; callers that need text must extract it themselves.
    """
    return tavily_client.search(topic)

def generate_response(prompt):
    """Send ``prompt`` to the shared ``llm`` with a summarizer system message.

    Args:
        prompt: Content to summarize; substituted into the user message.

    Returns:
        The ``content`` attribute of the model's reply.
    """
    summary_template = ChatPromptTemplate.from_messages(
        [
            ("system", "You are a helpful assistant that will help summarize blocks of info from a given search topic"),
            ("user", "{prompt}"),
        ]
    )
    # The template must be rendered into concrete messages before it can be sent.
    chat_messages = summary_template.format_messages(prompt=prompt)
    return llm.invoke(chat_messages).content

In [21]:
#search_query = get_search("tell me about the history of south korea")
#generate llm response 
#answer = generate_response(search_query)
#print(answer)


In [86]:
# Paths of the .epub files registered so far. Filled by add_to_array() and
# iterated later when the books are converted to text and chunked.
book_directories = []

In [87]:
#test file searching
from bs4 import BeautifulSoup
import json 

#trawl the non fiction directory 
def _first_metadata_value(book, name):
    """Return the first Dublin Core ``name`` value of ``book`` as stripped text, or ""."""
    entries = book.get_metadata("DC", name)
    if not entries:
        return ""
    # ebooklib hands back (value, attributes) tuples; only the value is wanted.
    value = entries[0][0]
    return value.strip() if isinstance(value, str) else ""


def crawl_directories(root):
    """Scan ``root``'s immediate sub-folders for .epub files and register them.

    For every epub that has no ``dc:description`` entry, a description is
    generated with ``generate_description`` and written back into the same
    file. Every epub that could be processed is then added to
    ``book_directories`` (once) via ``add_to_array``.

    Args:
        root: Directory whose first-level sub-folders contain the .epub files.
            Files directly inside ``root`` are ignored.

    Returns:
        None. Results are recorded in the module-level ``book_directories``.
    """
    for folder_name in os.listdir(root):
        folder_path = os.path.join(root, folder_name)
        if not os.path.isdir(folder_path):
            continue

        for file_name in os.listdir(folder_path):
            if not file_name.lower().endswith(".epub"):
                continue
            file_path = os.path.join(folder_path, file_name)
            try:
                book = epub.read_epub(file_path)
                if len(book.get_metadata("DC", "description")) == 0:
                    # Pass plain strings to the LLM. The raw metadata lists used
                    # previously rendered as "[('Title', {})]" inside the prompt.
                    title = _first_metadata_value(book, "title")
                    author = _first_metadata_value(book, "creator")
                    if title:
                        desc = generate_description(title, author)
                        book.add_metadata("DC", "description", desc)
                        # Overwrites the original file in place.
                        epub.write_epub(file_path, book)
                    else:
                        # Without a title the model could only invent a description.
                        print("NO TITLE, description not generated:", file_path)

                if file_path not in book_directories:
                    add_to_array(file_path)
            except Exception as e:
                # Best effort: report unreadable/corrupt epubs and keep scanning.
                print("BAD EPUB:", file_path, "=>", e)
                continue

def add_to_array(file_path):
    """Append ``file_path`` to the shared ``book_directories`` list and log it."""
    book_directories.append(file_path)
    print(f"{file_path} appended")

#if the description is missing, fill it out
def generate_description(title, author):
    """Ask the shared ``llm`` for a short, spoiler-light description of a book.

    Args:
        title: Book title; substituted into the user message.
        author: Book author; substituted into the user message.

    Returns:
        The ``content`` attribute of the model's reply (the description text).
    """
    # from_messages is the construction used by the other prompts in this
    # notebook; the prompt text itself is unchanged.
    description_prompt = ChatPromptTemplate.from_messages([
        ("system", """You are a professional summarizer well versed in all the books in the world. You are able to recall
        any information about a book. Your task will be to add in a description of what the book is about in less than 120 words.
        You should not spoil too much about the book, and only fill in the parts to let any reader understand what they are 
        about to read.

        Only return the summary and nothing else. Do not introduce yourself, and do not ask any other questions. Your job is only to print out 
        the description, and the description only. Do not mention the author or the title in your answer.
        
        Description:
        """),
        ("user", "{title}, {author}")
    ])

    formatted_prompt = description_prompt.format_messages(title=title, author=author)
    return llm.invoke(formatted_prompt).content

    
    

In [88]:
#testing
#generate_description("i, robot", "isaac asimov")

In [89]:
# Root folder of the ebook library and the folder for generated output.
BOOK_DIR = "/ebooks"
OUTPUT = "/output"
# Genre sub-trees handed to crawl_directories() / read_books().
FICTION = f"{BOOK_DIR}/Fiction"
NON_FICTION = f"{BOOK_DIR}/Non-Fiction"




In [90]:
#TRAWL DAT 
#crawl_directories(NON_FICTION)
#crawl_directories(FICTION)

In [91]:
#trawl the directory 
def read_books(root):
    """Register every readable .epub found one level below ``root``.

    Unlike ``crawl_directories`` this never modifies the files: each epub is
    only opened with ebooklib to confirm it can be parsed, then its path is
    added to ``book_directories`` (once) via ``add_to_array``.

    Args:
        root: Directory whose first-level sub-folders contain the .epub files.
            Files directly inside ``root`` are ignored.

    Returns:
        None. Results are recorded in the module-level ``book_directories``.
    """
    for folder_name in os.listdir(root):
        folder_path = os.path.join(root, folder_name)
        if not os.path.isdir(folder_path):
            continue

        for file_name in os.listdir(folder_path):
            if not file_name.lower().endswith(".epub"):
                continue
            file_path = os.path.join(folder_path, file_name)
            try:
                # Parsing is the validity check. The title/creator/description
                # lookups that used to follow were never used and were dropped.
                epub.read_epub(file_path)

                if file_path not in book_directories:
                    add_to_array(file_path)
            except Exception as e:
                # Best effort: report unreadable/corrupt epubs and keep scanning.
                print("BAD EPUB:", file_path, "=>", e)
                continue

    

In [92]:
# Register the Non-Fiction epubs in book_directories; the Fiction pass is commented out.
read_books(NON_FICTION)
#read_books(FICTION)

/ebooks/Non-Fiction/Biography/Abraham Lincoln - The Life and Writings of Abraham Lincoln.epub appended
/ebooks/Non-Fiction/Biography/Walter Isaacson - Benjamin Franklin An American Life.epub appended
/ebooks/Non-Fiction/Biography/Vincent Cronin - Napoleon Bonaparte- Una biografia intima.epub appended
/ebooks/Non-Fiction/Biography/Walter Isaacson - Steve Jobs The Authorized Biography.epub appended
/ebooks/Non-Fiction/Biography/Blaine Harden - Escape from Camp 14 One Man's Remarkable Odyssey from North Korea to Freedom in the West.epub appended
/ebooks/Non-Fiction/Biography/Joseph Kim - Under the Same Sky- From Starvation in North Korea to Salvation in America.epub appended
/ebooks/Non-Fiction/Biography/Kathleen Dalton - Theodore Roosevelt A Strenuous Life.epub appended
/ebooks/Non-Fiction/Biography/Walter Isaacson - Elon Musk.epub appended
/ebooks/Non-Fiction/Biography/Benjamin Franklin - The Autobiography of Benjamin F..epub appended
/ebooks/Non-Fiction/Biography/William Manchester 

In [96]:
#prepare for encoding and chunking
import html2text

#convert all epubs to text, add to array 

def _make_text_client():
    """Build an html2text converter configured to drop links and images."""
    text_client = html2text.HTML2Text()
    text_client.ignore_links = True
    text_client.ignore_images = True
    text_client.skip_internal_links = True
    return text_client


def epub_to_text(path: str) -> str:
    """Convert the HTML documents of an epub into one Markdown-style string.

    Args:
        path: Location of the .epub file to read.

    Returns:
        The converted text of every ``epub.EpubHtml`` item, in the order
        ``book.get_items()`` yields them, joined by blank lines. An epub with
        no such items yields an empty string.
    """
    book = epub.read_epub(path)
    texts = []

    for item in book.get_items():
        if not isinstance(item, epub.EpubHtml):
            continue
        # 1. Raw HTML of the document; undecodable bytes are dropped.
        html = item.get_content().decode("utf-8", errors="ignore")
        # 2. Convert to Markdown with a fresh converter per document. HTML2Text
        #    is a stateful parser, so reusing one instance risks malformed
        #    markup in one chapter (e.g. an unclosed tag) affecting the next.
        texts.append(_make_text_client().handle(html))

    return "\n\n".join(texts)



In [97]:
# Full text of every registered book, in the same order as book_directories.
all_texts = [epub_to_text(book_path) for book_path in book_directories]

In [98]:
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_pinecone import PineconeVectorStore

In [99]:
# Splitter used for chunking: chunk_size 2000 with an overlap of 200 between neighbours.
splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)

In [100]:
# Split every book into chunks and remember which file each chunk came from.
all_chunks = []

for book_path in book_directories:
    book_text = epub_to_text(book_path)
    # Each record pairs the chunk text with its source path as metadata.
    all_chunks.extend(
        {"text": piece, "source": book_path}
        for piece in splitter.split_text(book_text)
    )

In [72]:
#chroma 
import chromadb
from langchain_community.vectorstores import Chroma

# Embedding function handed to the Chroma collection below.
embeddings = OpenAIEmbeddings()

#create chroma
# NOTE(review): persist_directory points at "/outputs/chroma_db", while the OUTPUT
# constant defined earlier in this notebook is "/output" — confirm which is intended.
vectorstore = Chroma(
    collection_name="ebook_texts", 
    embedding_function=embeddings,
    persist_directory="/outputs/chroma_db"
)


In [None]:
# Inserts every chunk in a single add_texts call.
# NOTE(review): the batching cell further down says the full list is too big for one
# call and inserts the same chunks again in batches — run only one of the two,
# otherwise the collection receives the chunks twice.
vectorstore.add_texts(
    texts=[c["text"] for c in all_chunks],
    metadatas=[{"source": c["source"]} for c in all_chunks]
)

In [None]:
# Sanity check: print the length of each chunk, then the number of chunks.
texts = [entry["text"] for entry in all_chunks]
for chunk_text in texts:
    print(len(chunk_text))
print(len(texts))

In [None]:
#the full list of chunks is too large to send in a single add_texts call, so it must be sent in batches

#batching! 

BATCH_SIZE = 100


def batch_then_store(chunks=None, batch_size=None, store=None):
    """Add chunk records to a vector store in fixed-size batches.

    Called with no arguments it behaves as before: it sends the module-level
    ``all_chunks`` to the module-level ``vectorstore`` in ``BATCH_SIZE`` slices.

    Args:
        chunks: Sequence of ``{"text": ..., "source": ...}`` records.
            Defaults to ``all_chunks``.
        batch_size: Maximum number of records per ``add_texts`` call.
            Defaults to ``BATCH_SIZE``.
        store: Object exposing ``add_texts(texts=..., metadatas=...)``.
            Defaults to ``vectorstore``.

    Raises:
        ValueError: If ``batch_size`` is not a positive integer.
    """
    # Defaults are resolved at call time so that re-running the cells that
    # rebuild all_chunks / vectorstore / BATCH_SIZE is picked up here.
    if chunks is None:
        chunks = all_chunks
    if batch_size is None:
        batch_size = BATCH_SIZE
    if store is None:
        store = vectorstore
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    for start in range(0, len(chunks), batch_size):
        batch = chunks[start:start + batch_size]
        store.add_texts(
            texts=[record["text"] for record in batch],
            metadatas=[{"source": record["source"]} for record in batch],
        )

In [None]:
# Send all_chunks to the vector store in BATCH_SIZE-sized add_texts calls.
print("batching and adding to vectorstore")
batch_then_store()

In [None]:
###create vector store (using pinecone)
# Disabled alternative: everything between the triple quotes below is a string
# literal, so none of it is executed.
# NOTE(review): the snippet references `index_name`, which is not defined anywhere
# in the visible notebook — define it before re-enabling this code.
"""
vectorstore = PineconeVectorStore.from_existing_index(
    embedding=embeddings,
    index_name=index_name
)



#dis too large, batch it
batch_size = 100
def batch_list(data, batch_size):
    for i in range(0, len(data), batch_size):
        yield data[i:i+batch_size]

for batch in batch_list(all_chunks, batch_size):
    texts = [c["text"] for c in batch]
    metadata = [{"source": os.path.basename(c["source"])} for c in batch]
    vectorstore.add_texts(
     texts=texts,
     metadatas=metadata
    )

"""


In [106]:
# Retriever over the vector store using the "mmr" search type:
# fetch_k=50 candidates are requested and k=6 documents are returned.
mmr_search_kwargs = {"k": 6, "fetch_k": 50}
retriever = vectorstore.as_retriever(search_type="mmr", search_kwargs=mmr_search_kwargs)

In [105]:
from operator import itemgetter

from langchain_core.messages import HumanMessage, SystemMessage
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain_openai import ChatOpenAI

# System instructions for the RAG chain. "\\n" is an escaped backslash, so the
# model sees the two characters "\n" in the sentence about literal line breaks.
system_prompt = """
You are a strict RAG model. 
Only answer using the provided context.
Do not show literal \\n characters — output real line breaks.
If the answer is not contained in the context, respond with:
"I don't know — no supporting text found."
"""

# Reuse system_prompt instead of repeating the same text inline; previously the
# variable was defined but never used, so the two copies could drift apart.
prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", """
    Question:
    {query}
    Context:
    {context}
    """)
])

def format_docs(docs):
    """Render retrieved documents as a single context string.

    Args:
        docs: Iterable of objects exposing ``metadata`` and ``page_content``.

    Returns:
        One "Source: ...\\nContent: ..." entry per document, separated by blank
        lines; an empty string when ``docs`` is empty.
    """
    rendered = []
    for doc in docs:
        rendered.append(f"Source: {doc.metadata}\nContent: {doc.page_content}")
    return "\n\n".join(rendered)

# Inputs for the prompt: the raw question is passed through as `query`, and the
# same question is sent to the retriever whose documents are formatted as `context`.
rag_inputs = {
    "query": RunnablePassthrough(),
    "context": retriever | RunnableLambda(format_docs),
}

# inputs -> prompt -> chat model -> plain string
chain = rag_inputs | prompt | llm | StrOutputParser()


resp = chain.invoke("what is a good book to read if i like technology and comedy?")
print(resp)

Cryptonomicon by Neal Stephenson. It’s described as hilarious, tech-forward, with witty dialogue and engaging action.


In [None]:
The runnable chain above is roughly equivalent to the following steps (pseudocode):

query = user_input
docs = retriever(user_input)
context = format_docs(docs)
prompt_text = prompt.format(query=query, context=context)
resp = llm(prompt_text)
print(resp)