In [2]:
import os
from dotenv import load_dotenv 
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from tavily import TavilyClient

In [3]:
from ebooklib import epub 

# Load secrets from the container's .env file into the process environment.
load_dotenv(dotenv_path="/app/.env")

# Fail early with a readable message when a key is absent.
# The previous `os.environ[k] = os.getenv(k)` pattern was a no-op when the key
# was set and raised "TypeError: str expected, not NoneType" when it was not.
_REQUIRED_ENV_KEYS = ("OPENAI_API_KEY", "TAVILY_API_KEY", "PINECONE_API_KEY")
_missing_env_keys = [key for key in _REQUIRED_ENV_KEYS if os.getenv(key) is None]
if _missing_env_keys:
    raise RuntimeError(
        "Missing required environment variables: " + ", ".join(_missing_env_keys)
    )

# initialize LLM
llm = ChatOpenAI(
    model="gpt-5-nano"
)

In [4]:
# Shared Tavily client used by get_search below. No api_key argument is passed;
# presumably the client picks up TAVILY_API_KEY from the environment — confirm.
tavily_client = TavilyClient()

def get_search(topic: str) -> str:
    """Run a web search for ``topic`` through the module-level Tavily client.

    The value from ``tavily_client.search`` is handed back untouched.
    NOTE(review): the signature says ``str``, but nothing here converts the
    response to a string — confirm the actual type returned by the client.
    """
    return tavily_client.search(topic)

def generate_response(prompt):
    """Ask the LLM to summarize ``prompt`` and return the reply text.

    ``prompt`` is substituted into the user message of a two-message chat
    template (system + user); the ``content`` of the model's reply is returned.
    """
    system_text = "You are a helpful assistant that will help summarize blocks of info from a given search topic"
    template = ChatPromptTemplate.from_messages([
        ("system", system_text),
        ("user", "{prompt}"),
    ])
    # The template has to be rendered into concrete messages before invoking the model.
    messages = template.format_messages(prompt=prompt)
    return llm.invoke(messages).content

In [5]:
# Smoke test of the search -> summarize pipeline.
# Note: despite its name, search_query holds the value returned by get_search
# (the search response), not the query text itself.
search_query = get_search("tell me about the history of south korea")
# Summarize the search response with the LLM and show the result.
answer = generate_response(search_query)
print(answer)


KeyboardInterrupt: 

In [None]:
# Accumulates the paths of the EPUB files found by crawl_directories
# (entries are appended through add_to_array).
book_directories = []

In [None]:
#test file searching
from bs4 import BeautifulSoup
import json 

#crawl a library root (e.g. the non-fiction directory) for EPUB files
def _first_metadata_value(entries):
    """Return the text of the first non-empty metadata entry, or "" if none.

    ebooklib's ``get_metadata`` returns a list of ``(value, attributes)`` tuples;
    this unwraps the value so callers get a plain string.
    """
    for entry in entries or []:
        value = entry[0] if isinstance(entry, (tuple, list)) else entry
        if value:
            return str(value).strip()
    return ""


def crawl_directories(root):
    """Scan ``root``'s immediate sub-folders for EPUB files and register them.

    For every ``*.epub`` file (case-insensitive) found one level below ``root``:

    * the book is opened with ebooklib;
    * if it has no DC ``description`` entry, one is generated with
      ``generate_description`` and the EPUB is rewritten IN PLACE;
    * its path is appended to ``book_directories`` (once) via ``add_to_array``.

    Files that fail to load or save are reported with a "BAD EPUB" line and
    skipped. Entries directly under ``root`` that are not directories are ignored.
    """
    for folder_name in os.listdir(root):
        folder_path = os.path.join(root, folder_name)
        if not os.path.isdir(folder_path):
            continue

        for file_name in os.listdir(folder_path):
            if not file_name.lower().endswith(".epub"):
                continue
            file_path = os.path.join(folder_path, file_name)
            try:
                book = epub.read_epub(file_path)
                description = book.get_metadata("DC", "description")
                if len(description) == 0:
                    # Pass plain strings to the prompt, not the raw
                    # [(value, attrs), ...] lists, which would be rendered
                    # literally into the LLM request.
                    title = _first_metadata_value(book.get_metadata("DC", "title"))
                    author = _first_metadata_value(book.get_metadata("DC", "creator"))
                    if not title:
                        # No usable title in the metadata: fall back to the file name.
                        title = os.path.splitext(file_name)[0]
                    desc = generate_description(title, author)
                    book.add_metadata("DC", "description", desc)
                    # Overwrites the original file with the enriched metadata.
                    epub.write_epub(file_path, book)

                if file_path not in book_directories:
                    add_to_array(file_path)
            except Exception as e:
                #identify the corrupt epubs
                print("BAD EPUB:", file_path, "=>", e)
                continue

def add_to_array(file_path):
    """Record ``file_path`` in the global ``book_directories`` list and log it.

    Prints ``<file_path> appended`` after the append; returns ``None``.
    """
    book_directories.extend([file_path])
    print(file_path, "appended")

#if the description is missing, fill it out
def generate_description(title, author):
    """Ask the LLM for a short description of a book.

    ``title`` and ``author`` are substituted into the user message as
    ``"{title}, {author}"``; the system message instructs the model to return
    only a description. The ``content`` of the model's reply is returned.
    """
    system_text = """You are a professional summarizer well versed in all the books in the world. You are able to recall
        any information about a book. Your task will be to add in a description of what the book is about in less than 120 words.
        You should not spoil too much about the book, and only fill in the parts to let any reader understand what they are 
        about to read.

        Only return the summary and nothing else. Do not introduce yourself, and do not ask any other questions. Your job is only to print out 
        the description, and the description only. Do not mention the author or the title in your answer.
        
        Description:
        """
    template = ChatPromptTemplate([
        ("system", system_text),
        ("user", "{title}, {author}"),
    ])

    messages = template.format_messages(title=title, author=author)
    return llm.invoke(messages).content

    
    

In [None]:
#testing
#generate_description("i, robot", "isaac asimov")

In [None]:
# Library layout: fiction and non-fiction live in sibling folders under BOOK_DIR.
BOOK_DIR = "/ebooks"
OUTPUT= "/output"
FICTION = f"{BOOK_DIR}/Fiction"
NON_FICTION = f"{BOOK_DIR}/Non-Fiction"

# Crawl both library roots, non-fiction first.
for library_root in (NON_FICTION, FICTION):
    crawl_directories(library_root)


In [12]:
# Dump every collected EPUB path to verify the crawl (the output can be very long).
print(book_directories)

['/ebooks/Non-Fiction/Biography/Abraham Lincoln - The Life and Writings of Abraham Lincoln.epub', '/ebooks/Non-Fiction/Biography/Walter Isaacson - Benjamin Franklin An American Life.epub', '/ebooks/Non-Fiction/Biography/Vincent Cronin - Napoleon Bonaparte- Una biografia intima.epub', '/ebooks/Non-Fiction/Biography/Walter Isaacson - Steve Jobs\uf03a The Authorized Biography.epub', "/ebooks/Non-Fiction/Biography/Blaine Harden - Escape from Camp 14\uf03a One Man's Remarkable Odyssey from North Korea to Freedom in the West.epub", '/ebooks/Non-Fiction/Biography/Joseph Kim - Under the Same Sky- From Starvation in North Korea to Salvation in America.epub', '/ebooks/Non-Fiction/Biography/Kathleen Dalton - Theodore Roosevelt\uf03a A Strenuous Life.epub', '/ebooks/Non-Fiction/Biography/Walter Isaacson - Elon Musk.epub', '/ebooks/Non-Fiction/Biography/Benjamin Franklin - The Autobiography of Benjamin F..epub', '/ebooks/Non-Fiction/Biography/William Manchester - The Last Lion- Winston Spencer Chur

In [13]:
#prepare for encoding and chunking
import html2text

#convert all epubs to text, add to array 

def epub_to_text(path: str) -> str:
    """Extract the readable text of an EPUB as a single string.

    Every ``epub.EpubHtml`` item of the book is decoded as UTF-8 (undecodable
    bytes are dropped), converted from markup to plain text, and the per-item
    texts are joined with a newline.

    Markup handling:
      * ``<script>`` / ``<style>`` elements are removed together with their bodies;
      * ``<br>`` in any spelling (``<br/>``, ``<br />``, ``<BR>``) becomes ``\\n``;
      * ``</p>`` becomes a blank line;
      * all remaining tags are stripped, including tags that span several lines;
      * HTML entities (``&amp;``, ``&#8217;``, ...) are decoded.

    Args:
        path: Location of the EPUB file.

    Returns:
        The concatenated plain text of the book.
    """
    # Local imports keep this cell self-contained; `html_lib` avoids clashing
    # with local names that refer to markup.
    import html as html_lib
    import re

    book = epub.read_epub(path)
    texts = []

    for item in book.get_items():
        if not isinstance(item, epub.EpubHtml):
            continue

        markup = item.get_content().decode("utf-8", errors="ignore")

        # CSS / JS bodies are not prose; drop them before stripping tags.
        markup = re.sub(r"(?is)<(script|style)\b.*?</\1\s*>", "", markup)
        markup = re.sub(r"(?i)<br\s*/?>", "\n", markup)
        markup = re.sub(r"(?i)</p\s*>", "\n\n", markup)
        # [^>] also matches newlines, so multi-line tags are removed too.
        markup = re.sub(r"<[^>]*>", "", markup)

        # Decode entities last so escaped "<"/">" in the text are not treated as tags.
        texts.append(html_lib.unescape(markup))

    return "\n".join(texts)



In [14]:
# Plain text of every collected book, in the same order as book_directories.
all_texts = [epub_to_text(book_path) for book_path in book_directories]

In [15]:
#save texts to separate file
# NOTE(review): the OUTPUT constant is defined as "/output" but this cell writes
# to "/outputs" — confirm which directory is intended.
# Make sure the target directory exists so the first open() does not fail.
os.makedirs("/outputs", exist_ok=True)
for i, text in enumerate(all_texts):
    # Files are numbered from 1 in the order of all_texts.
    with open(f"/outputs/book_{i+1}.txt", "w", encoding="utf-8") as f:
        f.write(text)


In [18]:
pip install langchain_core langchain_pinecone

[0mNote: you may need to restart the kernel to use updated packages.


In [23]:
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_pinecone import PineconeVectorStore

In [24]:
# Splitter used for chunking book text (chunk_size=2000, chunk_overlap=200).
splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)

# Pinecone index name, taken from the PINECONE_INDEX_LIBRARIAN environment variable.
index_name = os.getenv("PINECONE_INDEX_LIBRARIAN")

In [32]:
# Split every book into chunks, keeping the originating file path with each chunk.
all_chunks = []

for path in book_directories:
    book_text = epub_to_text(path)
    # One record per chunk: the chunk text plus the path of the book it came from.
    all_chunks.extend(
        {"text": piece, "source": path}
        for piece in splitter.split_text(book_text)
    )

In [49]:

# Embedding model: must be created BEFORE the vector store that receives it,
# otherwise this cell raises NameError on a fresh kernel.
embeddings = OpenAIEmbeddings()

# Connect to the existing Pinecone index (no index is created here).
vectorstore = PineconeVectorStore.from_existing_index(
    embedding=embeddings,
    index_name=index_name
)

# The full chunk list is too large for a single request, so it is uploaded in batches.
batch_size = 100
def batch_list(data, batch_size):
    """Yield consecutive slices of ``data`` holding at most ``batch_size`` items.

    ``data`` must support ``len()`` and slicing; each yielded batch is a slice
    of it, and the final batch may be shorter than ``batch_size``.
    """
    starts = range(0, len(data), batch_size)
    yield from (data[start:start + batch_size] for start in starts)




In [62]:
import hashlib

# Upload the chunks batch by batch.
for batch in batch_list(all_chunks, batch_size):
    texts = [c["text"] for c in batch]
    sources = [os.path.basename(c["source"]) for c in batch]
    metadata = [{"source": source} for source in sources]
    # Deterministic, content-derived IDs: re-running this cell after a partial
    # failure (e.g. a 429 quota error) overwrites the chunks already stored
    # instead of inserting duplicates under freshly generated random IDs.
    ids = [
        hashlib.sha256(f"{source}\x00{text}".encode("utf-8")).hexdigest()
        for source, text in zip(sources, texts)
    ]
    vectorstore.add_texts(
     texts=texts,
     metadatas=metadata,
     ids=ids
    )


   

PineconeApiException: (429)
Reason: Too Many Requests
HTTP response headers: HTTPHeaderDict({'Date': 'Wed, 26 Nov 2025 05:27:03 GMT', 'Content-Type': 'application/json', 'Content-Length': '166', 'Connection': 'keep-alive', 'x-pinecone-request-latency-ms': '11', 'x-pinecone-request-id': '5080351926650051980', 'x-envoy-upstream-service-time': '3', 'server': 'envoy'})
HTTP response body: {"code":8,"message":"Request failed. You've reached your write unit limit for the current month (2000000). To continue writing data, upgrade your plan.","details":[]}
