In [37]:
import os
from dotenv import load_dotenv 
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from tavily import TavilyClient

In [15]:
# Install into the running kernel's environment; pinned to the version this notebook was run with.
%pip install ebooklib==0.20

Collecting ebooklib
  Downloading ebooklib-0.20-py3-none-any.whl.metadata (6.3 kB)
Collecting lxml (from ebooklib)
  Downloading lxml-6.0.2-cp310-cp310-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl.metadata (3.6 kB)
Downloading ebooklib-0.20-py3-none-any.whl (40 kB)
Downloading lxml-6.0.2-cp310-cp310-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl (5.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.1/5.1 MB[0m [31m31.5 MB/s[0m  [33m0:00:00[0m
[?25hInstalling collected packages: lxml, ebooklib
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [ebooklib]
[1A[2KSuccessfully installed ebooklib-0.20 lxml-6.0.2
[0mNote: you may need to restart the kernel to use updated packages.


In [38]:
from ebooklib import epub 

load_dotenv(dotenv_path="/app/.env")

# os.getenv() reads from os.environ, so copying a key back onto itself is a
# no-op when the key exists and an opaque "str expected, not NoneType"
# TypeError when it does not. Check explicitly and name what is missing.
REQUIRED_ENV_VARS = ("OPENAI_API_KEY", "TAVILY_API_KEY")
missing_env_vars = [name for name in REQUIRED_ENV_VARS if not os.getenv(name)]
if missing_env_vars:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(missing_env_vars)
        + " (expected in /app/.env or the process environment)"
    )

# Initialize the LLM shared by every cell below.
llm = ChatOpenAI(
    model="gpt-5-nano"
)

In [78]:
# Shared Tavily client used by get_search(). No api_key is passed here, so it is
# presumably read from the TAVILY_API_KEY environment variable -- TODO confirm.
tavily_client = TavilyClient()

def get_search(topic: str) -> dict:
    """Run a web search for ``topic`` through the shared Tavily client.

    Args:
        topic: Free-text search query.

    Returns:
        The value returned by ``tavily_client.search`` unmodified. This is the
        Tavily response mapping (query, results, ...), not a plain string, so
        callers that need text must extract or serialize it themselves.
    """
    return tavily_client.search(topic)

def generate_response(prompt):
    """Summarize ``prompt`` with the shared chat model and return the reply text.

    Args:
        prompt: Content placed into the user message of the chat prompt.

    Returns:
        The ``content`` attribute of the model's reply.
    """
    system_text = "You are a helpful assistant that will help summarize blocks of info from a given search topic"
    template = ChatPromptTemplate.from_messages([
        ("system", system_text),
        ("user", "{prompt}"),
    ])
    # The template has to be rendered into concrete messages before the model is invoked.
    reply = llm.invoke(template.format_messages(prompt=prompt))
    return reply.content

In [51]:
# NOTE: despite its name, `search_query` holds the value returned by
# get_search(), not the query text itself.
search_query = get_search("tell me about the history of south korea")
# Pass the search result to the LLM for summarization and print its reply.
answer = generate_response(search_query)
print(answer)


Here’s a concise summary of South Korea’s history based on the sources you provided, organized from ancient foundations to the modern era.

- Ancient foundations and pre-modern era
  - Korea’s early history includes the Three Kingdoms period (Goguryeo, Baekje, and Silla) and later unification and cultural development in subsequent eras.
  - The Joseon Dynasty (1392–1910) is highlighted as a time of significant cultural, literary, philosophical, and artistic development.

- Early 20th century to mid-century (highlights)
  - The peninsula was divided after World War II, with control split between the Soviet Union in the north and the United States in the south.
  - In 1948, the Republic of Korea (South Korea) was established with Seoul as its capital.

- Postwar politics and democratization
  - South Korea pursued a path from military rule toward democracy in the late 20th century.
  - The country elected its first civilian president in more than 30 years in 1993 (Kim Young-sam), signali

In [73]:
#test file searching
from bs4 import BeautifulSoup
import json 

# Crawl a library root (one level of sub-folders) and back-fill missing EPUB descriptions.
def _metadata_values(entries):
    """Return the non-empty text values from an ebooklib metadata list."""
    values = []
    for entry in entries or []:
        # ebooklib yields (value, attributes) tuples; tolerate bare values too.
        value = entry[0] if isinstance(entry, (tuple, list)) else entry
        text = str(value).strip() if value is not None else ""
        if text:
            values.append(text)
    return values


def crawl_directories(root):
    """Add an AI-generated description to every EPUB under ``root`` that lacks one.

    Only the immediate sub-folders of ``root`` are scanned; files sitting
    directly in ``root`` and anything nested deeper are ignored.

    For each ``*.epub`` file the Dublin Core title, creator and description are
    read. When no non-empty description exists and a title is available,
    ``generate_description`` is called with the plain title string and the
    creators joined by ", " (not the raw ``[(value, attrs)]`` metadata lists),
    and the result is stored in the book's metadata and written back to disk.

    Args:
        root: Directory whose sub-folders contain the EPUB files.

    Returns:
        None. Progress and problems are reported with ``print``.

    Note:
        Any exception raised while handling a file (unreadable EPUB, failed
        description generation, failed write) is reported as ``BAD EPUB`` and
        the crawl moves on to the next file.
    """
    for folder_name in os.listdir(root):
        folder_path = os.path.join(root, folder_name)
        if not os.path.isdir(folder_path):
            continue

        for file_name in os.listdir(folder_path):
            if not file_name.lower().endswith(".epub"):
                continue
            file_path = os.path.join(folder_path, file_name)
            try:
                book = epub.read_epub(file_path)
                titles = _metadata_values(book.get_metadata("DC", "title"))
                authors = _metadata_values(book.get_metadata("DC", "creator"))
                descriptions = _metadata_values(book.get_metadata("DC", "description"))

                if not descriptions and titles:
                    # If the description is missing, have the AI fill it in.
                    desc = generate_description(titles[0], ", ".join(authors))
                    book.add_metadata("DC", "description", desc)

                    # Write to a sibling temp file and swap it in, so a failed
                    # write cannot leave a truncated EPUB in the library.
                    tmp_path = file_path + ".tmp"
                    try:
                        epub.write_epub(tmp_path, book)
                        os.replace(tmp_path, file_path)
                    finally:
                        if os.path.exists(tmp_path):
                            os.remove(tmp_path)
                elif not descriptions:
                    # Without a title the model has nothing to describe.
                    print("NO TITLE, DESCRIPTION NOT GENERATED:", file_path)

                print("creator:", book.get_metadata("DC", "creator"))
            except Exception as e:
                # Identify the corrupt EPUBs (or any other per-file failure) and keep going.
                print("BAD EPUB:", file_path, "=>", e)
                continue

# Fallback used when a book has no description of its own.
def generate_description(title, author):
    """Ask the shared chat model for a short, spoiler-light description of a book.

    Args:
        title: Book title, interpolated into the user message.
        author: Book author, interpolated into the user message.

    Returns:
        The ``content`` attribute of the model's reply.
    """
    system_text = """You are a professional summarizer well versed in all the books in the world. You are able to recall
        any information about a book. Your task will be to add in a description of what the book is about in less than 120 words.
        You should not spoil too much about the book, and only fill in the parts to let any reader understand what they are 
        about to read.

        Only return the summary and nothing else. Do not introduce yourself, and do not ask any other questions. Your job is only to print out 
        the description, and the description only. Do not mention the author or the title in your answer.
        
        Description:
        """
    template = ChatPromptTemplate([
        ("system", system_text),
        ("user", "{title}, {author}"),
    ])

    # Render the template into concrete messages, then query the model.
    reply = llm.invoke(template.format_messages(title=title, author=author))
    return reply.content

    
    

In [74]:
#testing
#generate_description("i, robot", "isaac asimov")

In [79]:
# Library locations (absolute paths as used by this notebook's runtime).
BOOK_DIR = "/epubs"
OUTPUT = "/output"
FICTION = f"{BOOK_DIR}/Fiction"
NON_FICTION = f"{BOOK_DIR}/Non-Fiction"

# Crawl both shelves, non-fiction first.
for shelf_root in (NON_FICTION, FICTION):
    crawl_directories(shelf_root)

creator: [('Abraham Lincoln', {'{http://www.idpf.org/2007/opf}role': 'aut', '{http://www.idpf.org/2007/opf}file-as': 'Lincoln, Abraham'})]
creator: [('Walter Isaacson', {'{http://www.idpf.org/2007/opf}role': 'aut', '{http://www.idpf.org/2007/opf}file-as': 'Isaacson, Walter'})]
creator: [('Vincent Cronin', {'{http://www.idpf.org/2007/opf}role': 'aut', '{http://www.idpf.org/2007/opf}file-as': ''})]
creator: [('Isaacson, Walter', {'{http://www.idpf.org/2007/opf}file-as': 'Isaacson, Walter', '{http://www.idpf.org/2007/opf}role': 'aut'})]
creator: [('Blaine Harden', {'{http://www.idpf.org/2007/opf}role': 'aut', '{http://www.idpf.org/2007/opf}file-as': 'Blaine, Harden'})]
creator: [('Joseph Kim', {'{http://www.idpf.org/2007/opf}file-as': 'Kim, Joseph', '{http://www.idpf.org/2007/opf}role': 'aut'})]
creator: [('Kathleen Dalton', {'{http://www.idpf.org/2007/opf}file-as': 'Dalton, Kathleen', '{http://www.idpf.org/2007/opf}role': 'aut'})]
creator: [('Walter Isaacson', {'id': 'id-1'})]
creator: [