### Notebook Setup

In [None]:
import pandas as pd
import numpy as np
from openai import OpenAI
import os
import requests
from dotenv import load_dotenv

### Check API Auth to Readwise

In [1]:
# Load credentials from .env and build the auth header used for the Readwise check below.
load_dotenv()
token = os.getenv("READWISE_API_TOKEN")
headers = {"Authorization": f"Token {token}"}

if not token:
    # Without this guard the request is sent with the literal header "Token None"
    # and the failure is misreported as an invalid/expired token.
    print("❌ READWISE_API_TOKEN not found. Check your .env file.")
else:
    # A timeout keeps the cell from hanging indefinitely if the service is unreachable.
    response = requests.get("https://readwise.io/api/v2/auth/", headers=headers, timeout=30)
    print("Status code:", response.status_code)

    # The check treats HTTP 204 as "token accepted"; any other status is reported as a failure.
    if response.status_code == 204:
        print("✅ Token is valid! You’re authenticated with Readwise.")
    else:
        print("❌ Token invalid or expired. Check your .env file or regenerate it.")

Status code: 204
✅ Token is valid! You’re authenticated with Readwise.


In [14]:
# Show the first record of the `results` list in the parsed JSON payload.
# NOTE(review): `data` is not assigned in any cell shown above — presumably it is the parsed
# response of a Readwise books request from a cell that was removed; confirm, otherwise this
# cell fails on a fresh kernel.
data['results'][0]

{'id': 55262007,
 'title': 'The Miracle of Mindfulness',
 'author': 'Thich Nhat Hanh, Vo-Dihn Mai, and Mobi Ho',
 'category': 'books',
 'source': 'kindle',
 'num_highlights': 30,
 'last_highlight_at': '2025-10-24T04:56:00.000000Z',
 'updated': '2025-10-24T04:02:41.043968Z',
 'cover_image_url': 'https://images-na.ssl-images-amazon.com/images/I/41v%2B00gXxyL._SL200_.jpg',
 'highlights_url': 'https://readwise.io/bookreview/55262007',
 'source_url': None,
 'asin': 'B009U9S6VM',
 'tags': [],
 'document_note': ''}

In [16]:
import pandas as pd

# Tabulate the book records (one row per entry in data["results"]) and
# print only the identifying columns.
df_books = pd.DataFrame(data["results"])
print(df_books[["id", "title", "author", "num_highlights"]])

         id                       title  \
0  55262007  The Miracle of Mindfulness   
1  55650413                  Click Here   
2  55152119                 Mindfulness   
3  55400541          Conscious Business   

                                      author  num_highlights  
0  Thich Nhat Hanh, Vo-Dihn Mai, and Mobi Ho              30  
1                               Alex Schultz              62  
2                           Joseph Goldstein              28  
3                                Fred Kofman               4  


### Check OpenAI API Key and Connection

In [14]:
##############################################################
### check openai api key
##############################################################

# Load OPENAI_API_KEY from .env and sanity-check it before any API call is made.
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")

if not api_key:
    print("OPENAI_API_KEY not found in environment variables")
    print("Make sure you have a .env file with OPENAI_API_KEY=your_key_here")
    # Raise instead of calling exit(): exit() would tear down the notebook kernel session,
    # whereas an exception stops this cell (and a Run All) with a readable error.
    raise RuntimeError("OPENAI_API_KEY is not set")

print("OPENAI_API_KEY found and loaded successfully")
# Heuristic format check only: it does not prove the key is accepted by the API.
if api_key.startswith("sk-") and len(api_key) > 40:
    print("API key format appears valid")
else:
    print("API key format may be incorrect (should start with 'sk-')")

OPENAI_API_KEY found and loaded successfully
API key format appears valid


In [16]:
##############################################################
### check openai api connection
##############################################################

# Build a client from the key in the environment and send one tiny request
# to confirm the key, network path and model name all work together.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

try:
    response = client.responses.create(
        model="gpt-5-mini",
        input="Hello! Respond hi if you're working."
    )
    print("✅ API connection successful!")
    print("Response:", response.output_text)

# Broad catch is deliberate here: this cell is a diagnostic, so any failure
# is reported as text instead of stopping the notebook.
except Exception as e:
    print("❌ API connection failed.")
    print("Error:", e)

✅ API connection successful!
Response: hi


### OpenAI Input Prompt Cost Estimation

In [None]:
import tiktoken

def num_tokens_in_prompt(prompt: str, model: str) -> tuple[str, int]:
    """
    Count the tokens `prompt` encodes to for `model`.

    Args:
        prompt: Text to tokenize.
        model: Model name used to select the tiktoken encoding.

    Returns:
        tuple[str, int]: The model name as passed in, and the number of tokens.
    """
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        # tiktoken raises KeyError for model names it has no mapping for (e.g. models newer
        # than the installed tiktoken). Fall back to o200k_base so the estimate still works;
        # the count is then an approximation for that model.
        encoding = tiktoken.get_encoding("o200k_base")
    return model, len(encoding.encode(prompt))

# Sample prompt; also reused by the cost-estimate example further down.
example_prompt = "What is the capital of Texas?"

# Example usage: the cell output is the value returned by num_tokens_in_prompt.
num_tokens_in_prompt(example_prompt, "gpt-5")


('gpt-5', 7)

In [9]:
from io import StringIO

# Pricing was copied by hand because scraping the pricing page did not work; manual for now.
# Source: https://platform.openai.com/docs/pricing#text-tokens
# NOTE(review): prices look like USD per 1M tokens (the parsing cell divides by 1e6) —
# confirm against the pricing page when refreshing this table.

# Save the table as a markdown file; this overwrites gpt_model_pricing.md on every run.
with open("gpt_model_pricing.md", "w") as f:
    f.write("""
|Model|Input|Cached input|Output|
|---|---|---|---|
|gpt-5|$1.25|$0.125|$10.00|
|gpt-5-mini|$0.25|$0.025|$2.00|
|gpt-5-nano|$0.05|$0.005|$0.40|
|gpt-5-chat-latest|$1.25|$0.125|$10.00|
|gpt-5-codex|$1.25|$0.125|$10.00|
|gpt-5-pro|$15.00|-|$120.00|
|gpt-4.1|$2.00|$0.50|$8.00|
|gpt-4.1-mini|$0.40|$0.10|$1.60|
|gpt-4.1-nano|$0.10|$0.025|$0.40|
|gpt-4o|$2.50|$1.25|$10.00|
|gpt-4o-2024-05-13|$5.00|-|$15.00|
|gpt-4o-mini|$0.15|$0.075|$0.60|
|gpt-realtime|$4.00|$0.40|$16.00|
|gpt-realtime-mini|$0.60|$0.06|$2.40|
|gpt-4o-realtime-preview|$5.00|$2.50|$20.00|
|gpt-4o-mini-realtime-preview|$0.60|$0.30|$2.40|
|gpt-audio|$2.50|-|$10.00|
|gpt-audio-mini|$0.60|-|$2.40|
|gpt-4o-audio-preview|$2.50|-|$10.00|
|gpt-4o-mini-audio-preview|$0.15|-|$0.60|
|o1|$15.00|$7.50|$60.00|
|o1-pro|$150.00|-|$600.00|
|o3-pro|$20.00|-|$80.00|
|o3|$2.00|$0.50|$8.00|
|o3-deep-research|$10.00|$2.50|$40.00|
|o4-mini|$1.10|$0.275|$4.40|
|o4-mini-deep-research|$2.00|$0.50|$8.00|
|o3-mini|$1.10|$0.55|$4.40|
|o1-mini|$1.10|$0.55|$4.40|
|codex-mini-latest|$1.50|$0.375|$6.00|
|gpt-5-search-api|$1.25|$0.125|$10.00|
|gpt-4o-mini-search-preview|$0.15|-|$0.60|
|gpt-4o-search-preview|$2.50|-|$10.00|
|computer-use-preview|$3.00|-|$12.00|
|gpt-image-1|$5.00|$1.25|-|
|gpt-image-1-mini|$2.00|$0.20|-|
""")

# Convert the markdown pricing table into a DataFrame of per-token costs.
# Only lines that start with '|' are table rows; anything else in the file is ignored.
with open("gpt_model_pricing.md", "r") as f:
    table_lines = [line for line in f if line.strip().startswith("|")]

# The leading/trailing pipes on each row yield empty edge columns, which pandas
# names "Unnamed: N"; drop those after parsing.
table_str = "".join(table_lines)
per_token_cost_df = pd.read_csv(
    StringIO(table_str),
    sep="|",
    engine="python",
    skipinitialspace=True
).loc[:, lambda d: ~d.columns.str.contains('^Unnamed')]

# Strip stray whitespace from column names and from every string cell.
per_token_cost_df.columns = per_token_cost_df.columns.str.strip()
str_cols = per_token_cost_df.select_dtypes(include="object").columns
per_token_cost_df[str_cols] = per_token_cost_df[str_cols].map(lambda x: x.strip() if isinstance(x, str) else x)

# Drop the markdown header separator row ("---"). The .copy() makes the result an
# independent frame so the column assignment below does not write into a filtered view.
per_token_cost_df = per_token_cost_df[per_token_cost_df['Model'] != '---'].copy()

cols = ["Input", "Cached input", "Output"]
per_token_cost_df[cols] = (
    per_token_cost_df[cols]
    .replace(r'[\$,]', '', regex=True)    # raw string: remove $ and commas
    .replace('-', '0', regex=False)       # optional: treat '-' as 0; or skip this line to get NaN
    .apply(pd.to_numeric, errors='coerce')  # safely convert to float
)

# Frame as cost per token: divide the table values by 1M.
per_token_cost_df[cols] /= 1e6

per_token_cost_df.head()

Unnamed: 0,Model,Input,Cached input,Output
1,gpt-5,1.25e-06,1.25e-07,1e-05
2,gpt-5-mini,2.5e-07,2.5e-08,2e-06
3,gpt-5-nano,5e-08,5e-09,4e-07
4,gpt-5-chat-latest,1.25e-06,1.25e-07,1e-05
5,gpt-5-codex,1.25e-06,1.25e-07,1e-05


In [10]:

# function to calculate cost of prompt
def calculate_prompt_cost(prompt: str, model: str) -> float:
    """
    Estimate the input-token cost of sending `prompt` to `model`.

    Prints the model, the token count and the cost estimate, and returns the estimate.

    Args:
        prompt: Prompt text to price.
        model: Model name; must match a value in the `Model` column of `per_token_cost_df`.

    Returns:
        float: Token count multiplied by the per-token `Input` price.

    Raises:
        ValueError: If the model has no pricing row, or the token count / price is not numeric.
    """
    # num_tokens_in_prompt returns (model, token_count); only the count is needed here
    num_tokens = num_tokens_in_prompt(prompt, model)[1]
    try:
        num_tokens_int = int(num_tokens)
    except Exception as e:
        raise ValueError(f"num_tokens_in_prompt returned non-integer: {num_tokens!r}") from e

    # Boolean mask instead of a query string built from `model`, so model names containing
    # quotes cannot break the expression; an explicit check replaces a bare IndexError.
    price_rows = per_token_cost_df.loc[per_token_cost_df["Model"] == model, "Input"]
    if price_rows.empty:
        raise ValueError(f"No pricing row for model {model!r} in per_token_cost_df")
    cost_per_token = price_rows.iloc[0]
    try:
        cost_per_token = float(cost_per_token)
    except Exception as e:
        raise ValueError(f"per_token_cost_df returned non-numeric cost: {cost_per_token!r}") from e

    total_cost = num_tokens_int * cost_per_token
    print(f"Model: {model}")
    print(f"Number of tokens: {num_tokens_int:,}")
    print(f"Cost estimate: ${total_cost:.8f}")
    # Return the estimate as the annotation promises (previously the function returned None).
    return total_cost

In [11]:
# Price the sample prompt against gpt-5 input rates; the function prints its own summary.
calculate_prompt_cost(example_prompt, "gpt-5")

Model: gpt-5
Number of tokens: 7
Cost estimate: $0.00000875


### Readwise API: Get All Books with Highlights

In [25]:
# Future TODO: only get new books from the API and append to the existing df_books.csv
# Future TODO: move to using a database instead of CSVs
 
# Re-read .env and rebuild the Readwise auth header so this cell does not rely on an
# earlier cell having run. `headers` is read as a global by fetch_books_by_category below.
load_dotenv()
token = os.getenv("READWISE_API_TOKEN")
headers = {"Authorization": f"Token {token}"}

def fetch_books_by_category(category="books", save_path='df_books.csv'):
    """
    Fetch every Readwise book in `category`, following pagination, and return a DataFrame.

    Uses the module-level `headers` dict for authentication. Adds a `title_author`
    column ("<title> by <author>") and removes the 'Quick Passages' row.

    Args:
        category (str): Value sent as the `category` request parameter.
        save_path (str): CSV path the result is written to, only if it does not exist yet.

    Returns:
        pd.DataFrame: One row per book. If the API returns no books, an empty frame with
        the columns downstream cells select is returned and nothing is written to disk.

    Raises:
        requests.HTTPError: If any page request returns an error status.
    """
    url = "https://readwise.io/api/v2/books/"
    params = {"category": category, "page_size": 100}
    all_books = []

    while url:
        # timeout: fail instead of hanging the notebook if the service stops responding
        response = requests.get(url, headers=headers, params=params, timeout=30)
        response.raise_for_status()
        data = response.json()

        all_books.extend(data["results"])
        url = data.get("next")
        params = None  # after first call, pagination URLs already include params

    if not all_books:
        # An empty list would otherwise raise KeyError on the 'title' lookup below, and
        # caching an empty CSV would make every later run load "no books" from disk.
        print(f"⚠️ No books returned for category '{category}'.")
        return pd.DataFrame(columns=["id", "title", "author", "num_highlights", "title_author"])

    df_books = pd.DataFrame(all_books)

    # generate a new column in df_books that is the concat of title and author
    df_books['title_author'] = df_books['title'] + ' by ' + df_books['author']

    # filter out row where title = Quick Passages; copy so callers get an independent frame
    df_books = df_books[df_books['title'] != 'Quick Passages'].copy()

    # Only write to CSV if the file does not already exist
    if not os.path.exists(save_path):
        df_books.to_csv(save_path, index=False)
    else:
        print(f"File '{save_path}' already exists.")

    return df_books

# Cache guard: hit the Readwise API only when no local copy of the book list exists.
if not os.path.exists('df_books.csv'):
    df_books = fetch_books_by_category("books")
    print(f"✅ Retrieved {len(df_books)} books.")
    print(df_books[["id", "title", "author", "num_highlights"]].head())
else:
    print("✅ Found existing df_books.csv. Loading from file...")
    df_books = pd.read_csv('df_books.csv')

✅ Found existing df_books.csv. Loading from file...


### OpenAI API: Book Categorization Agent

In [47]:
# Categorize every book once with the model and cache the result as a CSV;
# later runs load the cached file instead of calling the API again.
if os.path.isfile('book_categorization.csv'):
    print("✅ Found existing book_categorization.csv. Loading from working directory...")
    df_book_cats = pd.read_csv('book_categorization.csv')
else:
    load_dotenv()
    client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
    titles_csv = df_books['title_author'].to_csv(index=False, header=True)

    categorization_prompt = (
        "You are an expert librarian and literary classifier who categorizes books by their dominant subject area.\n"
        "Use your knowledge and, if necessary, live web search to determine accurate categories.\n\n"
        "Return results in **CSV** format using `::` as the column separator with two columns:\n"
        "\"title_author\"::\"category\"\n\n"
        "Choose **one** category per book from this list:\n"
        "- Business Strategy\n"
        "- Finance or Investing or Economics or Accounting\n"
        "- Building Product or Startups\n"
        "- Marketing or Sales\n"
        "- Leadership or Management\n"
        "- Data Analytics, Statistics, or AI\n"
        "- Self-Help or Motivational or Inspirational\n"
        "- Other\n\n"
        "Rules:\n"
        "1. Use the main theme or subject of the book (not keywords in the title).\n"
        "2. If a book clearly spans two areas, choose the most dominant.\n"
        "3. If no clear match, use \"Other\".\n"
        "4. Output only the CSV — no commentary, no markdown, no code block.\n\n"
        "Here is the CSV input with the column \"title_author\":\n"
        f"{titles_csv}"
    )

    try:
        response = client.responses.create(
            model="gpt-5",
            input=[{"role": "user", "content": categorization_prompt}],
            tools=[{"type": "web_search"}],
            tool_choice="auto",
            reasoning={"effort": "medium"}
        )
        categories_csv = response.output_text.strip()
        # Titles and categories both contain commas ("Data Analytics, Statistics, or AI"),
        # so turning "::" into "," only parses when the model quotes every field.
        # Swap "::" for the ASCII unit separator instead: a single character the CSV
        # parser can split on, and one that does not occur in ordinary text.
        df_book_cats = pd.read_csv(
            StringIO(categories_csv.replace("::", "\x1f")),
            sep="\x1f",
            header=None,
            names=["title_author", "category"],
            skipinitialspace=True
        )
        # The prompt shows the header line as the output format, so the model may echo it;
        # drop that row so it is not treated as a book.
        df_book_cats = df_book_cats[df_book_cats["title_author"] != "title_author"].reset_index(drop=True)
        df_book_cats.to_csv('book_categorization.csv', index=False)
        print("=== BOOK CATEGORIZATION RESULTS ===\n", categories_csv)
        print("\n✅ Categorization completed successfully!")
    except Exception as e:
        print(f"❌ Error calling OpenAI API for book categorization: {e}")
        # Re-raise: df_book_cats is undefined on this path, so continuing would only
        # surface later as a NameError far from the real cause.
        raise

✅ Found existing book_categorization.csv. Loading from working directory...


In [49]:
# Number of books assigned to each category, most frequent first.
df_book_cats['category'].value_counts()

category
Self-Help or Motivational or Inspirational         46
Data Analytics, Statistics, or AI                  35
Leadership or Management                           19
Other                                              16
Finance or Investing or Economics or Accounting    15
Marketing or Sales                                 14
Building Product or Startups                       13
Business Strategy                                   8
Name: count, dtype: int64

In [50]:
# drop category column from df_books; the category is book for all rows given filters above
# Reassign with errors='ignore' instead of dropping in place, so re-running this cell
# (when the column is already gone) does not raise KeyError.
df_books = df_books.drop(columns=['category'], errors='ignore')

In [52]:
# Attach the agent-assigned category to each book row; the left join keeps
# books that have no matching categorization row.
df_books_2 = pd.merge(df_books, df_book_cats, how='left', on='title_author').copy()

### Get Book Highlights from Marketing & Sales Books

In [54]:
# Readwise book ids (the `id` column) of every book the agent
# labelled "Marketing or Sales"
marketing_ids = df_books_2.loc[df_books_2['category'] == "Marketing or Sales", 'id'].tolist()

In [56]:
def fetch_readwise_highlights(book_ids):
    """
    Given a list of Readwise book IDs, fetch all highlights and notes using the Readwise export API.

    Pages are followed via the `nextPageCursor` value of each response until none is returned.

    Args:
        book_ids (list of int): List of numeric Readwise book IDs.

    Returns:
        list: List of book JSON objects from Readwise containing highlights and notes.
        An empty list is returned, without any request, when `book_ids` is empty.

    Raises:
        requests.HTTPError: If any page request returns an error status.
    """
    book_ids = list(book_ids)
    if not book_ids:
        # An empty `ids` filter would presumably not restrict the export at all
        # (TODO confirm against the API docs), so do not send the request.
        print("⚠️ No book IDs provided. Skipping Readwise export request.")
        return []

    load_dotenv()
    token = os.getenv("READWISE_API_TOKEN")
    headers = {"Authorization": f"Token {token}"}
    base_url = "https://readwise.io/api/v2/export/"

    all_data = []
    # `ids` is sent on every page; `pageCursor` is added from the second request on
    params = {"ids": ",".join(str(i) for i in book_ids)}

    while True:
        # timeout: fail instead of hanging the notebook if the service stops responding
        response = requests.get(base_url, headers=headers, params=params, timeout=30)
        response.raise_for_status()
        result = response.json()

        all_data.extend(result["results"])

        print(f"📘 Retrieved {len(result['results'])} books in this page. "
              f"Total so far: {len(all_data)}")

        next_page_cursor = result.get("nextPageCursor")
        if not next_page_cursor:
            break  # no more pages
        params["pageCursor"] = next_page_cursor

    print(f"✅ Finished fetching {len(all_data)} total books from Readwise export")
    return all_data

In [37]:
def flatten_readwise_highlights(all_data):
    """
    Turn nested Readwise export objects into one row per highlight.

    Args:
        all_data (list): Book JSON objects, each holding a "highlights" list plus
            book-level fields.

    Returns:
        pd.DataFrame: Highlight fields (column names prefixed with "highlight_")
        followed by the book-level columns user_book_id, title, author and category.
    """
    # Book-level fields repeated onto every highlight row
    book_level_fields = ["user_book_id", "title", "author", "category"]
    return pd.json_normalize(
        all_data,
        meta=book_level_fields,
        record_prefix="highlight_",
        record_path="highlights",
    )

In [None]:
# Pull the export payload for the marketing/sales books (network call; prints per-page progress).
raw_rw_highlights = fetch_readwise_highlights(marketing_ids)

# One row per highlight, with the book-level fields repeated on each row.
rw_hl_df = flatten_readwise_highlights(raw_rw_highlights)

📘 Retrieved 9 books in this page. Total so far: 9
📘 Retrieved 5 books in this page. Total so far: 14
✅ Finished fetching 14 total books from Readwise export


Unnamed: 0,highlight_id,highlight_is_deleted,highlight_text,highlight_location,highlight_location_type,highlight_note,highlight_color,highlight_highlighted_at,highlight_created_at,highlight_updated_at,...,highlight_url,highlight_book_id,highlight_tags,highlight_is_favorite,highlight_is_discard,highlight_readwise_url,user_book_id,title,author,category
0,948377431,False,ARPU is usually the annual revenue that a comp...,201,location,,yellow,2025-10-16T04:56:00Z,2025-10-16T05:02:56.015Z,2025-10-16T05:02:56.015Z,...,,55650413,[],False,False,https://readwise.io/open/948377431,55650413,Click Here,Alex Schultz,books
1,948377432,False,Choosing the right channel means asking yourse...,203,location,And at what cost and scale,yellow,2025-10-16T04:56:00Z,2025-10-16T05:02:56.015Z,2025-10-16T05:02:56.015Z,...,,55650413,[],False,False,https://readwise.io/open/948377432,55650413,Click Here,Alex Schultz,books
2,948377433,False,A CRU is a user who has registered and then co...,213,location,,yellow,2025-10-16T04:56:00Z,2025-10-16T05:02:56.015Z,2025-10-16T05:02:56.015Z,...,,55650413,[],False,False,https://readwise.io/open/948377433,55650413,Click Here,Alex Schultz,books
3,948377434,False,DSPs arrived in the 2010s to simplify ad buyin...,221,location,Whags the back story? Do dsp aggregate demand?,yellow,2025-10-16T04:56:00Z,2025-10-16T05:02:56.015Z,2025-10-16T05:02:56.015Z,...,,55650413,[],False,False,https://readwise.io/open/948377434,55650413,Click Here,Alex Schultz,books
4,948377435,False,eCPM is a metric that estimates how much an ad...,225,location,Wherfe is this used,yellow,2025-10-16T04:56:00Z,2025-10-16T05:02:56.015Z,2025-10-16T05:02:56.015Z,...,,55650413,[],False,False,https://readwise.io/open/948377435,55650413,Click Here,Alex Schultz,books


In [67]:
# Slim the highlight table down to what synthesis needs: the highlight text
# plus a combined "<title> by <author>" key (title and author are dropped afterwards).
highlights_slim = (
    rw_hl_df[['title', 'author', 'highlight_text']]
    .assign(title_author=lambda d: d['title'] + ' by ' + d['author'])
    .drop(columns=['title', 'author'])
)

# Keep only books with 30 or more highlights: map each row to its book's
# highlight count and filter on that, without adding a helper column.
highlights_to_synthesize = highlights_slim.loc[
    lambda d: d['title_author'].map(d['title_author'].value_counts()) >= 30
]

highlights_to_synthesize.head()

Unnamed: 0,highlight_text,title_author
0,ARPU is usually the annual revenue that a comp...,Click Here by Alex Schultz
1,Choosing the right channel means asking yourse...,Click Here by Alex Schultz
2,A CRU is a user who has registered and then co...,Click Here by Alex Schultz
3,DSPs arrived in the 2010s to simplify ad buyin...,Click Here by Alex Schultz
4,eCPM is a metric that estimates how much an ad...,Click Here by Alex Schultz


In [65]:
# Highlights per book among those kept for synthesis, most frequent first.
highlights_to_synthesize['title_author'].value_counts()

title_author
Hacking Growth by Sean Ellis and Morgan Brown               318
Ogilvy on Advertising by David Ogilvy                       157
Converted by Neil Hoyne                                      86
Hook Point by Brendan  Kane                                  81
Click Here by Alex Schultz                                   62
Everybody Writes by Ann Handley                              51
Retention Point by Robert Skrob                              42
Audience-Ology by Kevin Goetz                                42
Growth Hacker Marketing by Ryan Holiday                      41
Monetizing Innovation by Madhavan Ramanujam, Georg Tacke     31
Name: count, dtype: int64

### OpenAI API: Summarize Key Ideas Agent

In [70]:
import json
from textwrap import dedent

def build_key_ideas_prompt(book_data, focus_instructions):
    """
    Build the structured prompt for the Key Ideas Agent.

    The template is dedented BEFORE the values are substituted. Substituting first
    (as an f-string) puts the multi-line highlights JSON, whose lines start at
    column 0, inside the text, which leaves dedent with no common indentation to
    remove and the whole prompt indented by four spaces.

    Args:
        book_data (dict): Dictionary containing book metadata and highlights.
            Example:
            {
              "title_author": "Click Here by Alex Schultz",
              "category": "Marketing or Sales",
              "highlights": [
                {"highlight_id": 1, "text": "ARPU is annual revenue per user."},
                {"highlight_id": 2, "text": "Choosing the right channel..."}
              ]
            }
            Only "title_author" and "highlights" are read.
        focus_instructions (str): Guidance for what highlights to emphasize,
            e.g. "Focus on fundamental ideas to marketing such as positioning and targeting."

    Returns:
        str: Complete formatted prompt ready for the OpenAI API.

    Raises:
        KeyError: If `book_data` lacks "title_author" or "highlights".
    """
    title_author = book_data["title_author"]
    highlights_json = json.dumps(book_data["highlights"], indent=2)

    # Plain (non-f) string: {name} are str.format placeholders, {{ }} are literal braces.
    template = dedent("""
    You are the Key Ideas Agent.

    Your goal is to extract and summarize the key ideas from this book’s highlights.
    You must focus entirely on what is expressed in the highlights.
    If a highlight is incomplete or lacks context, you may perform a light web search
    to fill missing definitions or clarify references — but never introduce new ideas
    not grounded in the highlights.

    ### 🧭 Focus
    {focus_instructions}

    ### Book Information
    - Title & Author: {title_author}

    ### Highlights
    {highlights_json}

    ### 🧩 Task
    1. Extract the main conceptual ideas found within these highlights.
    2. Ground every idea in one or more highlight_ids.
    3. Output valid JSON exactly following this schema:

    {{
      "book": {{
        "title_author": "string"
      }},
      "focus": "string",
      "key_ideas": [
        {{
          "id": "string",
          "title": "string",
          "definition": "string",
          "supporting_highlights": [
            {{
              "highlight_id": "integer",
              "text": "string",
              "note": "string | null"
            }}
          ],
          "web_context": [
            {{
              "source_url": "string",
              "context_summary": "string"
            }}
          ]
        }}
      ],
      "agent_summary": {{
        "summary_text": "string",
        "highlight_count": "integer",
        "token_estimates": {{
          "input_tokens": "integer",
          "output_tokens": "integer"
        }}
      }}
    }}
    """)
    # Substituted values are inserted verbatim; braces inside them are not re-interpreted.
    return template.format(
        focus_instructions=focus_instructions,
        title_author=title_author,
        highlights_json=highlights_json,
    )

In [79]:
def df_to_book_json(df: pd.DataFrame):
    """
    Group highlight rows by book and return one dict per book.

    Each dict has "title_author" and "highlights"; highlights are numbered from 1
    within their book, in row order, as {"highlight_id": n, "text": ...}.
    """
    return [
        {
            "title_author": book_name,
            "highlights": [
                {"highlight_id": position, "text": passage}
                for position, passage in enumerate(book_rows["highlight_text"], start=1)
            ],
        }
        for book_name, book_rows in df.groupby("title_author")
    ]

In [80]:
def generate_marketing_prompt(book_title_author, highlights_df, focus_instructions=None):
    """
    Generate a prompt for the Key Ideas Agent based on book input.

    Args:
        book_title_author (str): The combined 'title_author' string for the target book.
        highlights_df (pd.DataFrame): DataFrame containing highlight data; rows are matched
            on its 'title_author' column.
        focus_instructions (str, optional): Instructions for what to focus on in the prompt.
            Defaults to a general "fundamentals of marketing" instruction.

    Returns:
        str: The generated prompt text.

    Raises:
        ValueError: If no row of `highlights_df` belongs to `book_title_author`.
    """
    if focus_instructions is None:
        focus_instructions = (
            "Focus on fundamental ideas to marketing. Help folks new to marketing understand the core ideas of marketing."
        )
    # Plain equality mask rather than a query string built from the title: a title
    # containing a double quote would otherwise produce an invalid query expression.
    book_rows = highlights_df[highlights_df["title_author"] == book_title_author]
    if book_rows.empty:
        raise ValueError(f"No highlights found for book: {book_title_author!r}")
    # After filtering there is exactly one group, hence the [0]
    book_json = df_to_book_json(book_rows)[0]
    return build_key_ideas_prompt(book_json, focus_instructions)

In [86]:
def run_marketing_prompts_for_all_books(highlights_df: pd.DataFrame, focus_instructions: str | None = None) -> dict:
    """
    Run generate_marketing_prompt and call GPT for each unique book in the DataFrame.
    Returns a dictionary mapping title_author → response text (or structured JSON if applicable).
    """

    # --- Initialization ---
    load_dotenv()
    client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
    results = {}

    if highlights_df.empty:
        print("⚠️ No highlights found. Exiting early.")
        return results

    # --- Group by book for cleaner iteration ---
    for title_author, group in highlights_df.groupby("title_author", sort=False):
        print(f"📘 Processing book: {title_author} ({len(group)} highlights)")

        try:
            # Build prompt once per book
            prompt = generate_marketing_prompt(title_author, group, focus_instructions)

            # Make model call
            response = client.responses.create(
                model="gpt-5-mini",
                input=[{"role": "user", "content": prompt}],
                tools=[{"type": "web_search"}],
                tool_choice="auto",
                reasoning={"effort": "low"}
            )

            # Parse structured JSON if valid, else fallback to text
            try:
                results[title_author] = json.loads(response.output_text)
            except json.JSONDecodeError:
                results[title_author] = {"raw_text": response.output_text}

            print(f"✅ Completed summary for: {title_author}")

        except Exception as e:
            print(f"❌ Error processing '{title_author}': {e}")
            results[title_author] = {"error": str(e)}

    print(f"\n🏁 Completed all books ({len(results)} processed).")
    return results


In [87]:
# Adjacent string literals are concatenated with nothing in between, so the first
# literal carries the trailing space that separates the two sentences.
focus_instructions = (
    "Focus on fundamental ideas to marketing. "
    "Help folks new to marketing understand the core ideas of marketing."
)

# One model call per book; results are keyed by title_author.
key_ideas_results = run_marketing_prompts_for_all_books(highlights_to_synthesize, focus_instructions)

📘 Processing book: Click Here by Alex Schultz (62 highlights)
✅ Completed summary for: Click Here by Alex Schultz
📘 Processing book: Ogilvy on Advertising by David Ogilvy (157 highlights)
✅ Completed summary for: Ogilvy on Advertising by David Ogilvy
📘 Processing book: Retention Point by Robert Skrob (42 highlights)
✅ Completed summary for: Retention Point by Robert Skrob
📘 Processing book: Growth Hacker Marketing by Ryan Holiday (41 highlights)
✅ Completed summary for: Growth Hacker Marketing by Ryan Holiday
📘 Processing book: Hacking Growth by Sean Ellis and Morgan Brown (318 highlights)
✅ Completed summary for: Hacking Growth by Sean Ellis and Morgan Brown
📘 Processing book: Monetizing Innovation by Madhavan Ramanujam, Georg Tacke (31 highlights)
✅ Completed summary for: Monetizing Innovation by Madhavan Ramanujam, Georg Tacke
📘 Processing book: Audience-Ology by Kevin Goetz (42 highlights)
✅ Completed summary for: Audience-Ology by Kevin Goetz
📘 Processing book: Everybody Writes by

In [None]:
import json
from pathlib import Path
from datetime import datetime

def save_json_results(results: dict, prefix: str = "results"):
    """
    Save a dictionary as a timestamped JSON file in the working directory.

    Args:
        results (dict): Dictionary of outputs to save.
        prefix (str): Optional filename prefix. Defaults to 'results'.

    Returns:
        Path: The full path to the saved JSON file.
    """
    name = input("Enter your name (for file naming, optional): ").strip()
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    file_prefix = f"{prefix}_{name}" if name else prefix
    filename = f"{file_prefix}_{timestamp}.json"
    path = Path.cwd() / filename

    with path.open("w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)

    print(f"💾 Saved results to {path.resolve()}")
    return path


In [96]:
# Persist the per-book key ideas to a timestamped JSON file; the cell output is the returned path.
save_json_results(key_ideas_results, 'key_ideas_results')

💾 Saved results to /Users/brianmoore/Documents/GitHub/kindle_eda/key_ideas_results_20251026_091658.json


PosixPath('/Users/brianmoore/Documents/GitHub/kindle_eda/key_ideas_results_20251026_091658.json')

### OpenAI API: First Principles Agent Based on Key Ideas by Book

In [90]:
def run_first_principles_agent_from_key_ideas(key_ideas_results: dict, focus_instructions: str | None = None) -> dict:
    """
    Generate cross-book first principles from multiple Key Ideas Agent outputs using GPT-5.
    Uses light web search for incomplete context and returns structured JSON.
    """
    load_dotenv()
    client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

    # Normalize to parsed dicts containing key_ideas
    key_ideas_jsons = []
    for book, result in key_ideas_results.items():
        try:
            if isinstance(result, str):
                result = json.loads(result)
            if "key_ideas" in result:
                key_ideas_jsons.append(result)
        except Exception:
            print(f"⚠️ Skipping invalid entry: {book}")

    if not key_ideas_jsons:
        raise ValueError("No valid Key Ideas JSON found.")

    focus = focus_instructions or (
        "Synthesize the first principle marketing ideas from the books below."
    )

    # Compact prompt
    prompt = f"""
    You are the First Principles Agent.

    Goal: Derive timeless, cross-book first principles from the ideas below.
    Each object represents one book's key ideas on a shared theme.
    Use web search lightly to fill missing context, not expand scope.

    Focus: {focus}

    Input:
    {json.dumps(key_ideas_jsons, indent=2)}

    Output valid JSON:
    {{
    "first_principles": [
        {{
        "principle_id": "string",
        "principle_title": "string",
        "definition": "string",
        "derived_from_books": ["string"],
        "supporting_ideas": ["string"]
        }}
    ],
    "meta": {{
        "books_processed": "integer",
        "total_key_ideas": "integer",
        "generated_at": "YYYY-MM-DD"
    }}
    }}
    """.strip()

    print(f"🧠 Synthesizing first principles from {len(key_ideas_jsons)} books...")

    response = client.responses.create(
        model="gpt-5",
        input=[{"role": "user", "content": prompt}],
        tools=[{"type": "web_search"}],
        tool_choice="auto",
        reasoning={"effort": "low"}
    )

    try:
        return json.loads(response.output_text)
    except json.JSONDecodeError:
        return {"raw_text": response.output_text}

In [92]:
# Run the cross-book synthesis with the default focus instruction.
# fp_results is the parsed JSON from the model, or {"raw_text": ...} when the
# model output could not be parsed as JSON.
fp_results = run_first_principles_agent_from_key_ideas(key_ideas_results)

🧠 Synthesizing first principles from 10 books...


In [97]:
# Persist fp_results as a timestamped JSON file in the current working
# directory (save_json_results prompts for an optional name first).
save_json_results(fp_results, 'fp_results')

💾 Saved results to /Users/brianmoore/Documents/GitHub/kindle_eda/fp_results_20251026_091710.json


PosixPath('/Users/brianmoore/Documents/GitHub/kindle_eda/fp_results_20251026_091710.json')

In [100]:
# Build the prompt that asks the model to render fp_results as a markdown
# list and to add one short real-world example per principle.
fp_prompt = f"""
Format the following first principles json into a markdown list. 
{json.dumps(fp_results, indent=2)}

Use the following format for each first principle:
### <First Principle JSON>
- Description: <Definition>
- Source Books: <Derived from>
- Example: <insert notable example from silicon valley>

Use light web search to find an example story of the first principle in action. Limit to 1 or 2 sentences.
Do not change any text related to the first principle, definition, or source books.
"""

# Ask the model for the formatted markdown; web search is available to it
# (tool_choice="auto") for finding the example stories.
response = client.responses.create(
    model="gpt-5",
    input=[{"role": "user", "content": fp_prompt}],
    tools=[{"type": "web_search"}],
    tool_choice="auto",
    reasoning={"effort": "medium"}
)

print(response.output_text)

### FP1 — Start with a must‑have product and a clear promise
- Description: Marketing cannot compensate for a product people don’t truly want. Build or refine until the core value is obvious, then articulate a specific, customer-important promise the product reliably delivers.
- Source Books: Hacking Growth by Sean Ellis and Morgan Brown, Growth Hacker Marketing — Ryan Holiday, Ogilvy on Advertising by David Ogilvy
- Example: Slack pivoted from the failed game Glitch to a must‑have team messaging tool and led with the plainspoken strapline “Be less busy,” a clear promise that resonated with early adopters and enterprises alike. ([marketingweek.com](https://www.marketingweek.com/disruptive-brands/slack/?utm_source=openai))

### FP2 — Make the customer’s outcome the North Star
- Description: Define a single, ambitious goal and metric that represent the customer outcome you exist to create; align revenue and tactics to serve that outcome and reject efforts that trade it off.
- Source Book

In [101]:
# Save response.output_text to a timestamped markdown file.
# The encoding is set explicitly: the model output contains non-ASCII
# characters (curly quotes, dashes) that a platform's default encoding may
# not be able to write.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
with open(f'fp_results_{timestamp}.md', 'w', encoding='utf-8') as f:
    f.write(response.output_text)

### OpenAI API: Generate Short Summary for Each Book Referenced

In [None]:
# Build the prompt asking for a short description of every distinct
# title_author value in highlights_to_synthesize, one per line.
# NOTE(review): the next cell rebuilds these same two variables before calling the API.
book_titles = "\n".join(highlights_to_synthesize['title_author'].unique())
book_summary_prompt = "\n".join([
    "Describe each book and author in 1-2 sentences. Output in a markdown list.",
    book_titles,
])

'Describe each book and author in 1-2 sentences. Output in a markdown list.\nClick Here by Alex Schultz\nOgilvy on Advertising by David Ogilvy\nRetention Point by Robert Skrob\nGrowth Hacker Marketing by Ryan Holiday\nHacking Growth by Sean Ellis and Morgan Brown\nMonetizing Innovation by Madhavan Ramanujam, Georg Tacke\nAudience-Ology by Kevin Goetz\nEverybody Writes by Ann Handley\nHook Point by Brendan  Kane\nConverted by Neil Hoyne'

In [None]:
# Build the prompt: one line per distinct title_author value.
book_titles = "\n".join(highlights_to_synthesize['title_author'].unique())
book_summary_prompt = (
    "Describe each book and author in 1-2 sentences. Output in a markdown list.\n"
    f"{book_titles}"
)

book_summary_response = client.responses.create(
    model="gpt-5-mini",
    input=[{"role": "user", "content": book_summary_prompt}],
    tools=[{"type": "web_search"}],
    tool_choice="auto",
    reasoning={"effort": "low"}
)

print(book_summary_response.output_text)

# Save book_summary_response.output_text to a markdown file.
# The timestamp is generated here instead of reusing a `timestamp` variable
# left over from an earlier cell, so this cell does not depend on that cell
# having run and re-runs do not overwrite the previous file. UTF-8 is explicit
# because the model output contains non-ASCII punctuation.
book_summary_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
with open(f'book_summary_{book_summary_timestamp}.md', 'w', encoding='utf-8') as f:
    f.write(book_summary_response.output_text)

- Click Here — Alex Schultz  
  A practical guide by Facebook growth leader Alex Schultz that explains data-driven experimentation, A/B testing, and metrics strategies for scaling product engagement and growth. It blends case studies with tactical advice for using analytics to drive user acquisition and retention.

- Ogilvy on Advertising — David Ogilvy  
  A classic handbook from advertising pioneer David Ogilvy that combines timeless principles of creative strategy, brand building, and campaign management with real-world examples and witty industry insight. It emphasizes research, clarity, and the primacy of the consumer in effective ad work.

- Retention Point — Robert Skrob  
  Focused on the economics of customer retention, Robert Skrob’s book outlines systems and metrics to increase lifetime customer value by reducing churn and improving onboarding, service, and follow-up. It provides actionable processes for turning one-time buyers into repeat customers.

- Growth Hacker Marketi

### OpenAI API: Generate Marketing Fundamentals (based on highlights from Ogilvy on Advertising)

In [None]:
# Keep only the rows for "Ogilvy on Advertising by David Ogilvy", then
# serialize their highlight_text column to CSV text (no index column) for
# embedding in the prompt.
ogilvy_highlights = highlights_to_synthesize.loc[
    highlights_to_synthesize['title_author'] == "Ogilvy on Advertising by David Ogilvy"
]

ogilvy_csv = ogilvy_highlights['highlight_text'].to_csv(index=False)

In [125]:
# Prompt for the Ogilvy fundamentals synthesis. It asks the model to group the
# highlights into 10 numbered principles (title, description, 1-2 quotes) and
# to answer in markdown only. The CSV text in ogilvy_csv is interpolated at the
# end as the input.
ogilvy_prompt = f"""
You are an expert in marketing and advertising fundamentals, trained in the style and philosophy of David Ogilvy.

You will receive a list of Kindle highlights in CSV format. Each row contains a quote or note written by David Ogilvy from his books, interviews, or writings.

Your task:
1. Read all the highlights carefully.
2. Group the highlights into 10 key principles that best capture David Ogilvy’s timeless fundamentals of marketing and advertising.
3. For each principle, provide:
   - A short, memorable title (3–6 words)
   - A clear description (1–2 sentences)
   - 1–2 representative quotes from the highlights that best illustrate the principle (if the highlight is an incomplete quote then use light web search to find the full quote or fill in the idea)
4. Number the principles.

Output format (in markdown only):

### <Principle>
- <Description>
- <Supporting Quotes/Ideas>

Do not output JSON or tables. Keep your response in readable markdown.
Focus on clarity, persuasion, and Ogilvy’s distinct voice and principles.

Input:
{ogilvy_csv}
"""

In [126]:
# Send the Ogilvy prompt to the model; web search is available to it
# (tool_choice="auto") for completing partial quotes.
ogilvy_response = client.responses.create(
    model="gpt-5",
    input=[{"role": "user", "content": ogilvy_prompt}],
    tools=[{"type": "web_search"}],
    tool_choice="auto",
    reasoning={"effort": "medium"}
)

print(ogilvy_response.output_text)

# Save ogilvy_response.output_text to a markdown file.
# The timestamp is generated here instead of reusing a `timestamp` variable
# left over from an earlier cell, so this cell does not depend on that cell
# having run and re-runs do not overwrite the previous file. UTF-8 is explicit
# because the model output contains non-ASCII punctuation (curly quotes, bullets).
ogilvy_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
with open(f'ogilvy_response_{ogilvy_timestamp}.md', 'w', encoding='utf-8') as f:
    f.write(ogilvy_response.output_text)

### 1) Product First, Always
- Superior product performance is the foundation; advertising can only accelerate what people already find genuinely valuable.
- “The best way to increase the sale of a product is to improve the product.” • “The key to successful marketing is superior product performance.…If the consumer does not perceive any real benefits in the brand, then no amount of ingenious advertising and selling can save it.”

### 2) Homework Before Headlines
- Know more than anyone about the product, the market, and the competition; insight precedes imagination.
- “First, study the product you are going to advertise. The more you know about it, the more likely you are to come up with a big idea for selling it.” • “What distinguishes the great surgeon is that he knows more than other surgeons. It is the same with advertising agents. The good ones know more.”

### 3) Hunt the Big Idea
- Only a big, relevant, long-lived idea will win attention and drive action; otherwise your ad slip