In [6]:
import json

# File holding the essay page names, parsed as JSON below.
file_path = "links.txt"

# Site root that every page name is appended to.
base_url = "https://www.paulgraham.com/"

# JSON text is UTF-8 by specification; pin the encoding so decoding does not
# depend on the platform's locale default.
with open(file_path, encoding="utf-8") as f:
    links = json.load(f)

# Build absolute URLs; each item of `links` is expected to be a page name
# relative to `base_url`.
full_links = [base_url + link for link in links]

print(full_links)

['https://www.paulgraham.com/greatwork.html', 'https://www.paulgraham.com/kids.html', 'https://www.paulgraham.com/selfindulgence.html', 'https://www.paulgraham.com/field.html', 'https://www.paulgraham.com/goodwriting.html', 'https://www.paulgraham.com/do.html', 'https://www.paulgraham.com/woke.html', 'https://www.paulgraham.com/writes.html', 'https://www.paulgraham.com/when.html', 'https://www.paulgraham.com/foundermode.html', 'https://www.paulgraham.com/persistence.html', 'https://www.paulgraham.com/reddits.html', 'https://www.paulgraham.com/google.html', 'https://www.paulgraham.com/best.html', 'https://www.paulgraham.com/superlinear.html', 'https://www.paulgraham.com/getideas.html', 'https://www.paulgraham.com/read.html', 'https://www.paulgraham.com/want.html', 'https://www.paulgraham.com/alien.html', 'https://www.paulgraham.com/users.html', 'https://www.paulgraham.com/heresy.html', 'https://www.paulgraham.com/words.html', 'https://www.paulgraham.com/goodtaste.html', 'https://www.pau

In [7]:
from dotenv import load_dotenv
import os

# Let python-dotenv populate the process environment, then read the
# Hugging Face Hub token from it (None when the variable is not set).
load_dotenv()
huggingfacehub_api_token = os.environ.get("HUGGINGFACEHUB_API_TOKEN")

In [8]:
from langchain.chat_models import init_chat_model

# Chat model handle, created through LangChain's generic chat-model factory.
llm = init_chat_model(
    model="gemini-2.5-flash",
    model_provider="google_genai",
)

In [9]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings

# Google Generative AI embedding client.
# NOTE(review): the cells visible here build the vector store with `hf`, not
# with this object — confirm whether `embeddings` is still needed.
embeddings = GoogleGenerativeAIEmbeddings(
    model="models/gemini-embedding-001",
)

In [14]:
from langchain_huggingface import HuggingFaceEmbeddings

# Sentence-transformers embedding model; passed to Chroma as its embedding function.
hf = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

  from .autonotebook import tqdm as notebook_tqdm


In [15]:
from langchain_chroma import Chroma

# Chroma collection for the essay chunks, embedded with `hf`.
# `persist_directory` is where the data is saved locally; remove it if
# persistence is not necessary.
vector_store = Chroma(
    embedding_function=hf,
    collection_name="PG_essays",
    persist_directory="./chroma_langchain_db",
)

In [16]:
import bs4
from langchain import hub
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langgraph.graph import START, StateGraph
from typing_extensions import List, TypedDict
from datetime import datetime

# Load and chunk contents of the blog.
# Only the <title> and <body> elements of each page are parsed.
loader = WebBaseLoader(
    web_paths=full_links,
    bs_kwargs=dict(parse_only=bs4.SoupStrainer(["title", "body"])),
)
try:
    docs = loader.load()
    # Tag every page with provenance metadata before splitting.
    for doc in docs:
        doc.metadata["chunk_source"] = "web scraping"
        doc.metadata["processing_date"] = str(datetime.now())
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=200)
    all_splits = text_splitter.split_documents(docs)
    print(f"Loaded {len(all_splits)} chunks from {len(docs)} documents.")
except Exception as e:
    print(f"An error occurred while loading or processing documents: {e}")
    # Re-raise: continuing would either hit a NameError on `all_splits`
    # (fresh kernel) or silently index stale chunks left over from an
    # earlier run of this cell.
    raise

# Index chunks
# NOTE(review): the collection is persisted on disk and no explicit ids are
# passed here, so re-running this cell presumably stores the same chunks
# again — confirm whether de-duplication is needed.
_ = vector_store.add_documents(documents=all_splits)

# Define prompt for question-answering.
# The template is pulled by name via `hub.pull`; `generate` invokes it with
# the keys "question" and "context".
# N.B. for non-US LangSmith endpoints, you may need to specify
# api_url="https://api.smith.langchain.com" in hub.pull.
prompt = hub.pull("rlm/rag-prompt")


# Define state for application
class State(TypedDict):
    """State dictionary passed between the graph steps.

    `retrieve` reads `question` and returns `context`; `generate` reads
    `question` and `context` and returns `answer`.
    """
    question: str  # the user's question
    context: List[Document]  # documents returned by the similarity search
    answer: str  # content of the LLM response


# Define application steps
def retrieve(state: State):
    """Look up documents similar to the question in the vector store.

    Args:
        state: Current graph state; only ``state["question"]`` is read.

    Returns:
        A partial state update ``{"context": <documents>}`` holding the
        result of ``vector_store.similarity_search``.
    """
    question = state["question"]
    return {"context": vector_store.similarity_search(question)}


def generate(state: State):
    """Answer the question from the retrieved context using the LLM.

    Args:
        state: Current graph state; ``state["question"]`` and
            ``state["context"]`` are read.

    Returns:
        A partial state update ``{"answer": <str>}`` containing the
        ``content`` of the LLM response.
    """
    # Concatenate the chunk texts, separated by blank lines.
    context_text = "\n\n".join([chunk.page_content for chunk in state["context"]])
    prompt_value = prompt.invoke(
        {"context": context_text, "question": state["question"]}
    )
    return {"answer": llm.invoke(prompt_value).content}


# Wire the two steps into a linear graph (START -> retrieve -> generate)
# and compile it into a runnable application.
graph_builder = StateGraph(State)
graph_builder.add_sequence([retrieve, generate])
graph_builder.add_edge(START, "retrieve")
graph = graph_builder.compile()

Loaded 5444 chunks from 228 documents.


In [17]:
# Run the compiled graph for a single question and show only the answer text.
response = graph.invoke(dict(question="Why do startups fail?"))
print(response["answer"])

Startups primarily fail because they do not make something users want. If a startup creates a product or service that users desire, it is likely to succeed regardless of other factors. Conversely, if they don't, failure is almost certain, as nearly all other reasons for failure funnel through this core issue.


In [18]:
# Print the full final state (question, retrieved context documents, answer)
# so the retrieved chunks can be inspected.
print(response)

{'question': 'Why do startups fail?', 'context': [Document(id='35c429c7-8f38-425f-a045-dcfec3699e30', metadata={'processing_date': '2025-08-23 16:13:42.600490', 'chunk_source': 'web scraping', 'source': 'https://www.paulgraham.com/startupmistakes.html', 'title': 'The 18 Mistakes That Kill Startups'}, page_content="It's easier to catch yourself doing something you shouldn't than\nalways to remember to do something you should.\n[1]In a sense there's just one mistake that kills startups: not making\nsomething users want.  If you make something users want, you'll\nprobably be fine, whatever else you do or don't do.  And if you\ndon't make something users want, then you're dead, whatever else\nyou do or don't do.  So really this is a list of 18 things that\ncause startups not to make something users want.  Nearly all failure\nfunnels through that.1. Single FounderHave you ever noticed how few successful startups were founded by\njust one person?  Even companies you think of as having one fo