In [None]:
import getpass
import os
import bs4
from typing import List
import requests
from bs4 import BeautifulSoup
from langchain.agents import create_agent
from langchain.tools import tool
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain.chat_models import init_chat_model
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma


# Enable LangSmith tracing for every LangChain call made from this notebook.
os.environ["LANGSMITH_TRACING"] = "true"
# Only ask for the key when it is not already configured, so that re-running
# the cell (or exporting the key before launching Jupyter) does not prompt
# again or overwrite a value that is already set.
if not os.environ.get("LANGSMITH_API_KEY"):
    os.environ["LANGSMITH_API_KEY"] = getpass.getpass("LangSmith API key: ")

  from .autonotebook import tqdm as notebook_tqdm
USER_AGENT environment variable not set, consider setting it to identify your requests.


In [3]:
# The chat model name and the endpoint are read from the environment so that
# nothing machine-specific is hard-coded in the notebook.
model_name = os.getenv("LM_MODEL")
base_url = os.getenv("OPENAI_BASE_URL")

# Fail here with an actionable message instead of handing None to
# init_chat_model and hitting a harder-to-read error further down.
if not model_name:
    raise RuntimeError(
        "Environment variable LM_MODEL is not set; "
        "set it to the name of the model served at OPENAI_BASE_URL."
    )

model = init_chat_model(
    model_name,
    model_provider="openai",
    base_url=base_url,
    # Placeholder key; presumably the local LM Studio server does not check
    # it -- TODO confirm before pointing base_url at a hosted endpoint.
    api_key="lm-studio",
)

# Embedding model handed to the vector store below.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

vector_store = Chroma(
    collection_name="example_collection",
    embedding_function=embeddings,
    # NOTE: the directory name is spelled "chomksy" (sic). It is left unchanged
    # so that a store already persisted under this path is still picked up.
    persist_directory="./chomksy-db",
)

Load, Split, Embed, Store

In [None]:


url = "https://chomsky.info/articles/"
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() prevents silently parsing an HTTP error page as the index.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

# First pass: only anchors inside the ".entry-content" container.
article_links = []
for a in soup.select(".entry-content a"):
    href = a.get("href", "")
    # Avoid anchors, archives, and index pages
    if (
        href.startswith("https://chomsky.info/")
        and "/articles/" not in href
        and "/books/" not in href
        and "#" not in href
        and "pdf" not in href
        and href not in article_links
    ):
        article_links.append(href)

# If the above yields too few, fall back to every same-site link on the page
if len(article_links) < 10:
    article_links = []
    for a in soup.find_all("a", href=True):
        href = a["href"]
        if (
            href.startswith("https://chomsky.info/")
            and "articles/" not in href
            and "#" not in href
            and "pdf" not in href
            and href not in article_links
        ):
            article_links.append(href)

print(f"Found {len(article_links)} article links")
# The pages are deliberately not downloaded here: further down in this same
# cell the link list is rebuilt and `loader` / `docs` are reassigned from it,
# so loading at this point only fetched every article a second time.


def get_chomsky_article_links(soup) -> List[str]:
    """Collect article URLs from the first list of the page's entry content.

    Looks up the ``div`` with class ``entry-content`` in ``soup``, then the
    first ``<ul>`` inside it, and walks that list's anchors that carry an
    ``href``.

    Args:
        soup: Parsed page; must offer a BeautifulSoup-style ``find``.

    Returns:
        The hrefs, in document order and without repeats, that start with
        ``https://chomsky.info/`` and do not contain ``pdf``. The list is empty
        when the container or its ``<ul>`` is missing.
    """
    container = soup.find("div", class_="entry-content")
    if not container:
        return []

    first_list = container.find("ul")
    if not first_list:
        return []

    collected = []
    for anchor in first_list.find_all("a", href=True):
        target = anchor["href"]
        if target in collected:
            # Already recorded; keep only the first occurrence.
            continue
        if "pdf" in target:
            continue
        if not target.startswith("https://chomsky.info/"):
            # Off-site or relative link.
            continue
        collected.append(target)
    return collected

# Preferred strategy: links taken from the first <ul> of the entry content.
article_links = get_chomsky_article_links(soup)
print(f"EXACT, from main <ul>: {len(article_links)} article links found:")
for preview in article_links[:5]:
    print(preview)

# Too few hits from the <ul>: fall back to scanning every anchor on the page.
if len(article_links) < 10:
    print("Warning: <ul> found less than 10 articles; falling back to old method.")
    page_hrefs = (anchor["href"] for anchor in soup.find_all("a", href=True))
    # dict.fromkeys drops repeats while keeping first-seen order.
    article_links = list(
        dict.fromkeys(
            candidate
            for candidate in page_hrefs
            if candidate.startswith("https://chomsky.info/")
            and "articles/" not in candidate
            and "#" not in candidate
            and "pdf" not in candidate
        )
    )

print(f"Final: {len(article_links)} article links.")

# Download every selected page and report how much text was collected.
loader = WebBaseLoader(article_links)
docs = loader.load()
total_chars = sum(map(len, (page.page_content for page in docs)))
print(f"Total characters across all pages: {total_chars}")

Found 310 article links
EXACT, from main <ul>: 0 article links found:
Final: 310 article links.
Total characters across all pages: 7579283


In [None]:

# Split the loaded pages into chunks:
#   - chunk_overlap is the number of characters shared by consecutive chunks,
#     which helps preserve context across a split;
#   - add_start_index=True stores each chunk's start index in its metadata.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=200, add_start_index=True
)
all_splits = text_splitter.split_documents(docs)
print(f"Split into {len(all_splits)} chunks")

Split into 11609 chunks


In [6]:
# Add documents in batches to avoid exceeding Chroma's max batch size
batch_size = 5000
document_ids = []
total_batches = (len(all_splits) + batch_size - 1) // batch_size

for i in range(0, len(all_splits), batch_size):
    batch = all_splits[i:i + batch_size]
    # The collection is persisted on disk, so with auto-generated ids every
    # re-run of this cell would insert all chunks a second time. Build ids that
    # are stable across runs instead: source URL + start index, plus the
    # chunk's global position so ids stay unique even if two chunks happen to
    # report the same start index. Re-adding an existing id is expected to
    # overwrite the stored entry (upsert) rather than duplicate it.
    stable_ids = [
        f"{doc.metadata.get('source', 'unknown')}::{doc.metadata.get('start_index', -1)}::{i + offset}"
        for offset, doc in enumerate(batch)
    ]
    batch_ids = vector_store.add_documents(batch, ids=stable_ids)
    document_ids.extend(batch_ids)
    print(f"Added batch {i//batch_size + 1}/{total_batches} ({len(batch)} documents)")

print(f"\nTotal documents added: {len(document_ids)}")

Added batch 1/3 (5000 documents)
Added batch 2/3 (5000 documents)
Added batch 3/3 (1609 documents)

Total documents added: 11609


Construct the agent

In [None]:


@tool(response_format="content_and_artifact")
def retrieve_context(query: str):
    """Retrieve information to help answer a query."""
    # NOTE: the docstring above is left untouched on purpose; presumably the
    # @tool decorator exposes it to the model as the tool description.
    # Fetch the two stored chunks most similar to the query.
    hits = vector_store.similarity_search(query, k=2)

    # One text section per hit: its metadata followed by its content.
    sections = []
    for hit in hits:
        sections.append(f"Source: {hit.metadata}\nContent: {hit.page_content}")

    # Returns (text, documents), matching response_format="content_and_artifact".
    return "\n\n".join(sections), hits

# System prompt that scopes the assistant to the indexed articles.
prompt = "You are a helpful assistant that can answer questions about Noam Chomsky's articles."

# The retrieval tool is the only tool the agent is given.
tools = [retrieve_context]

agent = create_agent(model, tools, system_prompt=prompt)

Test the agent

In [1]:
# Requires the cells above to have run in this kernel so that `agent` exists.
query = "What did Chomsky say about Jubilee 2000?"
agent_input = {"messages": [{"role": "user", "content": query}]}

# Pretty-print the newest message of each streamed event.
for step in agent.stream(agent_input, stream_mode="values"):
    step["messages"][-1].pretty_print()

NameError: name 'agent' is not defined