In [None]:
from dotenv import load_dotenv
load_dotenv()
import os
import bs4
from typing import List
import requests
from bs4 import BeautifulSoup
from langchain.agents import create_agent
from langchain.tools import tool
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain.chat_models import init_chat_model
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma

  from .autonotebook import tqdm as notebook_tqdm
USER_AGENT environment variable not set, consider setting it to identify your requests.


In [3]:
# --- Chat model -------------------------------------------------------------
# os.getenv returns None when a variable is unset; fail here with a clear
# message instead of handing None to init_chat_model.
model_name = os.getenv("LM_MODEL")
if not model_name:
    raise RuntimeError(
        "LM_MODEL is not set. Define it in the environment or in the .env file "
        "loaded by load_dotenv() before running this notebook."
    )

# May be None when OPENAI_BASE_URL is unset; it is forwarded unchanged.
base_url = os.getenv("OPENAI_BASE_URL")

model = init_chat_model(
    model_name,
    model_provider="openai",
    base_url=base_url,
    # Placeholder key, not a secret. NOTE(review): presumably the local
    # LM Studio server does not validate it -- confirm for other backends.
    api_key="lm-studio"
    )

# --- Embeddings and persistent vector store ----------------------------------
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

vector_store = Chroma(
    collection_name="example_collection",
    embedding_function=embeddings,
    # NOTE(review): the directory name is spelled "chomksy"; it is kept as-is
    # so that a store already persisted under this path is still found.
    persist_directory="./chomksy-db",
)

Load, Split, Embed, Store

In [4]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from langchain_community.document_loaders import WebBaseLoader
import re
from datetime import datetime

ARTICLES_INDEX_URL = "https://chomsky.info/articles/"
# requests applies no timeout by default, so a stalled server would block the
# kernel forever; bound the wait explicitly.
REQUEST_TIMEOUT_SECONDS = 30

# 1. Fetch the article index page
resp = requests.get(ARTICLES_INDEX_URL, timeout=REQUEST_TIMEOUT_SECONDS)
resp.raise_for_status()
soup = BeautifulSoup(resp.text, "html.parser")


# Match full dates like "May 12 2023" or "March 27, 2023"
date_regex = re.compile(
    r"\b("
    r"January|February|March|April|May|June|July|August|September|October|November|December"
    r")\s+\d{1,2},?\s+\d{4}\b"
)

# 2. Collect title, absolute URL and publication date for every listed article
articles = []
for li in soup.select("#main_container > ul > li"):
    a = li.find("a")
    if not a:
        continue

    raw_href = (a.get("href") or "").strip()
    if not raw_href:
        # urljoin(base, "") returns the base URL, which would queue the index
        # page itself as if it were an article.
        continue

    title = a.get_text(strip=True)
    href = urljoin(ARTICLES_INDEX_URL, raw_href)

    # Full text of the <li>
    li_text = li.get_text(" ", strip=True)

    # Try to find the full date in the text
    m = date_regex.search(li_text)
    raw_date = m.group(0) if m else None

    # Normalize to ISO "YYYY-MM-DD" when the matched text is a valid calendar date
    iso_date = None
    if raw_date:
        cleaned = raw_date.replace(",", "")
        try:
            iso_date = datetime.strptime(cleaned, "%B %d %Y").date().isoformat()
        except ValueError:
            # Deliberate best-effort: e.g. "February 30 2001" matches the regex
            # but is not a real date; the raw text is kept in "date" below.
            pass

    articles.append(
        {
            "title": title,
            "url": href,
            "raw_date": raw_date,
            "date": iso_date or raw_date,  # keep something useful in "date"
        }
    )

print(f"Found {len(articles)} articles")

# 3. Load full article pages
article_urls = [a["url"] for a in articles]
loader = WebBaseLoader(article_urls)
docs = loader.load()

# 4. Attach title and date (and any other metadata) to each Document
title_by_url = {a["url"]: a["title"] for a in articles}
date_by_url = {a["url"]: a.get("date") for a in articles}

for doc in docs:
    url = doc.metadata.get("source") or doc.metadata.get("url")
    if url in title_by_url:
        doc.metadata["title"] = title_by_url[url]
    # Only set "date" when one was actually found for this URL
    if url in date_by_url and date_by_url[url]:
        doc.metadata["date"] = date_by_url[url]
    # Tag the document type without overwriting a value that is already present
    doc.metadata.setdefault("type", "article")

total_chars = sum(len(doc.page_content) for doc in docs)
print(f"Loaded {len(docs)} documents, total characters: {total_chars}")

Found 328 articles
Loaded 328 documents, total characters: 14497620


In [5]:

# Cut every loaded article into overlapping character windows.
#   chunk_size      - upper bound on the length of one chunk
#   chunk_overlap   - characters shared by consecutive chunks, so that context
#                     is not lost at a split boundary
#   add_start_index - record each chunk's start offset in its metadata
#                     (read back later as metadata["start_index"])
text_splitter = RecursiveCharacterTextSplitter(
    add_start_index=True,
    chunk_overlap=200,
    chunk_size=1000,
)
all_splits = text_splitter.split_documents(docs)
print(f"Split into {len(all_splits)} chunks")

Split into 21593 chunks


In [6]:
import json
from pathlib import Path

output_path = Path("chomsky_chunks.jsonl")

# Dump every chunk as one JSON object per line (JSON Lines).
# "id" is the chunk's position in all_splits; "chunk_index" holds the
# splitter's start_index metadata value, not a running chunk number.
with output_path.open("w", encoding="utf-8") as jsonl_file:
    for chunk_id, chunk in enumerate(all_splits):
        meta = chunk.metadata
        source_url = meta.get("source")
        line = json.dumps(
            {
                "id": chunk_id,
                "url": source_url or meta.get("url"),
                "article_title": meta.get("title"),
                "article_date": meta.get("date"),
                "article_source": source_url,
                "chunk_index": meta.get("start_index"),
                "content": chunk.page_content,
                "metadata": meta,
            },
            ensure_ascii=False,  # keep non-ASCII text readable in the file
        )
        jsonl_file.write(line + "\n")

output_path

PosixPath('chomsky_chunks.jsonl')

In [7]:
# Add documents in batches to avoid exceeding Chroma's max batch size
batch_size = 5000
total_batches = (len(all_splits) + batch_size - 1) // batch_size
document_ids = []

for i in range(0, len(all_splits), batch_size):
    batch = all_splits[i:i + batch_size]
    # Use the chunk's position in all_splits as a stable id (the same value the
    # JSONL export writes as "id"). Without explicit ids a fresh id is generated
    # per call, so re-running this cell against the persistent store would add
    # another full copy of every chunk.
    # NOTE(review): relies on the store overwriting existing ids (upsert) --
    # confirm for the installed langchain-chroma version. Entries written by
    # earlier runs under generated ids are not removed by this.
    batch_ids = vector_store.add_documents(
        batch,
        ids=[str(position) for position in range(i, i + len(batch))],
    )
    document_ids.extend(batch_ids)
    print(f"Added batch {i//batch_size + 1}/{total_batches} ({len(batch)} documents)")

print(f"\nTotal documents added: {len(document_ids)}")

Added batch 1/5 (5000 documents)
Added batch 2/5 (5000 documents)
Added batch 3/5 (5000 documents)
Added batch 4/5 (5000 documents)
Added batch 5/5 (1593 documents)

Total documents added: 21593


Construct the agent

Test the agent

In [23]:
from textwrap import dedent


def get_chomsky_context(query: str, k: int = 50):
    """Run a similarity search and render the hits as one context string.

    Args:
        query: Text passed to ``vector_store.similarity_search``.
        k: Number of hits requested from the vector store.

    Returns:
        A ``(context, docs)`` tuple: ``context`` joins one
        ``[DOC n] title (date):`` section per hit with blank lines, and
        ``docs`` is the list returned by the similarity search.
    """
    hits = vector_store.similarity_search(query, k=k)

    sections = []
    for position, hit in enumerate(hits, start=1):
        header = f"[DOC {position}] {hit.metadata.get('title')} ({hit.metadata.get('date')}):"
        sections.append(f"{header}\n{hit.page_content}")

    return "\n\n".join(sections), hits


def answer_with_rag(
    question: str,
    search_query: str | None = None,
    k: int = 4,
    use_exact: bool = False,
):
    """Retrieval-augmented generation using the Chroma vector store.

    If use_exact=True and search_query is given, pull chunks whose text
    contains search_query directly from Chroma via where_document.
    Otherwise, fall back to similarity_search.
    """
    if use_exact and search_query:
        res = vector_store.get(where_document={"$contains": search_query})
        texts = res.get("documents", [])
        metas = res.get("metadatas", [])
        docs = list(zip(texts, metas))
        context = "\n\n".join(
            f"[DOC {i}] {meta.get('title')} ({meta.get('date')}):\n{text}"
            for i, (text, meta) in enumerate(docs, start=1)
        )
    else:
        effective_query = search_query or question
        docs = vector_store.similarity_search(effective_query, k=k)
        context = "\n\n".join(
            f"[DOC {i}] {doc.metadata.get('title')} ({doc.metadata.get('date')}):\n{doc.page_content}"
            for i, doc in enumerate(docs, start=1)
        )

    prompt = dedent(
        f"""
        You are an assistant that MUST answer ONLY using the context below.
        If the answer is not clearly contained in the context, say exactly:
        "I don't know based on the provided context."

        Do NOT use outside knowledge. Do NOT make up titles, quotes, or sources.

        Context:
        {context}

        Question: {question}

        Answer (using ONLY the context above):
        """
    )

    response = model.invoke(prompt)
    return response, docs


def list_articles_mentioning_in_chroma(term: str):
    """Return the unique articles that have at least one stored chunk containing ``term``.

    Chunks are matched with Chroma's ``where_document`` ``$contains`` filter
    and grouped by their ``source`` metadata (falling back to ``url``, then to
    the chunk id when neither is present).

    Args:
        term: Substring to look for in the stored chunk text.

    Returns:
        A list of dicts with keys ``url``, ``title``, ``date`` and
        ``chunk_hits`` (number of matching chunks for that article), in order
        of first appearance in the Chroma result.
    """
    res = vector_store.get(where_document={"$contains": term})

    metas = res.get("metadatas") or []
    ids = res.get("ids") or []

    articles = {}
    for meta, _id in zip(metas, ids):
        # A chunk stored without metadata can come back as None.
        meta = meta or {}
        source = meta.get("source") or meta.get("url") or _id
        if source not in articles:
            # Title and date are taken from the first matching chunk of the article.
            articles[source] = {
                "url": source,
                "title": meta.get("title"),
                "date": meta.get("date"),
                "chunk_hits": 0,
            }
        articles[source]["chunk_hits"] += 1

    return list(articles.values())


# Demo: exact-substring retrieval. Every stored chunk containing "Jubilee 2000"
# is placed in the prompt, then the model answers from that context only.
# NOTE: this rebinds `resp` and `docs`, which earlier cells use for the index
# HTTP response and the loaded articles; re-running the split cell after this
# one would operate on these retrieval results instead.
resp, docs = answer_with_rag(
    question="What did Chomsky say about Jubilee 2000?",
    search_query="Jubilee 2000",
    use_exact=True,
)
print(resp.content)

Chomsky said that the Jubilee 2000 call for debt cancellation is "welcome and merits support, but is open to some qualifications." He also mentioned that the debt does not go away, and that someone pays, with the historical record showing that risks are socialized in the system of "free enterprise capitalism".
