In [1]:
from dotenv import load_dotenv
load_dotenv()
import os
import bs4
from typing import List
import requests
from bs4 import BeautifulSoup
from langchain.agents import create_agent
from langchain.tools import tool
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain.chat_models import init_chat_model
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma

  from .autonotebook import tqdm as notebook_tqdm
USER_AGENT environment variable not set, consider setting it to identify your requests.


In [2]:
# Chat-model settings come from the environment (the import cell calls
# load_dotenv()). os.getenv returns None for a variable that is not set.
model_name = os.getenv("LM_MODEL")
base_url = os.getenv("OPENAI_BASE_URL")

# Chat model reached through the "openai" provider at `base_url`.
# NOTE(review): "lm-studio" looks like a placeholder key for a local
# LM Studio server rather than a real credential — confirm.
model = init_chat_model(
    model_name,
    model_provider="openai",
    base_url=base_url,
    api_key="lm-studio"
    )

# Embedding model used by the vector store below.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

# Chroma collection that the later cells fill and query.
# NOTE(review): the directory name is spelled "chomksy" (not "chomsky"). It is
# left unchanged here because renaming it would point at a different store.
vector_store = Chroma(
    collection_name="example_collection",
    embedding_function=embeddings,
    persist_directory="./chomksy-db", 
)

Load, Split, Embed, Store

In [3]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from langchain_community.document_loaders import WebBaseLoader
import re
from datetime import datetime

ARTICLES_INDEX_URL = "https://chomsky.info/articles/"

# Seconds to wait for the index page. `requests` applies no timeout by
# default, so without this a stalled connection would hang the cell forever.
REQUEST_TIMEOUT_SECONDS = 30

# 1. Download and parse the article index page.
resp = requests.get(ARTICLES_INDEX_URL, timeout=REQUEST_TIMEOUT_SECONDS)
resp.raise_for_status()
soup = BeautifulSoup(resp.text, "html.parser")


# Match full dates like "May 12 2023" or "March 27, 2023"
date_regex = re.compile(
    r"\b("
    r"January|February|March|April|May|June|July|August|September|October|November|December"
    r")\s+\d{1,2},?\s+\d{4}\b"
)

# 2. Collect title, absolute URL and (when present) a date for every entry.
articles = []
for li in soup.select("#main_container > ul > li"):
    link = li.find("a")
    if not link:
        continue

    raw_href = (link.get("href") or "").strip()
    if not raw_href:
        # urljoin() with an empty path returns the index URL itself, which
        # would then be loaded and indexed as if it were an article.
        continue

    title = link.get_text(strip=True)
    href = urljoin(ARTICLES_INDEX_URL, raw_href)

    # Search the full text of the <li> for a date.
    li_text = li.get_text(" ", strip=True)
    date_match = date_regex.search(li_text)
    raw_date = date_match.group(0) if date_match else None

    # Normalize to ISO "YYYY-MM-DD" when the matched text is a valid date.
    iso_date = None
    if raw_date:
        cleaned = raw_date.replace(",", "")
        try:
            iso_date = datetime.strptime(cleaned, "%B %d %Y").date().isoformat()
        except ValueError:
            # The regex accepts impossible dates such as "February 30 2020";
            # in that case fall back to the raw text below.
            iso_date = None

    articles.append(
        {
            "title": title,
            "url": href,
            "raw_date": raw_date,
            "date": iso_date or raw_date,  # keep something useful in "date"
        }
    )

print(f"Found {len(articles)} articles")

# 3. Load full article pages
article_urls = [entry["url"] for entry in articles]
loader = WebBaseLoader(article_urls)
docs = loader.load()

# 4. Attach title and date (and any other metadata) to each Document
title_by_url = {entry["url"]: entry["title"] for entry in articles}
date_by_url = {entry["url"]: entry.get("date") for entry in articles}

for doc in docs:
    url = doc.metadata.get("source") or doc.metadata.get("url")
    if url in title_by_url:
        doc.metadata["title"] = title_by_url[url]
    # Only set "date" when the index actually provided one.
    if date_by_url.get(url):
        doc.metadata["date"] = date_by_url[url]
    # Tag the document type without overwriting a value that is already set.
    doc.metadata.setdefault("type", "article")

total_chars = sum(len(doc.page_content) for doc in docs)
print(f"Loaded {len(docs)} documents, total characters: {total_chars}")

Found 328 articles
Loaded 328 documents, total characters: 14497624


In [4]:

# Chunking parameters, measured in characters.
CHUNK_SIZE = 1000
# Characters shared by consecutive chunks so context survives the split.
CHUNK_OVERLAP = 200

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    # Record each chunk's start index in its metadata.
    add_start_index=True,
)
all_splits = text_splitter.split_documents(docs)
print(f"Split into {len(all_splits)} chunks")

Split into 21595 chunks


In [None]:
import json
from pathlib import Path

output_path = Path("chomsky_chunks.jsonl")


def _chunk_to_record(chunk_id, chunk):
    """Flatten one split document into the dict written as a JSONL line."""
    meta = chunk.metadata
    return {
        "id": chunk_id,
        "url": meta.get("source") or meta.get("url"),
        "article_title": meta.get("title"),
        "article_date": meta.get("date"),
        "article_source": meta.get("source"),
        # Holds the value stored under "start_index" in the chunk metadata.
        "chunk_index": meta.get("start_index"),
        "content": chunk.page_content,
        "metadata": meta,
    }


# One JSON object per line; non-ASCII characters are written unescaped.
with output_path.open("w", encoding="utf-8") as jsonl_file:
    jsonl_file.writelines(
        json.dumps(_chunk_to_record(chunk_id, chunk), ensure_ascii=False) + "\n"
        for chunk_id, chunk in enumerate(all_splits)
    )

PosixPath('chomsky_chunks.jsonl')

In [6]:
# Add documents in batches to avoid exceeding Chroma's max batch size
batch_size = 5000
document_ids = []
total_batches = (len(all_splits) + batch_size - 1) // batch_size

for batch_number, start in enumerate(range(0, len(all_splits), batch_size), start=1):
    batch = all_splits[start:start + batch_size]

    # Give every chunk a stable id derived from its position in `all_splits`.
    # Without explicit ids each run gets fresh random ids, so re-running this
    # cell against the persisted collection stores the whole corpus again.
    # NOTE(review): this relies on the store overwriting entries whose id
    # already exists — confirm for the installed langchain-chroma version.
    # Entries written by earlier runs under random ids are not removed.
    batch_ids = [f"chunk-{position}" for position in range(start, start + len(batch))]

    added_ids = vector_store.add_documents(documents=batch, ids=batch_ids)
    document_ids.extend(added_ids)
    print(f"Added batch {batch_number}/{total_batches} ({len(batch)} documents)")

print(f"\nTotal documents added: {len(document_ids)}")

Added batch 1/5 (5000 documents)
Added batch 2/5 (5000 documents)
Added batch 3/5 (5000 documents)
Added batch 4/5 (5000 documents)
Added batch 5/5 (1595 documents)

Total documents added: 21595


Test the RAG pipeline

In [16]:
# Build a retriever on top of the existing Chroma vector store
# Use a higher k so we can re-rank / filter for more specific matches (e.g. "Gulf War").
retriever = vector_store.as_retriever(search_kwargs={"k": 20})

# Prompt template with two placeholders, {context} and {question}, which
# ask_chomsky() fills in with str.format(). Despite the variable name, the
# formatted text is passed to model.invoke() as one plain string.
system_prompt = """You are a helpful assistant that answers questions about Noam Chomsky's
articles from `chomsky.info`.

You are given context snippets extracted from his articles. Base your answer
ONLY on this context. If the answer is not clearly supported by the context,
say you don't know and avoid guessing.

CRITICAL:
- Do NOT invent article titles, dates, or quotes.
- Only mention an article title or date if it appears explicitly in the provided context
  or if I (the system) separately list it for you.
- If the title or date is unclear, say that it is not specified.

When relevant, you may summarize the arguments in the context, but do not attribute
them to articles that are not explicitly named.

Context:
{context}

Question:
{question}
"""


def _build_context(docs, max_chars: int = 6000) -> str:
    """Concatenate document contents into a single context string.

    This avoids depending on newer LangChain chain helpers that may not be
    available in your installed version.
    """
    parts = []
    total = 0
    for doc in docs:
        text = doc.page_content.strip()
        if not text:
            continue
        if total + len(text) > max_chars:
            remaining = max_chars - total
            if remaining <= 0:
                break
            text = text[:remaining]
        parts.append(text)
        total += len(text)
        if total >= max_chars:
            break
    return "\n\n---\n\n".join(parts)


def _doc_metadata(doc) -> dict:
    """Return a document's metadata, or an empty dict when it has none."""
    return getattr(doc, "metadata", None) or {}


def ask_chomsky(question: str, show_sources: bool = True):
    """Answer a question about Chomsky's articles with a minimal RAG pipeline.

    The function calls the module-level `retriever`, optionally narrows the
    hits, builds a context string with `_build_context`, fills
    `system_prompt` and sends the result to the module-level `model`.

    Args:
        question: The user's question in natural language.
        show_sources: When true, also print one line per distinct source
            article (title, date and URL) among the retrieved documents.

    Returns:
        None. The answer and the optional source list are printed.
    """
    # 1. Retrieve relevant chunks.
    # Pick the retrieval method up front instead of wrapping the call in
    # `except AttributeError`: that would also swallow an AttributeError
    # raised inside the retriever and retry on the legacy method.
    invoke = getattr(retriever, "invoke", None)
    if callable(invoke):
        docs = invoke(question)
    else:
        docs = retriever.get_relevant_documents(question)

    # 1b. Heuristic re-filtering: if the user explicitly mentions "Gulf War",
    # prefer documents whose title or content also mention "Gulf War".
    if "gulf war" in question.lower():
        gulf_docs = []
        for d in docs:
            meta = _doc_metadata(d)
            title_text = str(meta.get("title") or meta.get("article_title") or "")
            blob = (title_text + "\n" + d.page_content).lower()
            if "gulf war" in blob:
                gulf_docs.append(d)
        # Keep the unfiltered hits when nothing matched the phrase.
        if gulf_docs:
            docs = gulf_docs

    # 2. Build the context string
    context = _build_context(docs)

    # 3. Build a single text prompt and ask the model
    final_prompt = system_prompt.format(context=context, question=question)
    result = model.invoke(final_prompt)

    print("\nAnswer:\n")
    # `result` is a chat message-like object; most versions expose `.content`
    print(getattr(result, "content", result))

    if show_sources:
        print("\nSources:\n")
        # Several chunks can come from one article; list each article once.
        seen = set()
        uniq_docs = []
        for d in docs:
            meta = _doc_metadata(d)
            # id(d) keeps documents without any URL distinct from each other.
            key = meta.get("source") or meta.get("url") or id(d)
            if key in seen:
                continue
            seen.add(key)
            uniq_docs.append(d)

        for i, doc in enumerate(uniq_docs, start=1):
            meta = _doc_metadata(doc)
            title = meta.get("title") or meta.get("article_title") or "(untitled)"
            date = meta.get("date") or meta.get("article_date") or "(no date)"
            url = meta.get("source") or meta.get("url") or "(no url)"
            print(f"[{i}] {title} — {date}\n    {url}")


# Example usage: this call runs whenever the cell is executed. It queries the
# retriever and the chat model, then prints the answer followed by the sources
# (show_sources defaults to True).
ask_chomsky("What does Chomsky say about the Gulf War?")



Answer:

Based on the context, Chomsky argues that the Gulf War was not a war in the classical sense, as it did not involve direct combat between two sides. Instead, he suggests that it was a covert operation to control Arab oil and crush Arab nationalism, with the primary goal of establishing the US as the world's policeman.

Chomsky criticizes the war as unjust and wrong, arguing that the US has no right to control oil prices or administer the future of the Middle East. He also implies that the war is part of a larger pattern of US imperialism, where the country sacrifices its own population to maintain power in other parts of the world.

Chomsky's critique of the Gulf War is not primarily focused on the high costs of the war, such as lost lives or economic expenses, but rather on its underlying motivations and consequences. He suggests that liberals who oppose the war for these reasons are missing the point, which is that the war itself is morally reprehensible.

Chomsky also notes