In [2]:
import getpass
import os
import bs4
from langchain_community.document_loaders import WebBaseLoader
from langchain.chat_models import init_chat_model
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma


# Switch LangSmith tracing on via its environment flag.
os.environ["LANGSMITH_TRACING"] = "true"
# Only ask for the key when none is present: re-running this cell (or running the
# notebook with the key already exported) must not block on input or overwrite a
# working key. The prompt text makes it clear which secret is being requested.
if not os.environ.get("LANGSMITH_API_KEY"):
    os.environ["LANGSMITH_API_KEY"] = getpass.getpass("LangSmith API key: ")

In [7]:
# Connection settings for the chat model are read from the environment.
model_name = os.getenv("LM_MODEL")
base_url = os.getenv("OPENAI_BASE_URL")  # may be None if the variable is unset

# Fail fast with an actionable message. Passing None as the model name does not
# necessarily fail here (init_chat_model may defer the error until the model is
# first used — TODO confirm against the installed LangChain version), which makes
# the real cause hard to spot.
if not model_name:
    raise RuntimeError(
        "LM_MODEL is not set; export it with the name of the chat model to use."
    )

# Chat model reached through the OpenAI-compatible provider.
# NOTE(review): "lm-studio" looks like a placeholder key for a local server rather
# than a real credential — confirm before pointing base_url at a hosted service.
model = init_chat_model(
    model_name,
    model_provider="openai",
    base_url=base_url,
    api_key="lm-studio",
)

# Sentence-transformers model used to embed text for the vector store.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

# Chroma collection persisted on disk next to the notebook.
# NOTE(review): "chomksy-db" looks like a typo for "chomsky-db"; it is left as-is
# because renaming it would point at a different (empty) on-disk store.
vector_store = Chroma(
    collection_name="example_collection",
    embedding_function=embeddings,
    persist_directory="./chomksy-db",
)

  from .autonotebook import tqdm as notebook_tqdm


Load, Split, Embed, Store

In [15]:
import requests
from bs4 import BeautifulSoup

# Fetch the article index page and parse it.
url = "https://chomsky.info/articles/"
# Bound the request so a stalled connection cannot hang the cell, and stop on an
# HTTP error instead of parsing an error page as if it were the index.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

# First attempt: links inside the main content area, skipping anchors, the
# article/book index pages and PDFs. dict.fromkeys drops duplicates while keeping
# the order in which links appear on the page (and avoids a quadratic list scan).
article_links = list(dict.fromkeys(
    href
    for href in (a.get("href", "") for a in soup.select(".entry-content a"))
    if href.startswith("https://chomsky.info/")
    and "/articles/" not in href
    and "/books/" not in href
    and "#" not in href
    and "pdf" not in href
))

# If the above yields too few, fall back to every same-site link on the page.
if len(article_links) < 10:
    article_links = list(dict.fromkeys(
        href
        for href in (a["href"] for a in soup.find_all("a", href=True))
        if href.startswith("https://chomsky.info/")
        and "articles/" not in href
        and "#" not in href
        and "pdf" not in href
    ))

print(f"Found {len(article_links)} article links")
# The pages are intentionally NOT loaded here: `loader` and `docs` are built at the
# end of this cell from the final link list, so loading at this point would only
# download every article a second time and then discard the result.
# The main article links are under <div class="entry-content"> in a <ul> (unordered list)
# Each article link is in an <a> tag (with possibly some in blue).
# Let's parse links from that specific list, and then load those links.
from typing import List

def get_chomsky_article_links(soup) -> List[str]:
    """Collect article URLs from the first <ul> inside the "entry-content" div.

    Args:
        soup: Parsed page exposing ``find``; the nodes it returns must expose
            ``find`` / ``find_all`` (e.g. a BeautifulSoup document).

    Returns:
        The ``href`` values that start with "https://chomsky.info/" and do not
        contain "pdf", without duplicates, in the order they appear. An empty
        list when the div or the list is missing.
    """
    container = soup.find("div", class_="entry-content")
    if not container:
        return []

    first_list = container.find("ul")
    if not first_list:
        return []

    hrefs = (anchor["href"] for anchor in first_list.find_all("a", href=True))
    # dict keys keep first-seen order, which gives an ordered de-duplication.
    unique = dict.fromkeys(
        target
        for target in hrefs
        if target.startswith("https://chomsky.info/") and "pdf" not in target
    )
    return list(unique)

# Preferred strategy: links taken from the first <ul> of the entry-content div.
article_links = get_chomsky_article_links(soup)
print(f"EXACT, from main <ul>: {len(article_links)} article links found:")
for sample in article_links[:5]:
    print(sample)

# When the list-based strategy finds fewer than 10 links, fall back to scanning
# every anchor on the page, as the earlier part of this cell does.
if len(article_links) < 10:
    print("Warning: <ul> found less than 10 articles; falling back to old method.")
    page_hrefs = (anchor["href"] for anchor in soup.find_all("a", href=True))
    # dict keys keep first-seen order, which gives an ordered de-duplication.
    article_links = list(dict.fromkeys(
        target
        for target in page_hrefs
        if target.startswith("https://chomsky.info/")
        and not any(marker in target for marker in ("articles/", "#", "pdf"))
    ))

print(f"Final: {len(article_links)} article links.")
# Download every selected page; `docs` is what the following cells work on.
loader = WebBaseLoader(article_links)
docs = loader.load()

Found 310 article links
EXACT, from main <ul>: 0 article links found:
Final: 310 article links.


In [20]:
# Add up the length of the text content of every loaded page.
total_chars = sum(map(lambda page: len(page.page_content), docs))
print(f"Total characters across all pages: {total_chars}")

Total characters across all pages: 7579283


In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Splitter settings:
#   chunk_size      - chunks of up to 1000 characters.
#   chunk_overlap   - 200 characters shared between consecutive chunks, so context
#                     is preserved across a split.
#   add_start_index - record each chunk's start index in its metadata.
text_splitter = RecursiveCharacterTextSplitter(
    add_start_index=True,
    chunk_overlap=200,
    chunk_size=1000,
)

# Split every loaded page and report how many chunks came out.
all_splits = text_splitter.split_documents(docs)
print(f"Split into {len(all_splits)} chunks")



Split into 11609 chunks
