In [None]:
# | default_exp _components.embeddings

In [None]:
# | export

from typing import *
from urllib.request import Request, urlopen
from urllib.parse import urlparse, urljoin
from urllib.error import HTTPError
from pathlib import Path
import shutil

from bs4 import BeautifulSoup
from langchain.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema.document import Document
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
import typer

from fastkafka_gen._components.logger import get_logger
from fastkafka_gen._components.package_data import get_root_data_path

In [None]:
from tempfile import TemporaryDirectory

from typer.testing import CliRunner

from fastkafka_gen._components.logger import suppress_timestamps

In [None]:
# | export

# Module-level logger; the crawler defined below uses it to warn about pages it could not fetch.
logger = get_logger(__name__)

In [None]:
# Test-only setup (this cell is not exported).
# NOTE(review): suppress_timestamps() presumably strips timestamps so the recorded output is stable — confirm.
suppress_timestamps()
# Rebind the logger at level 20, which is logging.INFO, so the message below is emitted.
logger = get_logger(__name__, level=20)
logger.info("ok")

[INFO] __main__: ok


In [None]:
# | export

def _get_all_links_from_website(start_url: str, visited: Optional[set] = None) -> Set[str]:
    """Crawl a website and collect the URLs of all same-host pages that could be fetched.

    The crawl starts at ``start_url`` and follows every ``<a href=...>`` whose
    hostname equals the hostname of the page it was found on. Fragments
    (``#...``) and leading/trailing slashes are stripped from discovered links
    before they are compared or stored.

    The traversal is iterative (explicit stack) rather than recursive, so the
    number of pages is not limited by Python's recursion depth. A URL that
    answers with an HTTP error is logged once, remembered, and not requested
    again during this call.

    Args:
        start_url: The starting URL of the website. It is always fetched, even
            if it is already present in ``visited``.
        visited: Optional. A set of URLs that have already been visited; these
            are not fetched again. The set is updated in place. Defaults to an
            empty set.

    Returns:
        The ``visited`` set, extended with every page that was fetched
        successfully. Pages that returned an HTTP error are not included.

    Raises:
        Any non-``HTTPError`` exception raised while fetching or parsing a page
        (for example ``urllib.error.URLError`` on connection problems)
        propagates to the caller.
    """
    if visited is None:
        visited = set()

    # URLs that returned an HTTP error; kept separate so they are neither
    # reported as visited nor requested again from every page linking to them.
    failed: Set[str] = set()
    pending = [start_url]
    is_start = True

    while pending:
        current = pending.pop()
        # The same link can be queued by several pages; skip it if it has been
        # handled in the meantime. The start URL itself is never skipped.
        if not is_start and (current in visited or current in failed):
            continue
        is_start = False

        try:
            req = Request(current)
            # The response is closed as soon as the markup has been parsed.
            # nosemgrep: python.lang.security.audit.dynamic-urllib-use-detected.dynamic-urllib-use-detected
            with urlopen(req) as html_page:  # nosec B310
                soup = BeautifulSoup(html_page, "lxml")
        except HTTPError as e:
            logger.warning(f'Unable to parse: {e.url}')
            failed.add(current)
            continue

        current_parts = urlparse(current)
        # Links are resolved against the site root (scheme + host) of the current page.
        base_url = current_parts.scheme + '://' + current_parts.hostname  # type: ignore

        links = set()
        for anchor in soup.find_all('a', href=True):
            url = urljoin(base_url, anchor['href']).split("#")[0].strip("/")
            # Stay on the same host; external links are ignored.
            if urlparse(url).hostname == current_parts.hostname:
                links.add(url)

        visited.add(current)
        pending.extend(
            link for link in links if link not in visited and link not in failed
        )

    return visited

In [None]:
# Integration check: this crawls the live documentation site, so the cell needs network access.
all_links = _get_all_links_from_website("https://fastkafka.airt.ai")
print(f"{len(all_links)=}\n\n")
display(all_links)

# The crawl must find something, including these two pages of the unversioned docs.
assert len(all_links) > 0
assert 'https://fastkafka.airt.ai/docs/CHANGELOG' in all_links
assert 'https://fastkafka.airt.ai/docs' in all_links

len(all_links)=183




{'https://fastkafka.airt.ai',
 'https://fastkafka.airt.ai/docs',
 'https://fastkafka.airt.ai/docs/0.5.0',
 'https://fastkafka.airt.ai/docs/0.5.0/CHANGELOG',
 'https://fastkafka.airt.ai/docs/0.5.0/api/fastkafka',
 'https://fastkafka.airt.ai/docs/0.5.0/api/fastkafka/KafkaEvent',
 'https://fastkafka.airt.ai/docs/0.5.0/api/fastkafka/encoder/avsc_to_pydantic',
 'https://fastkafka.airt.ai/docs/0.5.0/api/fastkafka/testing/ApacheKafkaBroker',
 'https://fastkafka.airt.ai/docs/0.5.0/api/fastkafka/testing/LocalRedpandaBroker',
 'https://fastkafka.airt.ai/docs/0.5.0/api/fastkafka/testing/Tester',
 'https://fastkafka.airt.ai/docs/0.5.0/cli/fastkafka',
 'https://fastkafka.airt.ai/docs/0.5.0/cli/run_fastkafka_server_process',
 'https://fastkafka.airt.ai/docs/0.5.0/guides/Guide_04_Github_Actions_Workflow',
 'https://fastkafka.airt.ai/docs/0.5.0/guides/Guide_05_Lifespan_Handler',
 'https://fastkafka.airt.ai/docs/0.5.0/guides/Guide_06_Benchmarking_FastKafka',
 'https://fastkafka.airt.ai/docs/0.5.0/guide

In [None]:

# | export


def _extract_latest_doc_urls(start_url: str, urls: List[str]) -> List[str]:
    """Extract latest documentation URLs from a list of URLs.

    Args:
        start_url: The URL of the documentation homepage.
        urls: A list of documentation URLs to be filtered.

    Returns:
        A new list containing only the latest version of the documentation URLs.
    """
    ret_val = []
    for url in urls:
        parts = url.split(f"{start_url}/docs/")
        if len(parts) == 1:
            ret_val.append(url)
        else:
            identifier = parts[1].split("/")[0]
            if identifier != "next" and not identifier.replace(".", "").isdigit():
                ret_val.append(url)
    ret_val = [url for url in ret_val if "/guides/" in url or url == "https://fastkafka.airt.ai/docs"]
    return ret_val

In [None]:
fixture_all_links = ['https://fastkafka.airt.ai/docs/next/api/fastkafka/KafkaEvent', 'https://fastkafka.airt.ai/docs/guides/Guide_11_Consumes_Basics', 'https://fastkafka.airt.ai/docs/next/api/fastkafka/testing/Tester', 'https://fastkafka.airt.ai/docs', 'https://fastkafka.airt.ai/docs/next/api/fastkafka/testing/ApacheKafkaBroker', 'https://fastkafka.airt.ai/docs/next/guides/Guide_05_Lifespan_Handler', 'https://fastkafka.airt.ai/docs/next/guides/Guide_11_Consumes_Basics', 'https://fastkafka.airt.ai/docs/next/guides/Guide_21_Produces_Basics', 'https://fastkafka.airt.ai/docs/CHANGELOG', 'https://fastkafka.airt.ai/docs/next/guides/Guide_22_Partition_Keys', 'https://fastkafka.airt.ai/docs/next/guides/Guide_07_Encoding_and_Decoding_Messages_with_FastKafka', 'https://fastkafka.airt.ai/docs/next/api/fastkafka/testing/LocalRedpandaBroker', 'https://fastkafka.airt.ai/docs/next/CONTRIBUTING', 'https://fastkafka.airt.ai/docs/api/fastkafka/KafkaEvent', 'https://fastkafka.airt.ai/docs/guides/Guide_07_Encoding_and_Decoding_Messages_with_FastKafka', 'https://fastkafka.airt.ai', 'https://fastkafka.airt.ai/docs/guides/Guide_30_Using_docker_to_deploy_fastkafka', 'https://fastkafka.airt.ai/docs/next/api/fastkafka/encoder/avsc_to_pydantic', 'https://fastkafka.airt.ai/docs/guides/Guide_05_Lifespan_Handler', 'https://fastkafka.airt.ai/docs/next/cli/fastkafka', 'https://fastkafka.airt.ai/docs/api/fastkafka', 'https://fastkafka.airt.ai/docs/api/fastkafka/testing/ApacheKafkaBroker', 'https://fastkafka.airt.ai/docs/api/fastkafka/encoder/avsc_to_pydantic', 'https://fastkafka.airt.ai/docs/next/guides/Guide_04_Github_Actions_Workflow', 'https://fastkafka.airt.ai/docs/guides/Guide_06_Benchmarking_FastKafka', 'https://fastkafka.airt.ai/docs/next/guides/Guide_06_Benchmarking_FastKafka', 'https://fastkafka.airt.ai/docs/next/CHANGELOG', 'https://fastkafka.airt.ai/docs/guides/Guide_21_Produces_Basics', 'https://fastkafka.airt.ai/docs/next/guides/Guide_30_Using_docker_to_deploy_fastkafka', 
'https://fastkafka.airt.ai/docs/next/guides/Guide_31_Using_redpanda_to_test_fastkafka', 'https://fastkafka.airt.ai/docs/guides/Guide_22_Partition_Keys', 'https://fastkafka.airt.ai/docs/api/fastkafka/testing/LocalRedpandaBroker', 'https://fastkafka.airt.ai/docs/next/api/fastkafka', 'https://fastkafka.airt.ai/docs/guides/Guide_31_Using_redpanda_to_test_fastkafka', 'https://fastkafka.airt.ai/docs/cli/run_fastkafka_server_process', 'https://fastkafka.airt.ai/docs/guides/Guide_04_Github_Actions_Workflow', 'https://fastkafka.airt.ai/docs/api/fastkafka/testing/Tester', 'https://fastkafka.airt.ai/docs/next', 'https://fastkafka.airt.ai/docs/cli/fastkafka', 'https://fastkafka.airt.ai/docs/next/cli/run_fastkafka_server_process', 'https://fastkafka.airt.ai/docs/next/LICENSE']
# Run the filter over the recorded link fixture defined just above.
start_url = "https://fastkafka.airt.ai"
actual = _extract_latest_doc_urls(start_url, fixture_all_links)

display(actual)
# Only the docs landing page and the unversioned guide pages should survive the filter.
assert len(actual) == 10, len(actual)
assert 'https://fastkafka.airt.ai/docs/next' not in actual

['https://fastkafka.airt.ai/docs/guides/Guide_11_Consumes_Basics',
 'https://fastkafka.airt.ai/docs',
 'https://fastkafka.airt.ai/docs/guides/Guide_07_Encoding_and_Decoding_Messages_with_FastKafka',
 'https://fastkafka.airt.ai/docs/guides/Guide_30_Using_docker_to_deploy_fastkafka',
 'https://fastkafka.airt.ai/docs/guides/Guide_05_Lifespan_Handler',
 'https://fastkafka.airt.ai/docs/guides/Guide_06_Benchmarking_FastKafka',
 'https://fastkafka.airt.ai/docs/guides/Guide_21_Produces_Basics',
 'https://fastkafka.airt.ai/docs/guides/Guide_22_Partition_Keys',
 'https://fastkafka.airt.ai/docs/guides/Guide_31_Using_redpanda_to_test_fastkafka',
 'https://fastkafka.airt.ai/docs/guides/Guide_04_Github_Actions_Workflow']

In [None]:
# Small fixture mixing the landing page, one guide, and CHANGELOG pages of
# the unversioned, "next" and numbered documentation.
fixture = [
    "https://fastkafka.airt.ai/docs",
    'https://fastkafka.airt.ai/docs/guides/Guide_04_Github_Actions_Workflow',
    "https://fastkafka.airt.ai/docs/CHANGELOG",
    "https://fastkafka.airt.ai/docs/next/CHANGELOG",
    "https://fastkafka.airt.ai/docs/0.6.0/CHANGELOG",
    "https://fastkafka.airt.ai/docs/0.5.0/CHANGELOG",
]
start_url = "https://fastkafka.airt.ai"
# Every CHANGELOG entry is expected to be dropped; input order is preserved.
expected = ["https://fastkafka.airt.ai/docs", 'https://fastkafka.airt.ai/docs/guides/Guide_04_Github_Actions_Workflow']
actual = _extract_latest_doc_urls(start_url, fixture)
print(actual)

assert actual == expected

['https://fastkafka.airt.ai/docs', 'https://fastkafka.airt.ai/docs/guides/Guide_04_Github_Actions_Workflow']


In [None]:

# | export

def _create_documents(urls: List[str]) -> List[Document]:
    """Load the given web pages and return them as document objects.

    Args:
        urls: The URLs of the pages to scrape.

    Returns:
        The documents produced by ``WebBaseLoader`` for the given URLs.
    """
    return WebBaseLoader(urls).load()

In [None]:
# `all_links` is the set produced by the live crawl in the earlier test cell.
start_url = "https://fastkafka.airt.ai"
urls =  _extract_latest_doc_urls(start_url, all_links)

print(len(urls))
# 14 is the count recorded in this cell's saved output; it is tied to the current content of the live site.
assert len(urls) == 14

# Downloads every filtered page (network access needed); one document is expected per URL.
docs = _create_documents(urls)
assert len(docs) == len(urls)
print(docs[0].page_content[:200])

14
[INFO] langchain.document_loaders.web_base: fake_useragent not found, using default user agent.To get a realistic header for requests, `pip install fake_useragent`.





@consumes basics | FastKafka









Skip to main contentairtFastKafka0.8.0dev 🚧0.8.00.7.10.7.00.6.00.5.0DocsSearchFastKafkaGuidesWriting services@consumes basicsBatch consuming@produces basicsD


In [None]:

# | export

def _split_document_into_chunks(
    documents: List[Document],
    chunk_size: int = 1500,
    chunk_overlap: int = 150,
    separators: List[str] = ["\n\n", "\n", r"(?<=\. )", " ", ""],
) -> List[Document]:
    r"""Split the list of documents into chunks

    Args:
        documents: List of documents to be split into chunks.
        chunk_size: The maximum size of each chunk in characters. Defaults to 1500.
        chunk_overlap: The overlap between consecutive chunks in characters. Defaults to 150.
        separators: List of separator patterns used for chunking, tried in the
            given order. Defaults to ["\n\n", "\n", "(?<=\. )", " ", ""].

    Returns:
        A list of documents where each document represents a chunk.
    """
    # NOTE(review): the third default separator is written as a regex look-behind;
    # whether the splitter treats separators as regexes depends on the installed
    # langchain version — confirm.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        # Hand over a copy so the shared default list can never be modified through the splitter.
        separators=list(separators),
    )
    return text_splitter.split_documents(documents)

In [None]:
# `docs` and `urls` come from the document-loading test cell above.
doc_chunks = _split_document_into_chunks(docs)
print(f"{len(doc_chunks)} is greater than {len(urls)}")
# print(doc_chunks)
# Splitting with the default chunk size is expected to yield more chunks than source pages.
assert len(doc_chunks) > len(urls)

187 is greater than 14


In [None]:

# | export

def _save_embeddings_db(doc_chunks: List[Document], db_path: str) -> None:
    """Build a FAISS index from the chunks using OpenAI embeddings and write it to disk.

    Args:
        doc_chunks: The chunked documents to embed.
        db_path: Directory path where the FAISS db is saved.
    """
    FAISS.from_documents(doc_chunks, OpenAIEmbeddings()).save_local(db_path)  # type: ignore

In [None]:
# Write the index into a throw-away directory and check that the FAISS file was created.
# NOTE(review): _save_embeddings_db uses OpenAIEmbeddings, which presumably needs OpenAI
# credentials and network access — confirm before running.
with TemporaryDirectory() as d:
    db_path = f"{d}/faiss_index"
    _save_embeddings_db(doc_chunks, db_path)
    
    !ls -la {d}
    assert (Path(d) / "faiss_index" / "index.faiss").exists()

[INFO] faiss.loader: Loading faiss with AVX2 support.
[INFO] faiss.loader: Successfully loaded faiss with AVX2 support.
total 12
drwx------ 3 harish harish 4096 Aug 30 07:58 .
drwxrwxrwt 1 root   root   4096 Aug 30 07:58 ..
drwxrwxr-x 2 harish harish 4096 Aug 30 07:58 faiss_index


In [None]:
# | export

def _delete_directory(directory_path: Path) -> None:
    """Delete a directory and its contents if it exists.

    Args:
        directory_path: The path to the directory to be deleted.
    """
    if directory_path.exists():
        try:
            shutil.rmtree(directory_path)
        except Exception as e:
            print(f"Error deleting directory: {e}")

In [None]:
# Exercise both paths of _delete_directory: an existing directory and a missing one.
with TemporaryDirectory() as d:
    some_dir = Path(f"{d}/some_dir")
    some_dir.mkdir()
    assert some_dir.exists()
    !ls -la {d}

    _delete_directory(some_dir)
    assert not some_dir.exists()
    
    # Deleting a path that does not exist must not raise.
    non_existing_dir = Path(f"{d}/non_existing_dir")
    _delete_directory(non_existing_dir)
    !ls -la {d}

total 12
drwx------ 3 harish harish 4096 Aug 30 07:58 .
drwxrwxrwt 1 root   root   4096 Aug 30 07:58 ..
drwxrwxr-x 2 harish harish 4096 Aug 30 07:58 some_dir
total 8
drwx------ 2 harish harish 4096 Aug 30 07:58 .
drwxrwxrwt 1 root   root   4096 Aug 30 07:58 ..


In [None]:
# | export

# Typer application object; the `generate` command defined below is registered on it.
app = typer.Typer(
    short_help="Scrape FastKafka documentation, create embeddings from extracted content, and save them in a vector database.",
)

In [None]:

# | export

# Default location of the vector database and default site to scrape.
DEFAULT_DB_PATH = get_root_data_path() / "docs"
DEFAULT_START_URL = "https://fastkafka.airt.ai"

# Original (misspelled) names, kept as aliases so existing imports keep working.
DAFAULT_DB_PATH = DEFAULT_DB_PATH
DAFAULT_START_URL = DEFAULT_START_URL

@app.command(
    "generate",
    help="Scrape FastKafka documentation, create embeddings from extracted content, and save them in a vector database.",
)
def generate(
    start_url: str = typer.Option(
        DEFAULT_START_URL,
        "--start_url",
        "-u",
        help="The start_url of the website to scrape."
    ),
    db_path: str = typer.Option(
        DEFAULT_DB_PATH,
        "--db_path",
        "-p",
        help="The path to save the vector database."
    ),
) -> None:
    """Scrape the documentation site, embed its content and save it as a vector database.

    The site is crawled from ``start_url``, the links are reduced to the latest
    guide pages, and those pages are downloaded, split into chunks and stored
    under ``db_path``. Any existing directory at ``db_path`` is removed, but only
    after scraping and chunking have succeeded, so a failed crawl leaves the
    previous database in place.

    Args:
        start_url: The URL at which crawling starts.
        db_path: Directory in which the vector database is saved.

    Raises:
        typer.Exit: With exit code 1 if any step fails; the error message is
            written to stderr.
    """
    try:
        typer.echo(f"Scrapping {start_url} (The whole process usually takes around 45 to 90 seconds...)")
        all_doc_links = list(_get_all_links_from_website(start_url))
        filtered_doc_links = _extract_latest_doc_urls(start_url, all_doc_links)

        typer.echo(f"Number of identified URLs for scraping: {len(filtered_doc_links)}\n")
        # Fail with a clear message instead of an opaque error from the embedding step.
        if not filtered_doc_links:
            raise ValueError(f"No documentation pages found at {start_url}")

        typer.echo("Scraping the below URLs:\n")
        typer.echo("\n".join(filtered_doc_links))

        docs = _create_documents(filtered_doc_links)
        doc_chunks = _split_document_into_chunks(docs)

        # Drop the previous database only once there is new content to replace it with.
        _delete_directory(Path(db_path))
        _save_embeddings_db(doc_chunks, db_path)

        typer.echo(f"\nWebsite embeddings have been successfully saved to: {db_path}")
    except Exception as e:
        fg = typer.colors.RED
        typer.secho(f"Unexpected internal error: {e}", err=True, fg=fg)
        raise typer.Exit(code=1)

In [None]:
# Invoke `generate --help` through Typer's test runner.
# NOTE(review): the result is stored but not asserted on in this cell.
runner = CliRunner()
result = runner.invoke(app, ["generate", "--help"])

In [None]:
# End-to-end run of the CLI with the default start URL, writing the index into a temporary directory.
with TemporaryDirectory() as d:
    db_path = f"{d}/docs"
    # Only the `-p` option is passed; no sub-command name is given.
    result = runner.invoke(app, ["-p", db_path])
    assert result.exit_code == 0
    assert (Path(d) / "docs" / "index.faiss").exists()
    print(result.output)

[INFO] langchain.document_loaders.web_base: fake_useragent not found, using default user agent.To get a realistic header for requests, `pip install fake_useragent`.
Scrapping https://fastkafka.airt.ai (The whole process usually takes around 45 to 90 seconds...)
Number of identified URLs for scraping: 14

Scraping the below URLs:

https://fastkafka.airt.ai/docs/guides/Guide_11_Consumes_Basics
https://fastkafka.airt.ai/docs/guides/Guide_24_Using_Multiple_Kafka_Clusters
https://fastkafka.airt.ai/docs/guides/Guide_31_Using_redpanda_to_test_fastkafka
https://fastkafka.airt.ai/docs/guides/Guide_32_Using_fastapi_to_run_fastkafka_application
https://fastkafka.airt.ai/docs/guides/Guide_23_Batch_Producing
https://fastkafka.airt.ai/docs/guides/Guide_07_Encoding_and_Decoding_Messages_with_FastKafka
https://fastkafka.airt.ai/docs/guides/Guide_30_Using_docker_to_deploy_fastkafka
https://fastkafka.airt.ai/docs/guides/Guide_21_Produces_Basics
https://fastkafka.airt.ai/docs
https://fastkafka.airt.ai/do