In [None]:
# | default_exp _components.embeddings

In [None]:
# | export

from typing import *
import shutil
import tarfile
from tempfile import TemporaryDirectory
import requests
import functools
from pathlib import Path

from langchain.document_loaders import UnstructuredMarkdownLoader, DirectoryLoader
from langchain.schema.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
import typer


from faststream_gen._code_generator.constants import FASTKAFKA_DOCS_MD_ARCHIVE_URL
from faststream_gen._components.package_data import get_root_data_path

In [None]:
from typer.testing import CliRunner

In [None]:
# | export


def _fetch_content(url: str) -> requests.models.Response:
    """Fetch the content at the given url with a GET request.

    Args:
        url: The url to fetch.

    Returns:
        The response of a successful request.

    Raises:
        requests.exceptions.Timeout: If the request does not complete within 10 seconds.
        requests.exceptions.RequestException: If the request fails for any other
            reason, including an HTTP error status.
    """
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # Raise an exception for HTTP errors
    except requests.exceptions.Timeout:
        print(
            "Request timed out. Please check your internet connection or try again later."
        )
        # Re-raise: no response exists at this point, so falling through to the
        # return statement would fail with an unrelated UnboundLocalError.
        raise
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        # Re-raise so that callers never treat a failed request as valid content.
        raise

    return response

In [None]:
# Smoke test against a live page: requires network access.
# Only checks that a non-empty body was returned.
response = _fetch_content("https://fastkafka.airt.ai/")
print(response.content[:200])
assert len(response.content) > 0

b'<!doctype html>\n<html lang="en" dir="ltr" class="plugin-pages plugin-id-default">\n<head>\n<meta charset="UTF-8">\n<meta name="generator" content="Docusaurus v2.4.0">\n<title data-rh="true">Effortless Kaf'


In [None]:
# | export

def _download_and_extract_website_archive(func: Callable) -> Callable:
    """Decorate a function so that it receives the path to the extracted documentation archive.

    Each call of the decorated function downloads the archive from
    FASTKAFKA_DOCS_MD_ARCHIVE_URL into a temporary directory, extracts it there and
    calls the wrapped function with the extraction path as its first positional
    argument. The temporary directory is removed when the wrapped function returns.

    Args:
        func: The function to be wrapped. It must accept the extraction path as its first argument.

    Returns:
        The wrapper function, which forwards any further arguments to func and returns its result.
    """
    @functools.wraps(func)
    def wrapper(*args, **kwargs): # type: ignore
        with TemporaryDirectory() as d:
            input_path = Path(d) / "archive.tar.gz"
            extrated_md_files_path = Path(d) / "extrated_md_files_path"

            response = _fetch_content(FASTKAFKA_DOCS_MD_ARCHIVE_URL)
            input_path.write_bytes(response.content)

            with tarfile.open(input_path, "r:gz") as tar: # nosemgrep
                if hasattr(tarfile, "data_filter"):
                    # Extraction filters are available: refuse members that would be
                    # written outside of the destination directory.
                    tar.extractall(path=extrated_md_files_path, filter="data")
                else:
                    # Older interpreter without extraction filters.
                    # nosemgrep
                    tar.extractall(path=extrated_md_files_path) # nosec

            # Call func while the temporary directory still exists.
            return func(extrated_md_files_path, *args, **kwargs)

    return wrapper


@_download_and_extract_website_archive
def _create_documents(extrated_md_files_path: Path) -> List[Document]:
    """Load every markdown file found under the given path.

    The path is supplied by the decorator, so callers invoke this function
    without arguments.

    Args:
        extrated_md_files_path: Directory that is searched recursively for "*.md" files.

    Returns:
        The list of Document objects produced by the directory loader.
    """
    markdown_loader = DirectoryLoader(
        str(extrated_md_files_path),
        glob="**/*.md",
        loader_cls=UnstructuredMarkdownLoader,
    )
    documents = markdown_loader.load()
    return documents

In [None]:
# The decorator supplies the extraction path, so no argument is passed here.
# Requires network access to download the documentation archive.
docs = _create_documents()

print(len(docs))
assert len(docs) > 0
assert isinstance(docs[0], Document)

# Peek at the beginning of the first loaded document.
print(docs[0].page_content[:200])

38
Contributing to FastKafka

First off, thanks for taking the time to contribute! ❤️

All types of contributions are encouraged and valued. See the Table of Contents for different ways to help and detai


In [None]:

# | export

def _split_document_into_chunks(
    documents: List[Document],
    # Limiting the max token (input) limit to 8k to be on the safer side. 1 token ~= 4 chars in English.
    # We would like to retrieve the top 2 matches, so each match can only have 8k / 2 = 4k tokens
    # (~ 4k * 4 = 16,000 characters).
    # Note: chunk_size is the maximum allowed number of characters in each chunk. In reality not all
    # the chunks will have 16k characters, some will be much shorter.
    # Reference: https://python.langchain.com/docs/modules/data_connection/document_transformers/text_splitters/recursive_text_splitter
    chunk_size: int = 16000,
    chunk_overlap: int = 200, # 50 tokens
    separators: Optional[List[str]] = None,
) -> List[Document]:
    r"""Split the list of documents into chunks

    Args:
        documents: List of documents to be split into chunks.
        chunk_size: The maximum size of each chunk in characters. Defaults to 16000.
        chunk_overlap: The overlap between consecutive chunks in characters. Defaults to 200.
        separators: List of separator patterns used for chunking. When None, defaults to
            ["\n\n", "\n", "(?<=\. )", " ", ""].

    Returns:
        A list of documents where each document represents a chunk.
    """
    if separators is None:
        # Built on every call so that the default list is never shared between calls.
        # NOTE(review): "(?<=\. )" is a regex lookbehind; it presumably only acts as a
        # regex if the splitter is configured to treat separators as regexes - confirm
        # against the installed langchain version.
        separators = ["\n\n", "\n", r"(?<=\. )", " ", ""]

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=separators
    )
    chunks = text_splitter.split_documents(documents)
    return chunks

In [None]:
# With the default settings, chunking is expected to produce more chunks than input documents.
doc_chunks = _split_document_into_chunks(docs)
print(len(doc_chunks))
assert len(doc_chunks) > len(docs)

39


In [None]:

# | export

def _save_embeddings_db(doc_chunks: List[Document], db_path: str) -> None:
    """Embed the given documents and persist them as a FAISS index.

    Args:
        doc_chunks: The documents (chunks) to embed and store.
        db_path: Location where the FAISS db is written.
    """
    embeddings = OpenAIEmbeddings() # type: ignore
    vector_store = FAISS.from_documents(doc_chunks, embeddings)
    vector_store.save_local(db_path)

In [None]:
# Build the index inside a throwaway directory and check that the index file is written.
with TemporaryDirectory() as d:
    db_path = f"{d}/faiss_index"
    _save_embeddings_db(doc_chunks, db_path)
    
    # List the temporary directory so the created folder shows up in the cell output.
    !ls -la {d}
    assert (Path(d) / "faiss_index" / "index.faiss").exists()

total 20
drwx------ 3 harish harish  4096 Sep  6 11:04 .
drwxrwxrwt 1 root   root   12288 Sep  6 11:04 ..
drwxrwxr-x 2 harish harish  4096 Sep  6 11:04 faiss_index


In [None]:
# | export

def _delete_directory(directory_path: Path) -> None:
    """Delete a directory and its contents if it exists.

    Args:
        directory_path: The path to the directory to be deleted.
    """
    if directory_path.exists():
        try:
            shutil.rmtree(directory_path)
        except Exception as e:
            print(f"Error deleting directory: {e}")

In [None]:
# Covers both branches: deleting an existing directory and calling with a missing path.
with TemporaryDirectory() as d:
    some_dir = Path(f"{d}/some_dir")
    some_dir.mkdir()
    assert some_dir.exists()
    !ls -la {d}

    _delete_directory(some_dir)
    assert not some_dir.exists()
    
    # A path that was never created must be accepted without raising.
    non_existing_dir = Path(f"{d}/non_existing_dir")
    _delete_directory(non_existing_dir)
    !ls -la {d}

total 20
drwx------ 3 harish harish  4096 Sep  6 11:04 .
drwxrwxrwt 1 root   root   12288 Sep  6 11:04 ..
drwxrwxr-x 2 harish harish  4096 Sep  6 11:04 some_dir
total 16
drwx------ 2 harish harish  4096 Sep  6 11:04 .
drwxrwxrwt 1 root   root   12288 Sep  6 11:04 ..


In [None]:
# | export

def _get_default_vector_db_path() -> Path:
    """Return the default location of the vector database.

    Returns:
        The "docs" entry below the path returned by get_root_data_path().
    """
    root_data_path = get_root_data_path()
    return root_data_path / "docs"

In [None]:
# Display the resolved default path; no assertion is made because it depends on the install location.
actual = _get_default_vector_db_path()
print(actual)

/work/fastkafka-gen/faststream_gen/package_data/docs


In [None]:
# | export

# Typer application object of this module; the "generate" command is registered on it via @app.command.
app = typer.Typer(
    short_help="Download the zipped FastKafka documentation markdown files, generate embeddings, and save them in a vector database.",
)

In [None]:
# | export


@app.command(
    "generate",
    help="Download the zipped FastKafka documentation markdown files, generate embeddings, and save them in a vector database.",
)
def generate(
    db_path: str = typer.Option(
        _get_default_vector_db_path(), 
        "--db_path",
        "-p",
        help="The path to save the vector database."
    )
) -> None:
    # Args:
    #     db_path: Where the vector database is saved; any directory already at this
    #         path is deleted before the new database is written.
    # Raises:
    #     typer.Exit: With exit code 1 when any step fails; the error is echoed to stderr.
    try:
        typer.echo("Downloading the zipped FastKafka documentation markdown files and generating embeddings.")
        # Download first: if fetching the documentation fails, the existing
        # database is left untouched instead of being deleted up front.
        docs = _create_documents()

        _delete_directory(Path(db_path))

        # Experimenting by commenting out chunking, so each guide will be treated as a single document and will be sent in its entirety along with the prompt.
        # doc_chunks = _split_document_into_chunks(docs)
        # _save_embeddings_db(doc_chunks, db_path)
        _save_embeddings_db(docs, db_path)
        
        typer.echo(f"\nSuccessfully generated the embeddings and saved to: {db_path}")
    except Exception as e:
        fg = typer.colors.RED
        typer.secho(f"Unexpected internal error: {e}", err=True, fg=fg)
        raise typer.Exit(code=1)

In [None]:
# Invoke the help of the "generate" command; the result is stored but not asserted in this cell.
runner = CliRunner()
result = runner.invoke(app, ["generate", "--help"])

In [None]:
# End-to-end run of the CLI against a temporary database path.
# NOTE(review): the command name is not passed here; presumably Typer runs the only
# registered command directly - the recorded output shows the run succeeding.
with TemporaryDirectory() as d:
    db_path = f"{d}/docs"
    result = runner.invoke(app, ["-p", db_path])
    
    print(result.output)
    assert result.exit_code == 0
    assert (Path(d) / "docs" / "index.faiss").exists()

Downloading the zipped FastKafka documentation markdown files and generating embeddings.

Successfully generated the embeddings and saved to: /tmp/tmpeqpun9g4/docs

