In [None]:
# | default_exp _components.embeddings

In [None]:
# | export

from typing import *
import shutil
import zipfile
from tempfile import TemporaryDirectory
import requests
from contextlib import contextmanager
from pathlib import Path

from langchain.document_loaders import UnstructuredMarkdownLoader, DirectoryLoader
from langchain.schema.document import Document
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
from yaspin import yaspin
import typer


from faststream_gen._code_generator.constants import (
    FASTSTREAM_REPO_ZIP_URL,
    FASTSTREAM_DOCS_DIR_SUFFIX,
    FASTSTREAM_EXAMPLES_DIR_SUFFIX,
    FASTSTREAM_EXAMPLE_FILES,
    FASTSTREAM_TMP_DIR_PREFIX
)
from faststream_gen._components.package_data import get_root_data_path

In [None]:
from typer.testing import CliRunner

In [None]:
# | export


def _fetch_content(url: str, timeout: float = 50) -> requests.models.Response:
    """Fetch content from a URL using an HTTP GET request.

    Args:
        url (str): The URL to fetch content from.
        timeout (float, optional): Value forwarded to ``requests.get`` as its
            ``timeout`` argument. Defaults to 50.

    Returns:
        Response: The response object containing the content and HTTP status.

    Raises:
        requests.exceptions.Timeout: If the request times out.
        requests.exceptions.RequestException: If any other error occurs during
            the request, including an HTTP error status reported by
            ``raise_for_status``.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raises an exception for HTTP errors
        return response
    except requests.exceptions.Timeout as e:
        # Chain explicitly so the original exception stays attached as __cause__.
        raise requests.exceptions.Timeout(
            "Request timed out. Please check your internet connection or try again later."
        ) from e
    except requests.exceptions.RequestException as e:
        raise requests.exceptions.RequestException(f"An error occurred: {e}") from e

In [None]:
# Smoke test for _fetch_content: performs a real HTTP GET, so it needs network access.
response = _fetch_content("https://fastkafka.airt.ai/")
print(response.content[:200])
# A successful fetch must return a non-empty body.
assert len(response.content) > 0

b'<!doctype html>\n<html lang="en" dir="ltr" class="plugin-pages plugin-id-default">\n<head>\n<meta charset="UTF-8">\n<meta name="generator" content="Docusaurus v2.4.0">\n<title data-rh="true">Effortless Kaf'


In [None]:
# | export


def _create_documents(
    extrated_path: Path, extension: str = "**/*.md"
) -> List[Document]:
    """Load the files under a directory that match a glob pattern as documents.

    Every matching file is handed to ``UnstructuredMarkdownLoader``, whatever
    its actual extension is.

    Args:
        extrated_path (Path): Directory that is searched for files to load.
        extension (str, optional): Glob pattern passed to the directory loader.
            Defaults to "**/*.md".

    Returns:
        List[Document]: The result of the directory loader's ``load()`` call.
    """
    source_dir = str(extrated_path)
    directory_loader = DirectoryLoader(
        source_dir, glob=extension, loader_cls=UnstructuredMarkdownLoader
    )
    documents = directory_loader.load()
    return documents

In [None]:
fixture_description = """
Create a FastStream application using localhost broker for testing and use the default port number. 
It should consume messages from the "input_data" topic, where each message is a JSON encoded object containing a single attribute: 'data'. 
For each consumed message, create a new message object and increment the value of the data attribute by 1. Finally, send the modified message to the 'output_data' topic.
"""

# Write the fixture text into a temporary directory and load it back through
# _create_documents. `docs` is intentionally left in the kernel namespace:
# later test cells reuse it.
with TemporaryDirectory() as d:
    input_path = Path(d) / "input_path"
    input_path.mkdir(parents=True, exist_ok=True)
    output_path = Path(d) / "output_path"
    output_path.mkdir(parents=True, exist_ok=True)
    
    with open(f"{input_path}/sample.txt", "w") as f:
        f.write(fixture_description)
    
    # Override the default "**/*.md" glob so the .txt fixture is picked up.
    docs = _create_documents(input_path, "**/*.txt")

    print(len(docs))
    assert len(docs) > 0
    assert isinstance(docs[0], Document)

    print(docs[0].page_content[:200])

1
Create a FastStream application using localhost broker for testing and use the default port number. 
It should consume messages from the "input_data" topic, where each message is a JSON encoded object


In [None]:

# | export

def _split_document_into_chunks(
    documents: List[Document],
    separator: str,
    chunk_size: int = 500,
    chunk_overlap: int = 0,
) -> List[Document]:
    """Split a list of documents into chunk documents.

    Args:
        documents: Documents to be split.
        separator: Separator string handed to ``CharacterTextSplitter``.
        chunk_size: Chunk size handed to the splitter. Defaults to 500.
        chunk_overlap: Chunk overlap handed to the splitter. Defaults to 0.

    Returns:
        The documents returned by the splitter's ``split_documents`` call.
    """
    return CharacterTextSplitter(
        separator=separator,
        chunk_overlap=chunk_overlap,
        chunk_size=chunk_size,
    ).split_documents(documents)

In [None]:
# Reuses `docs` created by the _create_documents test cell earlier in the notebook.
doc_chunks = _split_document_into_chunks(docs, "\n\n")
print(len(doc_chunks))
# Expect at least as many chunks as there were input documents.
assert len(doc_chunks) >= len(docs)

1


In [None]:

# | export

def _save_embeddings_db(doc_chunks: List[Document], db_path: Union[str, Path]) -> None:
    """Save the embeddings in a FAISS db

    The chunks are embedded with ``OpenAIEmbeddings`` and the resulting FAISS
    index is written to ``db_path``.

    Args:
        doc_chunks: A list of documents where each document represents a chunk.
        db_path: Path to save the FAISS db. May be given as a string or as a
            ``Path``; it is converted to a string before saving.
    """
    db = FAISS.from_documents(doc_chunks, OpenAIEmbeddings()) # type: ignore
    # Normalise to str so both str and Path callers satisfy save_local.
    db.save_local(str(db_path))

In [None]:
# Reuses `docs` from the _create_documents test cell.
# NOTE(review): this builds OpenAIEmbeddings, so it presumably needs OpenAI
# credentials and network access to run — confirm for CI.
with TemporaryDirectory() as d:
    db_path = f"{d}/faiss_index"
    _save_embeddings_db(docs, db_path)
    
    !ls -la {d}
    # The index file must exist inside the directory passed as db_path.
    assert (Path(d) / "faiss_index" / "index.faiss").exists()

total 20
drwx------ 3 harish harish  4096 Sep  7 15:29 .
drwxrwxrwt 1 root   root   12288 Sep  7 15:29 ..
drwxrwxr-x 2 harish harish  4096 Sep  7 15:29 faiss_index


In [None]:
# | export

def _delete_directory(directory_path: Path) -> None:
    """Delete a directory and its contents if it exists.

    Args:
        directory_path: The path to the directory to be deleted.
    """
    if directory_path.exists():
        try:
            shutil.rmtree(directory_path)
        except Exception as e:
            print(f"Error deleting directory: {e}")

In [None]:
# Exercise _delete_directory on an existing directory and on a missing path.
with TemporaryDirectory() as d:
    some_dir = Path(f"{d}/some_dir")
    some_dir.mkdir()
    assert some_dir.exists()
    !ls -la {d}

    _delete_directory(some_dir)
    assert not some_dir.exists()
    
    # Deleting a path that does not exist must not raise.
    non_existing_dir = Path(f"{d}/non_existing_dir")
    _delete_directory(non_existing_dir)
    !ls -la {d}

total 20
drwx------ 3 harish harish  4096 Sep  7 15:29 .
drwxrwxrwt 1 root   root   12288 Sep  7 15:29 ..
drwxrwxr-x 2 harish harish  4096 Sep  7 15:29 some_dir
total 16
drwx------ 2 harish harish  4096 Sep  7 15:29 .
drwxrwxrwt 1 root   root   12288 Sep  7 15:29 ..


In [None]:
# | export


def _generate_docs_db(input_path: Path, output_path: Path) -> None:
    """Build and save the embeddings database for the documentation files.

    The documents found under ``input_path`` (using the default glob of
    ``_create_documents``) are embedded and saved to ``output_path`` while a
    spinner reports progress.

    Args:
        input_path (Path): The path to the directory containing input documents.
        output_path (Path): The path to the directory where the embeddings
            database will be saved.
    """
    spinner = yaspin(
        text="Creating embeddings for the docs...", color="cyan", spinner="clock"
    )
    with spinner as progress:
        markdown_docs = _create_documents(input_path)
        _save_embeddings_db(markdown_docs, output_path)

        # Clear the spinner text so only the final status line is shown.
        progress.text = ""
        progress.ok(f" ✔ Docs embeddings created and saved to: {output_path}")

In [None]:
# End-to-end check of _generate_docs_db on a single markdown file.
with TemporaryDirectory() as d:
    input_path = Path(d) / "input_path"
    input_path.mkdir(parents=True, exist_ok=True)
    output_path = Path(d) / "output_path"
    output_path.mkdir(parents=True, exist_ok=True)
    
    with open(f"{input_path}/sample.md", "w") as f:
        f.write("# Hello world!")
    
    _generate_docs_db(input_path, output_path)
    
    # The FAISS index must have been written directly into output_path.
    assert (output_path / "index.faiss").exists()

⠹ Creating embeddings for the docs... 

  self._color = self._set_color(color) if color else color


 ✔ Docs embeddings created and saved to: /tmp/tmpcs7jr7x9/output_path 


In [None]:
# | export


def _check_all_files_exist(d: Path, required_files: List[str]) -> bool:
    """Check if all required files exist in a directory.

    Args:
        d (Path): The path to the directory where the existence of files will
            be checked.
        required_files (List[str]): A list of filenames that should exist in
            the directory.

    Returns:
        True if all required files exist in the directory, False otherwise.
    """
    return all((d / file_name).exists() for file_name in required_files)

In [None]:
# Positive case: every required file is created, so the check must return True.
with TemporaryDirectory() as d:
    required_files = ['description.txt', 'app_skeleton.py']
    
    with open(f"{d}/description.txt", "w") as f:
        f.write("description.txt")
        
    with open(f"{d}/app_skeleton.py", "w") as f:
        f.write("app_skeleton.py")
        
    
    actual = _check_all_files_exist(Path(d), required_files)
    print(actual)
    assert actual

True


In [None]:
# Negative case: only two of the four required files are created, so the
# check must return False.
with TemporaryDirectory() as d:
    required_files = ['description.txt', 'app_skeleton.py', 'app.py', 'test_app.py']
    
    with open(f"{d}/description.txt", "w") as f:
        f.write("description.txt")
        
    with open(f"{d}/app_skeleton.py", "w") as f:
        f.write("app_skeleton.py")
        
    
    actual = _check_all_files_exist(Path(d), required_files)
    print(actual)
    assert not actual

False


In [None]:
# | export


def _append_file_contents(d: Path, parent_d: Path, required_files: List[str]) -> None:
    """Append contents of specified files to a result file.

    The result file is ``<parent_d>/<FASTSTREAM_TMP_DIR_PREFIX>/<d.name>.txt``.
    It is opened in append mode, so calling this twice for the same ``d`` adds
    the contents a second time. Each file is wrapped in
    ``==== <name> starts ====`` / ``==== <name> ends ====`` marker lines.

    Args:
        d (Path): The path to the directory containing the files to be appended.
        parent_d (Path): The parent directory where the result file will be created.
        required_files (List[str]): A list of filenames to be appended.
    """
    appended_examples_dir = parent_d / FASTSTREAM_TMP_DIR_PREFIX
    appended_examples_dir.mkdir(parents=True, exist_ok=True)

    result_file_name = appended_examples_dir / f"{d.name}.txt"

    # Read and write as UTF-8 explicitly instead of relying on the platform's
    # default encoding, so the result is the same on every machine.
    with result_file_name.open("a", encoding="utf-8") as result_file:
        for file_name in required_files:
            content = (d / file_name).read_text(encoding="utf-8")
            result_file.write(
                f"==== {file_name} starts ====\n{content}\n==== {file_name} ends ====\n"
            )

In [None]:
fixture_description = """
Create a FastStream application using localhost broker for testing and use the default port number. 
It should consume messages from the "input_data" topic, where each message is a JSON encoded object containing a single attribute: 'data'. 
For each consumed message, create a new message object and increment the value of the data attribute by 1. Finally, send the modified message to the 'output_data' topic.
"""

def _create_example_structure(directory: Path, required_files: List[str]):
    """Populate ``directory`` with one placeholder file per required name.

    ``description.txt`` receives the notebook-level ``fixture_description``
    text; every other file just contains its own file name.
    """
    for file_name in required_files:
        content = fixture_description if file_name == "description.txt" else file_name
        with open(directory / file_name, "w") as f:
            f.write(content)

# Build one example directory, merge its files with _append_file_contents and
# compare the merged file against the exact expected text.
with TemporaryDirectory() as d:
    required_files = ['description.txt', 'app_skeleton.py', 'app.py', 'test_app.py']
    
    example_1 = Path(d) / "example_1"
    example_1.mkdir(parents=True, exist_ok=True)
    
    _create_example_structure(example_1, required_files)
        
    # Only example_1 has been created inside d at this point.
    for directory in Path(d).iterdir():
        _append_file_contents(directory, Path(d), required_files)
        
    with open(f"{d}/{FASTSTREAM_TMP_DIR_PREFIX}/example_1.txt", "r") as f:
        actual = f.read()
        
    print(actual)
    # Each file is wrapped in "starts"/"ends" marker lines, in required_files order.
    expected = f"""==== description.txt starts ====
{fixture_description}
==== description.txt ends ====
==== app_skeleton.py starts ====
app_skeleton.py
==== app_skeleton.py ends ====
==== app.py starts ====
app.py
==== app.py ends ====
==== test_app.py starts ====
test_app.py
==== test_app.py ends ====
"""
    
    assert actual == expected    

==== description.txt starts ====

Create a FastStream application using localhost broker for testing and use the default port number. 
It should consume messages from the "input_data" topic, where each message is a JSON encoded object containing a single attribute: 'data'. 
For each consumed message, create a new message object and increment the value of the data attribute by 1. Finally, send the modified message to the 'output_data' topic.

==== description.txt ends ====
==== app_skeleton.py starts ====
app_skeleton.py
==== app_skeleton.py ends ====
==== app.py starts ====
app.py
==== app.py ends ====
==== test_app.py starts ====
test_app.py
==== test_app.py ends ====



In [None]:
# | export


def _format_examples(input_path: Path, required_files: List[str]) -> None:
    """Format Examples by Appending File Contents.

    This function goes through the entries of the specified input path and checks
    if all the required files exist in each directory. If the required files are
    present, it appends their contents to a result file inside the
    ``FASTSTREAM_TMP_DIR_PREFIX`` directory within the input path. Any other entry
    is skipped and a message is echoed. The ``FASTSTREAM_TMP_DIR_PREFIX`` directory
    itself holds the generated result files and is skipped silently.

    Args:
        input_path (Path): The path to the directory containing example directories
            with files to be appended.
        required_files (List[str]): A list of filenames that must exist in each example
            directory.
    """
    generated_dir = input_path / FASTSTREAM_TMP_DIR_PREFIX
    # Take a sorted snapshot of the listing before doing any work:
    # _append_file_contents creates ``generated_dir`` inside ``input_path``, and
    # pathlib leaves it unspecified whether entries added while iterating over
    # ``iterdir()`` are yielded.
    for directory in sorted(input_path.iterdir()):
        if directory == generated_dir:
            # Output of this function (e.g. from an earlier run), not an example.
            continue
        if directory.is_dir() and _check_all_files_exist(directory, required_files):
            _append_file_contents(directory, input_path, required_files)
        else:
            typer.echo(f"\nRequired files are missing. Skipping directory: {directory}")


def _generate_examples_db(
    input_path: Path,
    output_path: Path,
    required_files: List[str] = FASTSTREAM_EXAMPLE_FILES,
) -> None:
    """Build and save the embeddings database for the example applications.

    Each example directory under ``input_path`` that has all ``required_files``
    is merged into a single text file. The merged files are loaded as
    documents and split on the ``==== description.txt ends ====`` marker. The
    resulting chunks are embedded and saved to ``output_path`` while a spinner
    reports progress.

    Args:
        input_path (Path): The path to the directory containing example documents.
        output_path (Path): The path to the directory where the embeddings database
            will be saved.
        required_files (List[str]): A list of filenames that must exist in each
            example directory. Defaults to FASTSTREAM_EXAMPLE_FILES.
    """
    spinner = yaspin(
        text="Creating embeddings for the examples...", color="cyan", spinner="clock"
    )
    with spinner as progress:
        _format_examples(input_path, required_files)

        # The merged per-example text files live in this sub-directory.
        merged_examples_dir = input_path / FASTSTREAM_TMP_DIR_PREFIX
        example_docs = _create_documents(merged_examples_dir, extension="*.txt")
        example_chunks = _split_document_into_chunks(
            example_docs, separator="==== description.txt ends ===="
        )
        _save_embeddings_db(example_chunks, output_path)

        # Clear the spinner text so only the final status line is shown.
        progress.text = ""
        progress.ok(f" ✔ Examples embeddings created and saved to: {output_path}")

In [None]:
# End-to-end check of _generate_examples_db on one generated example directory.
# Relies on _create_example_structure defined in an earlier test cell.
required_files = ['description.txt', 'app_skeleton.py', 'app.py', 'test_app.py']

with TemporaryDirectory() as d:
    example_1 = Path(d) / "example_1"
    example_1.mkdir(parents=True, exist_ok=True)
    _create_example_structure(example_1, required_files)
    
    # output_path is created inside the directory that is scanned for examples.
    output_path = Path(d) / "output_path"
    output_path.mkdir(parents=True, exist_ok=True)
    
    # required_files is not passed, so the FASTSTREAM_EXAMPLE_FILES default applies.
    _generate_examples_db(Path(d), output_path)
    
    assert (output_path / "index.faiss").exists()

⠋ Creating embeddings for the examples...
Required files are missing. Skipping directory: /tmp/tmp6dntvd_4/output_path
 ✔ Examples embeddings created and saved to: /tmp/tmp6dntvd_4/output_path 


In [None]:
# | export

# Typer application that exposes the embeddings generation command.
# The short help describes what the command actually downloads: the docs and
# examples of the FastStream repo.
app = typer.Typer(
    short_help="Download the docs and examples from FastStream repo, generate embeddings, and save them in a vector database.",
)

In [None]:
# | export

@contextmanager
def _download_and_extract_faststream_archive() -> Iterator[Path]:
    """Download the FastStream repo archive and extract it to a temporary directory.

    The archive at ``FASTSTREAM_REPO_ZIP_URL`` is downloaded and unpacked inside
    a ``TemporaryDirectory`` that is removed when the ``with`` block is left.

    Only failures of the download/extraction step are reported here. Exceptions
    raised by the caller inside the ``with`` body propagate unchanged, so an
    error the caller already reported is not reported a second time.

    Yields:
        Path: The directory into which the archive was extracted.

    Raises:
        typer.Exit: With exit code 1 if downloading or extracting fails.
    """
    with TemporaryDirectory() as d:
        try:
            typer.echo("Downloading docs and examples from FastStream repo and generating embeddings.")
            archive_path = Path(f"{d}/archive.zip")
            extracted_path = Path(f"{d}/extrated_path")
            extracted_path.mkdir(parents=True, exist_ok=True)

            response = _fetch_content(FASTSTREAM_REPO_ZIP_URL)
            archive_path.write_bytes(response.content)

            with zipfile.ZipFile(archive_path, "r") as zip_ref:
                zip_ref.extractall(extracted_path)

        except Exception as e:
            fg = typer.colors.RED
            typer.secho(f"Unexpected internal error: {e}", err=True, fg=fg)
            raise typer.Exit(code=1) from e

        # Yield outside the try block: anything raised in the caller's with-body
        # is re-raised here and must not be caught by the handler above.
        yield extracted_path

In [None]:
# | export

@app.command(
    "generate",
    help="Download the docs and examples from FastStream repo, generate embeddings, and save them in a vector database.",
)
def generate(
    db_path: str = typer.Option(
        get_root_data_path(), 
        "--db_path",
        "-p",
        help="The path to save the vector database."
    )
) -> None:
    """Regenerate the docs and examples embeddings databases under ``db_path``.

    The FastStream archive is downloaded and extracted, then the ``docs`` and
    ``examples`` databases are rebuilt inside ``db_path``. Only those two
    sub-directories are removed beforehand; anything else already present in
    ``db_path`` is left untouched.

    Args:
        db_path: Directory in which the ``docs`` and ``examples`` databases are saved.

    Raises:
        typer.Exit: With exit code 1 if generating the embeddings fails.
    """
    with _download_and_extract_faststream_archive() as extracted_path:
        try:
            db_dir = Path(db_path)
            docs_db_dir = db_dir / "docs"
            examples_db_dir = db_dir / "examples"

            # Remove only the directories that are regenerated below. db_path is
            # user supplied, so deleting it wholesale could wipe unrelated files.
            for stale_dir in (docs_db_dir, examples_db_dir):
                _delete_directory(stale_dir)

            _generate_docs_db(extracted_path/FASTSTREAM_DOCS_DIR_SUFFIX, docs_db_dir)
            _generate_examples_db(extracted_path/FASTSTREAM_EXAMPLES_DIR_SUFFIX, examples_db_dir)

            typer.echo(f"\nSuccessfully generated all the embeddings and saved to: {db_dir}")
        except Exception as e:
            fg = typer.colors.RED
            typer.secho(f"Unexpected internal error: {e}", err=True, fg=fg)
            raise typer.Exit(code=1)

In [None]:
# Invoke the CLI help through Typer's test runner. `runner` is reused by the next cell.
# NOTE(review): the result is stored but never asserted on — consider checking
# result.exit_code here.
runner = CliRunner()
result = runner.invoke(app, ["generate", "--help"])

In [None]:
# Full run of the CLI: downloads the archive and builds both databases in a
# temporary directory.
# NOTE(review): invoked without the "generate" command name, unlike the help
# cell above — presumably relies on Typer running a single registered command
# directly; confirm.
with TemporaryDirectory() as d:
    db_path = f"{d}"
    result = runner.invoke(app, ["-p", db_path])

    print(result.output)
    assert result.exit_code == 0
    # Both databases must have been written below the requested path.
    assert (Path(d) / "docs" / "index.faiss").exists()
    assert (Path(d) / "examples" / "index.faiss").exists()

Downloading docs and examples from FastStream repo and generating embeddings.
⠋ Creating embeddings for the docs...                                      ⠙ Creating embeddings for the docs...                                      ⠹ Creating embeddings for the docs...                                      ⠸ Creating embeddings for the docs...                                      ⠼ Creating embeddings for the docs...                                      ⠴ Creating embeddings for the docs...                                      ⠦ Creating embeddings for the docs...                                      ⠧ Creating embeddings for the docs...                                      ⠇ Creating embeddings for the docs...                                      ⠏ Creating embeddings for the docs...                                      ⠋ Creating embeddings for the docs...                                      ⠙ Creating embeddings for the docs...                       