In [None]:
# | default_exp _components.embeddings

In [None]:
# | export

from typing import *
import shutil
import zipfile
from tempfile import TemporaryDirectory
import requests
from contextlib import contextmanager
from pathlib import Path

from langchain.document_loaders import UnstructuredMarkdownLoader, DirectoryLoader
from langchain.schema.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
from yaspin import yaspin
import typer


from faststream_gen._code_generator.constants import FASTSTREAM_REPO_ZIP_URL, FASTSTREAM_DOCS_DIR_SUFFIX, FASTSTREAM_EXAMPLES_DIR_SUFFIX
from faststream_gen._components.package_data import get_root_data_path

In [None]:
from typer.testing import CliRunner

In [None]:
# | export


def _fetch_content(url: str) -> requests.models.Response:
    """Fetch a URL with an HTTP GET request and return the response.

    Args:
        url: The address to download.

    Returns:
        The response object of a successful request.

    Raises:
        requests.exceptions.Timeout: If the request does not complete within 50 seconds.
        requests.exceptions.RequestException: For any other request failure,
            including HTTP error status codes.
    """
    try:
        response = requests.get(url, timeout=50)
        response.raise_for_status()  # Raises an exception for HTTP errors
        return response
    except requests.exceptions.Timeout as e:
        # Chain explicitly and carry over request/response so that callers can
        # still inspect what failed after the message has been rewritten.
        raise requests.exceptions.Timeout(
            "Request timed out. Please check your internet connection or try again later.",
            request=e.request,
            response=e.response,
        ) from e
    except requests.exceptions.RequestException as e:
        raise requests.exceptions.RequestException(
            f"An error occurred: {e}",
            request=e.request,
            response=e.response,
        ) from e

In [None]:
# Smoke test for _fetch_content: performs a live HTTP GET and checks that a
# non-empty body came back. Only the first 200 bytes are printed.
response = _fetch_content("https://fastkafka.airt.ai/")
print(response.content[:200])
assert len(response.content) > 0

b'<!doctype html>\n<html lang="en" dir="ltr" class="plugin-pages plugin-id-default">\n<head>\n<meta charset="UTF-8">\n<meta name="generator" content="Docusaurus v2.4.0">\n<title data-rh="true">Effortless Kaf'


In [None]:
# | export


def _create_documents(extrated_path: Path, extension: str = "**/*.md") -> List[Document]:
    """Load the files below a directory that match a glob pattern.

    Args:
        extrated_path: Root directory handed to the directory loader.
        extension: Glob pattern selecting the files to load. Defaults to "**/*.md".

    Returns:
        The documents returned by the loader.
    """
    root_dir = str(extrated_path)
    directory_loader = DirectoryLoader(
        root_dir,
        glob=extension,
        loader_cls=UnstructuredMarkdownLoader,
    )
    documents = directory_loader.load()
    return documents

In [None]:
# Test for _create_documents: write one text file into a temporary directory
# and check that it is loaded as a Document.
# NOTE: `docs` stays in the kernel namespace and is reused by the
# _save_embeddings_db test cell further down.
with TemporaryDirectory() as d:
    input_path = Path(d) / "input_path"
    input_path.mkdir(parents=True, exist_ok=True)
    output_path = Path(d) / "output_path"
    output_path.mkdir(parents=True, exist_ok=True)
    
    with open(f"{input_path}/sample.txt", "w") as f:
        f.write("Hello world!")
    
    # Override the default "**/*.md" pattern so the .txt fixture is picked up.
    docs = _create_documents(input_path, "**/*.txt")

    print(len(docs))
    assert len(docs) > 0
    assert isinstance(docs[0], Document)

    print(docs[0].page_content[:200])

1
Hello world!


In [None]:

# # | export

# def _split_document_into_chunks(
#     documents: List[Document],
#     # Limiting the max token (input) limit to 8k to be on the safe side. 1 token ~= 4 chars in English. We would like to retrieve the top 2 matches,
#     # so each match can have at most 8k / 2 = 4k tokens (~ 4k * 4 = 16,000 characters).
#     # Note: chunk_size is the maximum allowed characters in each chunk. In reality not all the chunks will have 16k characters; some will be much shorter.
#     # Reference: https://python.langchain.com/docs/modules/data_connection/document_transformers/text_splitters/recursive_text_splitter
#     chunk_size: int = 16000,
#     chunk_overlap: int = 200, # 50 tokens
#     separators: List[str] = ["\n\n", "\n", "(?<=\. )", " ", ""],
# ) -> List[Document]:
#     """Split the list of documents into chunks

#     Args:
#         documents: List of documents to be split into chunks.
#         chunk_size: The maximum size of each chunk in characters. Defaults to 16000.
#         chunk_overlap: The overlap between consecutive chunks in characters. Defaults to 200.
#         separators: List of separator patterns used for chunking. Defaults to ["\n\n", "\n", "(?<=\. )", " ", ""].

#     Returns:
#         A list of documents where each document represents a chunk.
#     """
#     text_splitter = RecursiveCharacterTextSplitter(
#         chunk_size=chunk_size,
#         chunk_overlap=chunk_overlap,
#         separators=separators
#     )
#     chunks = text_splitter.split_documents(documents)
#     return chunks

In [None]:
# doc_chunks = _split_document_into_chunks(docs)
# print(len(doc_chunks))
# assert len(doc_chunks) >= len(docs)

In [None]:

# | export

def _save_embeddings_db(doc_chunks: List[Document], db_path: str) -> None:
    """Embed the given documents and persist them as a FAISS index.

    The index is built with ``FAISS.from_documents`` using ``OpenAIEmbeddings``
    and written with ``save_local``.

    Args:
        doc_chunks: Documents to embed; each one is stored as a single entry.
        db_path: Location where the FAISS db is saved.
    """
    FAISS.from_documents(doc_chunks, OpenAIEmbeddings()).save_local(db_path)  # type: ignore

In [None]:
# Test for _save_embeddings_db: embed the `docs` loaded by the earlier
# _create_documents test cell and check that a FAISS index file is written.
with TemporaryDirectory() as d:
    db_path = f"{d}/faiss_index"
    _save_embeddings_db(docs, db_path)
    
    # List the temporary directory so the created folder shows up in the cell output.
    !ls -la {d}
    assert (Path(d) / "faiss_index" / "index.faiss").exists()

total 20
drwx------ 3 harish harish  4096 Sep  7 07:56 .
drwxrwxrwt 1 root   root   12288 Sep  7 07:56 ..
drwxrwxr-x 2 harish harish  4096 Sep  7 07:56 faiss_index


In [None]:
# | export

def _delete_directory(directory_path: Path) -> None:
    """Delete a directory and its contents if it exists.

    Args:
        directory_path: The path to the directory to be deleted.
    """
    if directory_path.exists():
        try:
            shutil.rmtree(directory_path)
        except Exception as e:
            print(f"Error deleting directory: {e}")

In [None]:
# Test for _delete_directory: an existing directory is removed, and calling it
# on a path that does not exist is a no-op.
with TemporaryDirectory() as d:
    some_dir = Path(f"{d}/some_dir")
    some_dir.mkdir()
    assert some_dir.exists()
    # Listing before the deletion; `some_dir` appears in the cell output.
    !ls -la {d}

    _delete_directory(some_dir)
    assert not some_dir.exists()
    
    # Must not raise for a path that was never created.
    non_existing_dir = Path(f"{d}/non_existing_dir")
    _delete_directory(non_existing_dir)
    # Listing after the deletion.
    !ls -la {d}

total 20
drwx------ 3 harish harish  4096 Sep  7 07:56 .
drwxrwxrwt 1 root   root   12288 Sep  7 07:56 ..
drwxrwxr-x 2 harish harish  4096 Sep  7 07:56 some_dir
total 16
drwx------ 2 harish harish  4096 Sep  7 07:56 .
drwxrwxrwt 1 root   root   12288 Sep  7 07:56 ..


In [None]:
# | export

def _generate_docs_db(input_path: Path, output_path: Path) -> None:
    """Create embeddings for the markdown docs and save them as a vector database.

    A spinner is shown on the terminal while the work is in progress.

    Args:
        input_path: Directory that is searched for documentation files
            (using the default pattern of ``_create_documents``).
        output_path: Directory where the vector database is saved.
    """
    with yaspin(
        text="Creating embeddings for the docs...", color="cyan", spinner="clock"
    ) as sp:
        docs = _create_documents(input_path)
        # _save_embeddings_db declares db_path as str, so convert the Path explicitly.
        _save_embeddings_db(docs, str(output_path))

        # Clear the spinner text so that only the success line is left behind.
        sp.text = ""
        sp.ok(f" ✔ Docs embeddings created and saved to: {output_path}")

In [None]:
# Test for _generate_docs_db: create a one-file markdown fixture and check that
# a FAISS index file appears in the output directory.
with TemporaryDirectory() as d:
    input_path = Path(d) / "input_path"
    input_path.mkdir(parents=True, exist_ok=True)
    output_path = Path(d) / "output_path"
    output_path.mkdir(parents=True, exist_ok=True)
    
    with open(f"{input_path}/sample.md", "w") as f:
        f.write("# Hello world!")
    
    _generate_docs_db(input_path, output_path)
    
    assert (output_path / "index.faiss").exists()

In [None]:
# | export

def _generate_examples_db(input_path: Path, output_path: Path) -> None:
    pass

In [None]:
# todo: write tests
#             docs = _create_documents()
    #         # Experimenting by commenting out chunking, so each guide will be treated as a single document and will be sent in its entirety along with the prompt.
    #         # doc_chunks = _split_document_into_chunks(docs)
    #         # _save_embeddings_db(doc_chunks, db_path)
    #         _save_embeddings_db(docs, db_path)


In [None]:
# | export

# Typer application that hosts the `generate` command defined below.
# The short help matches what the command does: it downloads the FastStream
# docs and examples (the previous text still referred to FastKafka).
app = typer.Typer(
    short_help="Download the docs and examples from FastStream repo, generate embeddings, and save them in a vector database.",
)

In [None]:
# | export

@contextmanager
def _download_and_extract_faststream_archive() -> Iterator[Path]:
    """Download the FastStream repo archive and yield the directory it was extracted to.

    The archive is fetched from ``FASTSTREAM_REPO_ZIP_URL`` and unpacked inside a
    temporary directory, which is removed when the ``with`` block exits.

    Yields:
        Path of the directory holding the extracted archive contents.

    Raises:
        typer.Exit: With exit code 1 if the download, the extraction or the body
            of the ``with`` block raises an unexpected exception. A ``typer.Exit``
            raised by the body itself is passed through unchanged.
    """
    with TemporaryDirectory() as d:
        try:
            typer.echo("Downloading docs and examples from FastStream repo and generating embeddings.")
            archive_path = Path(d) / "archive.zip"
            extracted_path = Path(d) / "extrated_path"
            extracted_path.mkdir(parents=True, exist_ok=True)

            response = _fetch_content(FASTSTREAM_REPO_ZIP_URL)
            archive_path.write_bytes(response.content)

            with zipfile.ZipFile(archive_path, "r") as zip_ref:
                zip_ref.extractall(extracted_path)

            yield extracted_path

        except typer.Exit:
            # Raised by the caller's ``with`` body, which has already reported its
            # error and picked an exit code; do not print a second message.
            raise
        except Exception as e:
            typer.secho(f"Unexpected internal error: {e}", err=True, fg=typer.colors.RED)
            raise typer.Exit(code=1) from e

In [None]:
# | export

@app.command(
    "generate",
    help="Download the docs and examples from FastStream repo, generate embeddings, and save them in a vector database.",
)
def generate(
    db_path: str = typer.Option(
        get_root_data_path(), 
        "--db_path",
        "-p",
        help="The path to save the vector database."
    )
) -> None:
    """Rebuild the vector databases from a fresh download of the FastStream repo.

    Any existing directory at ``db_path`` is deleted first; the docs database is
    then written to ``<db_path>/docs`` and the examples step is invoked with
    ``<db_path>/examples``.

    Args:
        db_path: The path to save the vector database.

    Raises:
        typer.Exit: With exit code 1 if any step raises an exception.
    """
    with _download_and_extract_faststream_archive() as extracted_path:
        try:
            db_root = Path(db_path)
            _delete_directory(db_root)

            docs_source = extracted_path / FASTSTREAM_DOCS_DIR_SUFFIX
            _generate_docs_db(docs_source, db_root / "docs")

            examples_source = extracted_path / FASTSTREAM_EXAMPLES_DIR_SUFFIX
            _generate_examples_db(examples_source, db_root / "examples")

            typer.echo(f"\nSuccessfully generated all the embeddings and saved to: {db_root}")
        except Exception as e:
            typer.secho(f"Unexpected internal error: {e}", err=True, fg=typer.colors.RED)
            raise typer.Exit(code=1)

In [None]:
# The help screen of the `generate` command must render without errors.
# `runner` is reused by the end-to-end test cell below.
runner = CliRunner()
result = runner.invoke(app, ["generate", "--help"])
assert result.exit_code == 0, result.output

In [None]:
# End-to-end test of the `generate` command: run it against a temporary
# database path (uses `runner` from the previous cell) and check that the docs
# index file was written.
with TemporaryDirectory() as d:
    db_path = f"{d}"
    result = runner.invoke(app, ["generate", "-p", db_path])

    print(result.output)
    assert result.exit_code == 0
    assert (Path(d) / "docs" / "index.faiss").exists()

Downloading docs and examples from FastStream repo and generating embeddings.
/tmp/tmp1rsqd38d

Successfully generated all the embeddings and saved to: /tmp/tmp1rsqd38d

