In [1]:
import os
import re
from pathlib import Path
from typing import List, Dict
import logging
from bs4 import BeautifulSoup, NavigableString

from config.settings import settings

In [2]:
# Show INFO-level (and higher) log records for everything run in this notebook.
logging.basicConfig(level=logging.INFO)
# Notebook-wide logger; VectorStore.add_documents logs through it.
logger = logging.getLogger(__name__)

In [3]:
def extract_sections(soup: BeautifulSoup, uri: str) -> List[Dict]:
    """Turn a parsed HTML page into one record per ``<section>`` that has an ``id``.

    Args:
        soup: Parsed document to scan for ``<section>`` tags.
        uri: Public URI of the page; each record's ``source`` is ``"{uri}#{section_id}"``.

    Returns:
        A list of dicts with the keys ``source``, ``text``, ``previous_section``,
        ``next_section`` and ``metadata`` (``page_heading`` and ``section_id``).
        ``previous_section`` / ``next_section`` hold the ``source`` of the
        neighbouring *returned* record, or ``None`` at either end.
        Sections without an ``id`` are skipped.
    """
    # The heading is identical for every section of the page, so look it up once.
    # Pages without an <h1> fall back to the file stem of the URI.
    h1 = soup.find("h1")
    page_heading = h1.get_text().strip() if h1 else Path(uri).stem

    section_list = []
    for section in soup.find_all("section"):
        section_id = section.get("id")
        if not section_id:
            continue

        # Link against the last record actually kept. Using the position within
        # *all* <section> tags breaks (IndexError or wrong neighbour) as soon as
        # one section has no id and is therefore not in section_list.
        previous = section_list[-1] if section_list else None
        section_data = {
            "source": f"{uri}#{section_id}",
            "text": extract_text_from_section(section),
            "previous_section": previous["source"] if previous else None,
            "next_section": None,
            "metadata": {
                "page_heading": page_heading,
                "section_id": section_id
            }
        }
        if previous is not None:
            previous["next_section"] = section_data["source"]
        section_list.append(section_data)
    return section_list

In [4]:
def extract_text_from_section(section) -> str:
    """Return the cleaned text of a section's direct children, skipping nested sections.

    Bare strings are used as they are (when non-blank); any other child
    contributes its full ``get_text()``. Direct ``<section>`` children are
    left out. The pieces are joined with spaces and passed through
    ``clean_text``.
    """
    pieces = []
    for child in section.children:
        if isinstance(child, NavigableString):
            stripped = child.strip()
            if stripped:
                pieces.append(stripped)
            continue
        if child.name == 'section':
            # Nested section: not part of this section's own text.
            continue
        pieces.append(child.get_text().strip())
    return clean_text(" ".join(pieces))

In [5]:
def path_to_uri(path: Path, scheme: str = "https://", domain: str = "docs.fastht.ml") -> str:
    """Map a file under ``settings.RAW_DATA_DIR`` to its public documentation URL.

    Args:
        path: File path located inside ``settings.RAW_DATA_DIR``.
        scheme: URL scheme prefix, including ``://``.
        domain: Host name of the documentation site.

    Returns:
        ``scheme + domain + "/" + <path relative to RAW_DATA_DIR>`` with
        backslashes replaced by forward slashes.
    """
    inside_raw_dir = path.relative_to(settings.RAW_DATA_DIR)
    # Backslash separators (Windows paths) are not valid in the URL path.
    url_path = str(inside_raw_dir).replace("\\", "/")
    return "".join([scheme, domain, "/", url_path])


def clean_text(text: str) -> str:
    """Collapse every run of whitespace (spaces, tabs, newlines) to one space and trim both ends."""
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()


def process_html_files(html_files_path: List[Path]) -> List[Dict]:
    """Parse each HTML file and collect its section records.

    Args:
        html_files_path: Paths of the HTML files to read (UTF-8).

    Returns:
        One entry per input file, in input order; each entry is the list
        returned by ``extract_sections`` for that file.
    """
    parsed_docs = []
    for html_path in html_files_path:
        print("processing: ", html_path)
        # Parsing happens while the file is still open.
        with open(html_path, "r", encoding="utf-8") as handle:
            page = BeautifulSoup(handle, "html.parser")
        page_uri = path_to_uri(path=html_path)
        parsed_docs.append(extract_sections(page, page_uri))
    return parsed_docs

In [6]:
# Gather every "*.html" entry under RAW_DATA_DIR (rglob walks sub-directories too),
# ignoring any directory whose name happens to match the pattern.
html_files_path = [path for path in settings.RAW_DATA_DIR.rglob("*.html") if not path.is_dir()]
# docs_text holds one list of section records per HTML file.
docs_text = process_html_files(html_files_path)
print(f"Total documents processed: {len(docs_text)}")

processing:  /home/amrit/data-projects/llms/fasthtml-docs-bot/data/raw-data/index.html
processing:  /home/amrit/data-projects/llms/fasthtml-docs-bot/data/raw-data/ref/defining_xt_component.html
processing:  /home/amrit/data-projects/llms/fasthtml-docs-bot/data/raw-data/ref/live_reload.html
processing:  /home/amrit/data-projects/llms/fasthtml-docs-bot/data/raw-data/ref/handlers.html
processing:  /home/amrit/data-projects/llms/fasthtml-docs-bot/data/raw-data/api/oauth.html
processing:  /home/amrit/data-projects/llms/fasthtml-docs-bot/data/raw-data/api/core.html
processing:  /home/amrit/data-projects/llms/fasthtml-docs-bot/data/raw-data/api/cli.html
processing:  /home/amrit/data-projects/llms/fasthtml-docs-bot/data/raw-data/api/fastapp.html
processing:  /home/amrit/data-projects/llms/fasthtml-docs-bot/data/raw-data/api/components.html
processing:  /home/amrit/data-projects/llms/fasthtml-docs-bot/data/raw-data/api/xtend.html
processing:  /home/amrit/data-projects/llms/fasthtml-docs-bot/dat

In [7]:
docs_text[0:1]

[[{'source': 'https://docs.fastht.ml/index.html#installation',
   'text': 'Installation Since fasthtml is a Python library, you can install it with: pip install python-fasthtml In the near future, we hope to add component libraries that can likewise be installed via pip .',
   'previous_section': None,
   'next_section': 'https://docs.fastht.ml/index.html#usage',
   'metadata': {'page_heading': 'FastHTML', 'section_id': 'installation'}},
  {'source': 'https://docs.fastht.ml/index.html#usage',
   'text': 'Usage For a minimal app, create a file “main.py” as follows: main.py from fasthtml.common import * app,rt = fast_app() @rt(\'/\') def get(): return Div(P(\'Hello World!\'), hx_get="/change") serve() Running the app with python main.py prints out a link to your running app: http://localhost:5001 . Visit that link in your browser and you should see a page with the text “Hello World!”. Congratulations, you’ve just created your first FastHTML app! Adding interactivity is surprisingly easy,

In [3]:
from typing import List, Dict
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [9]:
class Preprocessor:
    """Splits section records into smaller chunks that keep the section's provenance."""

    def __init__(self, chunk_size: int, chunk_overlap: int):
        """Build the text splitter.

        Args:
            chunk_size: Maximum chunk length, measured with ``len`` (characters).
            chunk_overlap: Overlap between consecutive chunks, same unit.
        """
        splitter_options = dict(
            # Tried in order: paragraph break, line break, space, then per character.
            separators=["\n\n", "\n", " ", ""],
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
        )
        self.text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

    def chunk_section(self, section: Dict[str, str]) -> List[Dict[str, str]]:
        """Split one section record into chunk records.

        Args:
            section: Record with required keys ``text`` and ``source``; the keys
                ``previous_section``, ``next_section`` and ``metadata`` are
                optional and default to ``None``.

        Returns:
            One dict per chunk with the keys ``text``, ``source``,
            ``previous_section``, ``next_section`` and ``metadata``; everything
            except ``text`` is read back from the chunk's metadata.
        """
        section_text = section["text"]
        carried = {
            "source": section["source"],
            "previous_section": section.get("previous_section"),
            "next_section": section.get("next_section"),
            "metadata": section.get("metadata"),
        }
        documents = self.text_splitter.create_documents(
            texts=[section_text],
            metadatas=[carried],
        )

        records = []
        for document in documents:
            meta = document.metadata
            records.append({
                "text": document.page_content,
                "source": meta["source"],
                "previous_section": meta["previous_section"],
                "next_section": meta["next_section"],
                "metadata": meta["metadata"],
            })
        return records

In [10]:
if __name__ == "__main__":

    # Chunk size and overlap are taken from the project settings.
    preprocessor = Preprocessor(chunk_size=settings.CHUNK_SIZE, chunk_overlap=settings.CHUNK_OVERLAP)
    chunked_docs_text = []
    # docs_text is a list of per-file section lists, so two loops are needed to
    # reach each section; all chunks end up in one flat list.
    for doc in docs_text:
        for section in doc:
            chunked_sections = preprocessor.chunk_section(section)
            chunked_docs_text.extend(chunked_sections)
    
    print(f"Total chunks created: {len(chunked_docs_text)}")

Total chunks created: 534


In [11]:
chunked_docs_text[0:5]

[{'text': 'Installation Since fasthtml is a Python library, you can install it with: pip install python-fasthtml In the near future, we hope to add component libraries that can likewise be installed via pip .',
  'source': 'https://docs.fastht.ml/index.html#installation',
  'previous_section': None,
  'next_section': 'https://docs.fastht.ml/index.html#usage',
  'metadata': {'page_heading': 'FastHTML', 'section_id': 'installation'}},
 {'text': 'Usage For a minimal app, create a file “main.py” as follows: main.py from fasthtml.common import * app,rt = fast_app() @rt(\'/\') def get(): return Div(P(\'Hello World!\'), hx_get="/change") serve() Running the app with python main.py prints out a link to your running app: http://localhost:5001 . Visit that link in your browser and you should see a page with the text “Hello World!”. Congratulations, you’ve just created your first FastHTML app! Adding interactivity is surprisingly easy, thanks to',
  'source': 'https://docs.fastht.ml/index.html#us

In [62]:
from typing import List, Dict
from langchain_openai import OpenAIEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings

In [63]:
# get_embedding_model: Returns an embedding model based on the specified model name.
# OpenAIEmbeddings: Uses OpenAI’s API for embedding.
# HuggingFaceEmbeddings: Uses Hugging Face’s models for embedding.

def get_embedding_model(embedding_model_name, model_kwargs, encode_kwargs):
    """Create the embedding model that matches ``embedding_model_name``.

    Args:
        embedding_model_name: ``"text-embedding-ada-002"`` selects
            ``OpenAIEmbeddings``; any other value is passed to
            ``HuggingFaceEmbeddings`` as ``model_name``.
        model_kwargs: Forwarded to ``HuggingFaceEmbeddings`` only.
        encode_kwargs: Forwarded to ``HuggingFaceEmbeddings`` only.

    Returns:
        The constructed embedding model.

    Raises:
        KeyError: For the OpenAI model when ``OPENAI_API_BASE`` or
            ``OPENAI_API_KEY`` is missing from the environment.
    """
    if embedding_model_name != "text-embedding-ada-002":
        return HuggingFaceEmbeddings(
            model_name=embedding_model_name,  # also works with model_path
            model_kwargs=model_kwargs,
            encode_kwargs=encode_kwargs,
        )

    # OpenAI endpoint and credentials are read from the environment.
    return OpenAIEmbeddings(
        model=embedding_model_name,
        openai_api_base=os.environ["OPENAI_API_BASE"],
        openai_api_key=os.environ["OPENAI_API_KEY"],
    )

# EmbedChunks: A class to embed chunks using the specified model.
# __init__: Initializes the embedding model.
# __call__: Embeds the text in the batch and returns the embeddings along with the original text and source.

class EmbedChunks:
    """Callable that embeds a whole batch of chunks in one go."""

    def __init__(self, model_name):
        """Load the embedding model named ``model_name`` (device fixed to "cuda")."""
        hf_model_kwargs = {"device": "cuda"}
        hf_encode_kwargs = {"device": "cuda", "batch_size": 100}
        self.embedding_model = get_embedding_model(
            embedding_model_name=model_name,
            model_kwargs=hf_model_kwargs,
            encode_kwargs=hf_encode_kwargs,
        )

    def __call__(self, batch):
        """Embed ``batch["text"]``.

        Args:
            batch: Mapping with the keys ``text`` and ``source``.

        Returns:
            Dict with the batch's ``text`` and ``source`` plus ``embeddings``,
            the result of ``embed_documents`` on the texts.
        """
        texts = batch["text"]
        vectors = self.embedding_model.embed_documents(texts)
        return {"text": texts, "source": batch["source"], "embeddings": vectors}


class Embedder:
    """Embeds chunk records and pairs every chunk with its embedding vector."""

    def __init__(self, model_name: str, device: str = "cuda", batch_size: int = 100):
        """Load the embedding model.

        Args:
            model_name: Name handed to ``get_embedding_model``.
            device: Device placed in both ``model_kwargs`` and ``encode_kwargs``
                (default ``"cuda"``, the value that used to be hard-coded).
            batch_size: Encoding batch size placed in ``encode_kwargs``.
        """
        # get_embedding_model's first parameter is `embedding_model_name`;
        # passing `model_name=` raises TypeError (unexpected keyword argument).
        self.embedding_model = get_embedding_model(
            embedding_model_name=model_name,
            model_kwargs={"device": device},
            encode_kwargs={"device": device, "batch_size": batch_size}
        )

    def embed_chunks(self, chunks: List[Dict[str, str]]) -> List[Dict[str, str]]:
        """Embed the ``text`` of every chunk in a single ``embed_documents`` call.

        Args:
            chunks: Records containing at least ``text`` and ``source``.

        Returns:
            One dict per chunk, in input order, with the keys ``text``,
            ``source`` and ``embedding``. Other keys of the input records
            are not carried over.
        """
        texts = [chunk["text"] for chunk in chunks]
        embeddings = self.embedding_model.embed_documents(texts)
        return [
            {"text": chunk["text"], "source": chunk["source"], "embedding": embedding}
            for chunk, embedding in zip(chunks, embeddings)
        ]

In [14]:
if __name__ == "__main__":
    # The model name comes from the project settings.
    embedder = Embedder(model_name=settings.EMBEDDING_MODEL_NAME)
    # One {"text", "source", "embedding"} record per chunk produced by the chunking cell.
    embedded_chunks = embedder.embed_chunks(chunked_docs_text)

    print(f"Total embeddings created: {len(embedded_chunks)}")

  from tqdm.autonotebook import tqdm, trange
INFO:datasets:PyTorch version 2.4.1 available.
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: thenlper/gte-base


Total embeddings created: 534


In [64]:
# src/storage/vector_store.py
import os
from langchain_chroma import Chroma
from langchain_core.documents import Document

In [65]:
class VectorStore:
    """Small wrapper around a persisted Chroma collection named "fasthtml_docs_db"."""

    def __init__(self, embedding_model, persist_directory: Path):
        """Open the collection.

        Args:
            embedding_model: Passed to Chroma as ``embedding_function``.
            persist_directory: Storage directory; converted to an absolute path.
        """
        storage_path = os.path.abspath(persist_directory)
        self.vector_store = Chroma(
            collection_name="fasthtml_docs_db",
            embedding_function=embedding_model,
            persist_directory=storage_path,
        )

    def add_documents(self, documents: List[Dict[str, str]]):
        """Store ``documents`` in the collection and log how many were added.

        Only the ``text`` and ``source`` keys of each record are used; any
        other key is ignored.
        """
        docs = []
        for record in documents:
            docs.append(
                Document(page_content=record["text"], metadata={"source": record["source"]})
            )
        self.vector_store.add_documents(docs)
        logger.info(f"Added {len(docs)} documents to the vector store")

    def similarity_search(self, query: str, k: int = 5) -> List[Dict[str, str]]:
        """Return up to ``k`` matches for ``query``.

        Each match is a dict with ``text``, ``source`` (``""`` when the stored
        metadata has none) and ``score``, the value reported by Chroma's
        ``similarity_search_with_score``.
        """
        hits = self.vector_store.similarity_search_with_score(query, k=k)
        matches = []
        for document, score in hits:
            matches.append({
                "text": document.page_content,
                "source": document.metadata.get("source", ""),
                "score": score,
            })
        return matches

In [73]:
if __name__ == "__main__":
    # Embedding model Chroma uses for both stored documents and queries.
    embedding_model = HuggingFaceEmbeddings(
        model_name=settings.EMBEDDING_MODEL_NAME,
        model_kwargs={"device": "cuda"},
        encode_kwargs={"device": "cuda", "batch_size": 100}
    )

    # Open (or create) the persisted collection under VECTOR_STORE_DIR.
    vectorStore = VectorStore(embedding_model=embedding_model, persist_directory=settings.VECTOR_STORE_DIR)
    
    # Requires `embedded_chunks` from the embedding cell. add_documents reads only
    # "text" and "source" from each record; the "embedding" values are not passed on.
    # NOTE(review): re-running this cell presumably stores the same documents
    # again in the persisted collection; confirm before re-executing.
    vectorStore.add_documents(embedded_chunks)

    # Reach the underlying Chroma client through the wrapper to count stored items.
    # `_client` is a private attribute of the Chroma wrapper (leading underscore).
    client = vectorStore.vector_store._client
    collection = client.get_collection(name="fasthtml_docs_db")

    print(f"Total items in {collection}: {collection.count()}")

INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: thenlper/gte-base
INFO:chromadb.telemetry.posthog:Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.


Total items in name='fasthtml_docs_db' id=UUID('f5836f62-38aa-40d4-add4-b2b4d063ece3') metadata=None: 534


In [12]:
vectorStore.similarity_search('how to install fasthtml?')

[{'text': 'Installing FastHTML FastHTML is just Python . Installation is often done with pip: pip install python-fasthtml',
  'source': 'https://docs.fastht.ml/tutorials/tutorial_for_web_devs.html#installing-fasthtml',
  'score': 0.1384994089603424},
 {'text': 'Install FastHTML For Mac, Windows and Linux, enter: pip install python-fasthtml',
  'source': 'https://docs.fastht.ml/tutorials/e2e.html#install-fasthtml',
  'score': 0.1573334038257599},
 {'text': 'Installation pip install python-fasthtml',
  'source': 'https://docs.fastht.ml/tutorials/quickstart_for_web_devs.html#installation',
  'score': 0.1606634110212326},
 {'text': 'Installation Since fasthtml is a Python library, you can install it with: pip install python-fasthtml In the near future, we hope to add component libraries that can likewise be installed via pip .',
  'source': 'https://docs.fastht.ml/index.html#installation',
  'score': 0.1731264591217041},
 {'text': 'FastHTML Basics FastHTML is just Python . You can install 

### Generate Response

In [66]:
import openai
import time

from langchain_core.callbacks import StreamingStdOutCallbackHandler
from langchain_huggingface import HuggingFaceEndpoint

In [67]:
def response_stream(chat_completion, llm):
    """Yield the text pieces of a streamed completion.

    Args:
        chat_completion: Iterable of stream chunks.
        llm: Model name. Names starting with ``"gpt"`` are read as
            OpenAI-style chunks (``chunk.choices[0].delta.content``, ``None``
            contents skipped); for any other name the items are yielded
            unchanged.

    Yields:
        The text of each chunk.
    """
    openai_style = llm.startswith("gpt")
    for piece in chat_completion:
        if not openai_style:
            yield piece
            continue
        delta_text = piece.choices[0].delta.content
        if delta_text is not None:
            yield delta_text

def prepare_response(chat_completion, stream, llm):
    """Convert a raw completion into what the caller consumes.

    Args:
        chat_completion: Object returned by the LLM client.
        stream: When truthy, return the generator from ``response_stream``.
        llm: Model name; names starting with ``"gpt"`` are treated as OpenAI
            chat completions.

    Returns:
        A generator of text pieces when streaming; otherwise the message
        content for ``gpt*`` models, or ``chat_completion`` itself for any
        other model.
    """
    if stream:
        return response_stream(chat_completion, llm)
    if not llm.startswith("gpt"):
        return chat_completion
    return chat_completion.choices[0].message.content


def get_client(llm):
    """Build the API client for the requested model.

    Args:
        llm: Model name. Names starting with ``"gpt"`` get an ``openai.OpenAI``
            client; any other name is used as the ``repo_id`` of a
            ``HuggingFaceEndpoint``.

    Returns:
        The client object.

    Raises:
        KeyError: When a required environment variable is missing:
            ``OPENAI_API_BASE`` / ``OPENAI_API_KEY`` for ``gpt*`` models,
            ``HUGGINGFACEHUB_API_TOKEN`` otherwise.
    """
    if llm.startswith("gpt"):
        base_url = os.environ["OPENAI_API_BASE"]
        api_key = os.environ["OPENAI_API_KEY"]
        return openai.OpenAI(base_url=base_url, api_key=api_key)

    # SECURITY: the token must come from the environment and never be written
    # into the notebook. Any token that was committed here earlier should be
    # treated as leaked and revoked.
    api_key = os.environ["HUGGINGFACEHUB_API_TOKEN"]
    return HuggingFaceEndpoint(huggingfacehub_api_token=api_key, repo_id=llm)

In [68]:
def generate_response(
    llm, temperature=0.0, stream=True, 
    system_content="", assistant_content="", user_content="", 
    max_retries=1, retry_interval=60):
    """Generate a response from an LLM, retrying on failure.

    Args:
        llm: Model name; names starting with ``"gpt"`` use the OpenAI chat
            completions API, any other name is called through
            ``client.invoke``.
        temperature: Sampling temperature forwarded to the client.
        stream: Whether to request and return a streamed response.
        system_content: Text of the system message (left out when empty).
        assistant_content: Text of the assistant message (left out when empty).
        user_content: Text of the user message (left out when empty).
        max_retries: Number of retries after the first attempt, so up to
            ``max_retries + 1`` calls are made.
        retry_interval: Seconds to wait between two attempts.

    Returns:
        The value of ``prepare_response`` for the first successful call, or
        ``""`` when every attempt raised.
    """
    retry_count = 0
    client = get_client(llm=llm)

    # Only roles with non-empty content become messages.
    prompt = [("system", system_content), ("assistant", assistant_content), ("user", user_content)]
    messages = [{"role": role, "content": content} for role, content in prompt if content]

    while retry_count <= max_retries:
        try:
            if llm.startswith("gpt"):
                chat_completion = client.chat.completions.create(
                    model=llm,
                    temperature=temperature,
                    stream=stream,
                    messages=messages,
                )
            else:
                chat_completion = client.invoke(
                    repo_id=llm,
                    temperature=temperature,
                    streaming=stream,
                    input=messages,
                )
            return prepare_response(chat_completion, stream=stream, llm=llm)

        except Exception as e:
            # Best effort: report the failure and try again.
            print(f'Exception: {e}')
            # Wait only when another attempt follows; sleeping after the last
            # failure would just delay the empty result.
            if retry_count < max_retries:
                time.sleep(retry_interval)  # default matches per-minute rate limits
            retry_count += 1
    return ""

In [53]:
# Retrieve the closest chunks for a sample question (similarity_search defaults to k=5).
context_results = vectorStore.similarity_search('how to install fasthtml?')
# Keep only the chunk texts; they become the LLM context in the next cell.
context = [item["text"] for item in context_results]
print(context)

['Installing FastHTML FastHTML is just Python . Installation is often done with pip: pip install python-fasthtml', 'Install FastHTML For Mac, Windows and Linux, enter: pip install python-fasthtml', 'Installation pip install python-fasthtml', 'Installation Since fasthtml is a Python library, you can install it with: pip install python-fasthtml In the near future, we hope to add component libraries that can likewise be installed via pip .', 'FastHTML Basics FastHTML is just Python . You can install it with pip install python-fasthtml . Extensions/components built for it can likewise be distributed via PyPI or as simple Python files. The core usage of FastHTML is to define routes, and then to define what to do at each route. This is similar to the FastAPI web framework (in fact we implemented much of the functionality to match the FastAPI usage examples), but where FastAPI focuses on returning JSON data to build APIs, FastHTML']


In [54]:
# Generate response
query = 'how to install fasthtml?'
response = generate_response(
    llm="mistralai/Mistral-7B-Instruct-v0.3",
    temperature=0.1,
    stream=True,
    system_content="Answer the query using the context provided. Be succinct.",
    # `context` is the list of retrieved texts from the previous cell; the
    # f-string embeds the list's repr in the prompt.
    user_content=f"query: {query}, context: {context}"
)

# Stream response: with stream=True generate_response returns a generator of text pieces.
for content in response:
    print(content, end='', flush=True)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/amrit/.cache/huggingface/token
Login successful


To install FastHTML, use the command `pip install python-fasthtml`. This command works on Mac, Windows, and Linux systems. FastHTML is a Python library, so it can be installed using pip.

In [55]:
import tiktoken


def get_num_tokens(text):
    """Return how many tokens ``text`` has under tiktoken's "cl100k_base" encoding."""
    encoder = tiktoken.get_encoding("cl100k_base")
    token_ids = encoder.encode(text)
    return len(token_ids)


def trim(text, max_context_length):
    """Cut ``text`` down to at most ``max_context_length`` tokens ("cl100k_base").

    Args:
        text: Text to shorten.
        max_context_length: Token budget. Zero or a negative value yields
            ``""``; ``None`` leaves the text untrimmed.

    Returns:
        The decoded text of the first ``max_context_length`` tokens.
    """
    # A negative budget would slice from the END of the token list and keep
    # almost everything instead of nothing.
    if max_context_length is not None and max_context_length <= 0:
        return ""
    enc = tiktoken.get_encoding("cl100k_base")
    return enc.decode(enc.encode(text)[:max_context_length])

In [79]:
class QueryAgent:
    """Answers a query from retrieved documentation chunks (retrieve, then generate)."""

    def __init__(self, embedding_model_name="thenlper/gte-base",
                 llm="mistralai/Mistral-7B-Instruct-v0.3", temperature=0.1, 
                 max_context_length=4096, system_content="", assistant_content="",
                 vector_store=None
                ):
        """Configure the agent.

        Args:
            embedding_model_name: Name passed to ``get_embedding_model``.
            llm: Model name passed to ``generate_response``.
            temperature: Sampling temperature for the LLM.
            max_context_length: Total context length of the LLM in tokens;
                only half of it is budgeted for the input.
            system_content: System prompt.
            assistant_content: Assistant prompt.
            vector_store: Object with a ``similarity_search(query=..., k=...)``
                method. When ``None``, the notebook-level ``vectorStore`` is
                looked up at call time.
        """
        # Embedding model
        self.embedding_model = get_embedding_model(
            embedding_model_name = embedding_model_name,
            model_kwargs = {"device": "cuda"},
            encode_kwargs = {"device": "cuda", "batch_size": 100}
        )

        # Context length (restrict input length to 50% of total context length)
        max_context_length = int(0.5*max_context_length)

        # LLM
        self.llm = llm
        self.temperature = temperature
        # Tokens left for the user message once the fixed prompts are accounted for.
        self.context_length = max_context_length - get_num_tokens(system_content + assistant_content)
        self.system_content = system_content
        self.assistant_content = assistant_content

        # Retrieval backend; resolved lazily in __call__ when not supplied.
        self.vector_store = vector_store

    def __call__(self, query, num_chunks=5, stream=True):
        """Retrieve context for ``query`` and generate an answer.

        Args:
            query: User question.
            num_chunks: Number of chunks to retrieve.
            stream: Forwarded to ``generate_response``.

        Returns:
            Dict with the keys ``question``, ``sources``, ``answer`` and ``llm``.
        """
        # Use the injected store when there is one, else the notebook global.
        store = self.vector_store if self.vector_store is not None else vectorStore
        # num_chunks must reach the search call, otherwise the argument has no effect.
        context_results = store.similarity_search(query=query, k=num_chunks)

        # Generate response
        context = [item["text"] for item in context_results]
        sources = [item["source"] for item in context_results]
        user_content = f"query: {query}, context: {context}"
        answer = generate_response(
            llm=self.llm,
            temperature=self.temperature,
            stream=stream,
            system_content=self.system_content,
            assistant_content=self.assistant_content,
            user_content=trim(user_content, self.context_length))

        # Result
        result = {
            "question": query,
            "sources": sources,
            "answer": answer,
            "llm": self.llm,
        }
        return result

# context_results = vectorStore.similarity_search('how to install fasthtml?')
# context = [item["text"] for item in context_results]

# query = 'how to install fasthtml?'
# response = generate_response(
#     llm="mistralai/Mistral-7B-Instruct-v0.3",
#     temperature=0.1,
#     stream=True,
#     system_content="Answer the query using the context provided. Be succinct.",
#     user_content=f"query: {query}, context: {context}"
# )

In [80]:
import json
from config.config import MAX_CONTEXT_LENGTHS

In [81]:
# Models used by the QueryAgent demo below.
embedding_model_name = "thenlper/gte-base"
# `llm` is also the lookup key into MAX_CONTEXT_LENGTHS in the next cell.
llm = "mistralai/Mistral-7B-Instruct-v0.3"

In [82]:
# End-to-end check: retrieve context and answer one question.
query = 'how to install fasthtml?'
system_content = "Answer the query using the context provided. Be succinct."
agent = QueryAgent(
    embedding_model_name=embedding_model_name,
    llm=llm,
    max_context_length=MAX_CONTEXT_LENGTHS[llm],
    system_content=system_content)
# stream=False so that "answer" is a plain value json.dumps can serialise
# (a streamed answer would be a generator).
result = agent(query=query, stream=False)
print(json.dumps(result, indent=2))

INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: thenlper/gte-base


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/amrit/.cache/huggingface/token
Login successful
{
  "question": "how to install fasthtml?",
  "sources": [
    "https://docs.fastht.ml/tutorials/tutorial_for_web_devs.html#installing-fasthtml",
    "https://docs.fastht.ml/tutorials/e2e.html#install-fasthtml",
    "https://docs.fastht.ml/tutorials/quickstart_for_web_devs.html#installation",
    "https://docs.fastht.ml/index.html#installation",
    "https://docs.fastht.ml/tutorials/by_example.html#fasthtml-basics"
  ],
  "answer": "\n\nTo install FastHTML, use the command `pip install python-fasthtml`. This command works on Mac, Windows, and Linux systems. FastHTML is a Python library, so it can be installed using pip.",
  "llm": "mistralai/Mist