In [None]:
# | default_exp _helper

In [None]:
# | export


from pathlib import Path
from typing import *
import logging
from urllib.request import Request, urlopen
from urllib.parse import urlparse, urljoin
from urllib.error import HTTPError
import zipfile
import os
import glob

from bs4 import BeautifulSoup
from langchain.chat_models import ChatOpenAI
from llama_index import (
    LLMPredictor,
    ServiceContext,
)

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
from tempfile import TemporaryDirectory
import shutil

In [None]:
# | export

def _fetch_same_host_links(page_url: str) -> Set[str]:
    """Download a single page and collect the links on it that stay on the same host.

    Args:
        page_url: The URL of the page to download and parse.

    Returns:
        The set of URLs found in ``<a href=...>`` tags whose hostname equals the
        hostname of ``page_url``. Each URL has its fragment removed and leading and
        trailing slashes stripped.

    Raises:
        HTTPError: Propagated from ``urlopen`` when the request fails with an HTTP error.
    """
    req = Request(page_url)
    # nosemgrep: python.lang.security.audit.dynamic-urllib-use-detected.dynamic-urllib-use-detected
    with urlopen(req) as html_page:  # nosec B310
        # Parse while the response is still open; leaving the ``with`` block closes it.
        soup = BeautifulSoup(html_page, "lxml")

    parsed_page = urlparse(page_url)
    # Links are resolved against the site root. The explicit port (if any) is kept so
    # that relative links on a site served on a non-default port stay on that port.
    port = "" if parsed_page.port is None else f":{parsed_page.port}"
    base_url = parsed_page.scheme + '://' + parsed_page.hostname + port  # type: ignore

    links = set()
    for anchor in soup.find_all('a', href=True):
        url = urljoin(base_url, anchor['href']).split("#")[0].strip("/")
        if urlparse(url).hostname == parsed_page.hostname:
            links.add(url)
    return links


def get_all_links_from_website(start_url: str, visited: Optional[set] = None) -> Set[str]:
    """Get a set of all links (URLs) found on the given website, starting from the given start URL.

    The site is crawled with an explicit work list instead of recursion, so the number
    of pages is not bounded by the interpreter's recursion limit. A URL that fails with
    an HTTP error is logged once and is not requested again during the same crawl.

    Args:
        start_url: The starting URL of the website.
        visited: Optional. A set of URLs that have already been visited. Defaults to an empty set.
            The set is updated in place and is also the object that is returned.

    Returns:
        A set of all links found on the website, i.e. every URL that was downloaded
        successfully plus whatever was already present in ``visited``.
    """
    if visited is None:
        visited = set()

    pending = [start_url]
    # Everything that has ever been put on the work list, including URLs that later
    # failed; prevents queueing (and re-downloading) the same URL more than once.
    scheduled = {start_url}

    while pending:
        page_url = pending.pop()
        try:
            links = _fetch_same_host_links(page_url)
        except HTTPError as e:
            logging.warning(f'Unable to parse: {e.url}')
            continue

        # Only pages that could be downloaded and parsed are reported as visited.
        visited.add(page_url)
        for link in links:
            if link not in visited and link not in scheduled:
                scheduled.add(link)
                pending.append(link)

    return visited

In [None]:
# Smoke test against the live documentation site: it needs network access, and the
# URLs asserted below depend on the pages the site serves at the time of the run.
all_links = get_all_links_from_website("https://fastkafka.airt.ai")
print(f"{len(all_links)=}\n\n")
display(all_links)

# The crawl must find at least the docs landing page and the changelog.
assert len(all_links) > 0
assert 'https://fastkafka.airt.ai/docs/CHANGELOG' in all_links
assert 'https://fastkafka.airt.ai/docs' in all_links



len(all_links)=76




{'https://fastkafka.airt.ai',
 'https://fastkafka.airt.ai/docs',
 'https://fastkafka.airt.ai/docs/0.5.0',
 'https://fastkafka.airt.ai/docs/0.5.0/CHANGELOG',
 'https://fastkafka.airt.ai/docs/0.5.0/api/fastkafka',
 'https://fastkafka.airt.ai/docs/0.5.0/api/fastkafka/KafkaEvent',
 'https://fastkafka.airt.ai/docs/0.5.0/api/fastkafka/encoder/avsc_to_pydantic',
 'https://fastkafka.airt.ai/docs/0.5.0/api/fastkafka/testing/ApacheKafkaBroker',
 'https://fastkafka.airt.ai/docs/0.5.0/api/fastkafka/testing/LocalRedpandaBroker',
 'https://fastkafka.airt.ai/docs/0.5.0/api/fastkafka/testing/Tester',
 'https://fastkafka.airt.ai/docs/0.5.0/cli/fastkafka',
 'https://fastkafka.airt.ai/docs/0.5.0/cli/run_fastkafka_server_process',
 'https://fastkafka.airt.ai/docs/0.5.0/guides/Guide_04_Github_Actions_Workflow',
 'https://fastkafka.airt.ai/docs/0.5.0/guides/Guide_05_Lifespan_Handler',
 'https://fastkafka.airt.ai/docs/0.5.0/guides/Guide_06_Benchmarking_FastKafka',
 'https://fastkafka.airt.ai/docs/0.5.0/guide

In [None]:
# | export


def extract_latest_doc_urls(start_url: str, urls: List[str]) -> List[str]:
    """Extract latest documentation URLs from a list of URLs.

    A URL is dropped when the first path segment after ``<start_url>/docs/`` is either
    ``next`` or a version number (only digits and dots, e.g. ``0.6.0``). URLs that do
    not contain ``<start_url>/docs/`` at all are kept unchanged.

    Args:
        start_url: The URL of the documentation homepage. A trailing slash is ignored.
        urls: A list of documentation URLs to be filtered.

    Returns:
        A new list containing only the latest version of the documentation URLs,
        in the same order as they appear in ``urls``.
    """
    # Without stripping, a start URL such as "https://site/" would build the prefix
    # "https://site//docs/", which never matches and silently disables the filter.
    docs_prefix = f"{start_url.rstrip('/')}/docs/"

    def _is_latest(url: str) -> bool:
        parts = url.split(docs_prefix)
        if len(parts) == 1:
            # Not below the docs prefix (e.g. the homepage or "<start_url>/docs" itself).
            return True
        identifier = parts[1].split("/")[0]
        is_version = identifier.replace(".", "").isdigit()
        return identifier != "next" and not is_version

    return [url for url in urls if _is_latest(url)]

In [None]:
# Fixture: a list of crawled URLs that mixes unversioned pages with their "next" variants.
# Only the unversioned pages (plus the homepage and the docs root) are expected to remain.
fixture_all_links = ['https://fastkafka.airt.ai/docs/next/api/fastkafka/KafkaEvent', 'https://fastkafka.airt.ai/docs/guides/Guide_11_Consumes_Basics', 'https://fastkafka.airt.ai/docs/next/api/fastkafka/testing/Tester', 'https://fastkafka.airt.ai/docs', 'https://fastkafka.airt.ai/docs/next/api/fastkafka/testing/ApacheKafkaBroker', 'https://fastkafka.airt.ai/docs/next/guides/Guide_05_Lifespan_Handler', 'https://fastkafka.airt.ai/docs/next/guides/Guide_11_Consumes_Basics', 'https://fastkafka.airt.ai/docs/next/guides/Guide_21_Produces_Basics', 'https://fastkafka.airt.ai/docs/CHANGELOG', 'https://fastkafka.airt.ai/docs/next/guides/Guide_22_Partition_Keys', 'https://fastkafka.airt.ai/docs/next/guides/Guide_07_Encoding_and_Decoding_Messages_with_FastKafka', 'https://fastkafka.airt.ai/docs/next/api/fastkafka/testing/LocalRedpandaBroker', 'https://fastkafka.airt.ai/docs/next/CONTRIBUTING', 'https://fastkafka.airt.ai/docs/api/fastkafka/KafkaEvent', 'https://fastkafka.airt.ai/docs/guides/Guide_07_Encoding_and_Decoding_Messages_with_FastKafka', 'https://fastkafka.airt.ai', 'https://fastkafka.airt.ai/docs/guides/Guide_30_Using_docker_to_deploy_fastkafka', 'https://fastkafka.airt.ai/docs/next/api/fastkafka/encoder/avsc_to_pydantic', 'https://fastkafka.airt.ai/docs/guides/Guide_05_Lifespan_Handler', 'https://fastkafka.airt.ai/docs/next/cli/fastkafka', 'https://fastkafka.airt.ai/docs/api/fastkafka', 'https://fastkafka.airt.ai/docs/api/fastkafka/testing/ApacheKafkaBroker', 'https://fastkafka.airt.ai/docs/api/fastkafka/encoder/avsc_to_pydantic', 'https://fastkafka.airt.ai/docs/next/guides/Guide_04_Github_Actions_Workflow', 'https://fastkafka.airt.ai/docs/guides/Guide_06_Benchmarking_FastKafka', 'https://fastkafka.airt.ai/docs/next/guides/Guide_06_Benchmarking_FastKafka', 'https://fastkafka.airt.ai/docs/next/CHANGELOG', 'https://fastkafka.airt.ai/docs/guides/Guide_21_Produces_Basics', 'https://fastkafka.airt.ai/docs/next/guides/Guide_30_Using_docker_to_deploy_fastkafka', 
'https://fastkafka.airt.ai/docs/next/guides/Guide_31_Using_redpanda_to_test_fastkafka', 'https://fastkafka.airt.ai/docs/guides/Guide_22_Partition_Keys', 'https://fastkafka.airt.ai/docs/api/fastkafka/testing/LocalRedpandaBroker', 'https://fastkafka.airt.ai/docs/next/api/fastkafka', 'https://fastkafka.airt.ai/docs/guides/Guide_31_Using_redpanda_to_test_fastkafka', 'https://fastkafka.airt.ai/docs/cli/run_fastkafka_server_process', 'https://fastkafka.airt.ai/docs/guides/Guide_04_Github_Actions_Workflow', 'https://fastkafka.airt.ai/docs/api/fastkafka/testing/Tester', 'https://fastkafka.airt.ai/docs/next', 'https://fastkafka.airt.ai/docs/cli/fastkafka', 'https://fastkafka.airt.ai/docs/next/cli/run_fastkafka_server_process', 'https://fastkafka.airt.ai/docs/next/LICENSE']
start_url = "https://fastkafka.airt.ai"
actual = extract_latest_doc_urls(start_url, fixture_all_links)

display(actual)
# 20 entries of the fixture are not below ".../docs/next"; the bare ".../docs/next"
# URL itself must be filtered out as well.
assert len(actual) == 20
assert 'https://fastkafka.airt.ai/docs/next' not in actual

['https://fastkafka.airt.ai/docs/guides/Guide_11_Consumes_Basics',
 'https://fastkafka.airt.ai/docs',
 'https://fastkafka.airt.ai/docs/CHANGELOG',
 'https://fastkafka.airt.ai/docs/api/fastkafka/KafkaEvent',
 'https://fastkafka.airt.ai/docs/guides/Guide_07_Encoding_and_Decoding_Messages_with_FastKafka',
 'https://fastkafka.airt.ai',
 'https://fastkafka.airt.ai/docs/guides/Guide_30_Using_docker_to_deploy_fastkafka',
 'https://fastkafka.airt.ai/docs/guides/Guide_05_Lifespan_Handler',
 'https://fastkafka.airt.ai/docs/api/fastkafka',
 'https://fastkafka.airt.ai/docs/api/fastkafka/testing/ApacheKafkaBroker',
 'https://fastkafka.airt.ai/docs/api/fastkafka/encoder/avsc_to_pydantic',
 'https://fastkafka.airt.ai/docs/guides/Guide_06_Benchmarking_FastKafka',
 'https://fastkafka.airt.ai/docs/guides/Guide_21_Produces_Basics',
 'https://fastkafka.airt.ai/docs/guides/Guide_22_Partition_Keys',
 'https://fastkafka.airt.ai/docs/api/fastkafka/testing/LocalRedpandaBroker',
 'https://fastkafka.airt.ai/docs

In [None]:
# A top-level docs page: the unversioned URL must be kept, while its "next" and
# numbered-version ("0.6.0", "0.5.0") variants are filtered out.
fixture = [
    "https://fastkafka.airt.ai/docs/CHANGELOG",
    "https://fastkafka.airt.ai/docs/next/CHANGELOG",
    "https://fastkafka.airt.ai/docs/0.6.0/CHANGELOG",
    "https://fastkafka.airt.ai/docs/0.5.0/CHANGELOG",
]
start_url = "https://fastkafka.airt.ai"
expected = ["https://fastkafka.airt.ai/docs/CHANGELOG"]
actual = extract_latest_doc_urls(start_url, fixture)
print(actual)

assert actual == expected

['https://fastkafka.airt.ai/docs/CHANGELOG']


In [None]:
# Same check for a page one level below the docs root (".../docs/cli/fastkafka"):
# only the unversioned URL is expected to remain.
fixture = [
    "https://fastkafka.airt.ai/docs/cli/fastkafka",
    "https://fastkafka.airt.ai/docs/next/cli/fastkafka",
    "https://fastkafka.airt.ai/docs/0.6.0/cli/fastkafka",
    "https://fastkafka.airt.ai/docs/0.5.0/cli/fastkafka",
]
start_url = "https://fastkafka.airt.ai"
expected = ["https://fastkafka.airt.ai/docs/cli/fastkafka"]
actual = extract_latest_doc_urls(start_url, fixture)
print(actual)

assert actual == expected

['https://fastkafka.airt.ai/docs/cli/fastkafka']


In [None]:
# Same check for a deeply nested API page: only the first path segment after
# ".../docs/" decides whether a URL is versioned, so nesting depth must not matter.
fixture = [
    "https://fastkafka.airt.ai/docs/api/fastkafka/testing/ApacheKafkaBroker",
    "https://fastkafka.airt.ai/docs/next/api/fastkafka/testing/ApacheKafkaBroker",
    "https://fastkafka.airt.ai/docs/0.6.0/api/fastkafka/testing/ApacheKafkaBroker",
    "https://fastkafka.airt.ai/docs/0.5.0/api/fastkafka/testing/ApacheKafkaBroker",
]
start_url = "https://fastkafka.airt.ai"
expected = ["https://fastkafka.airt.ai/docs/api/fastkafka/testing/ApacheKafkaBroker"]
actual = extract_latest_doc_urls(start_url, fixture)
print(actual)

assert actual == expected

['https://fastkafka.airt.ai/docs/api/fastkafka/testing/ApacheKafkaBroker']


In [None]:
# | export

def get_service_context() -> ServiceContext:
    """Build the service context used for indexing and querying.

    The context wraps a ``ChatOpenAI`` chat model ("gpt-3.5-turbo", temperature 0)
    in an ``LLMPredictor`` and applies a chunk size limit of 512.

    Returns:
        A ServiceContext object with an LLMPredictor and a chunk size limit.
    """
    chat_model = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo")
    return ServiceContext.from_defaults(
        llm_predictor=LLMPredictor(llm=chat_model),
        chunk_size_limit=512,
    )

In [None]:
# NOTE(review): building the context instantiates ChatOpenAI, which presumably needs
# OpenAI credentials in the environment - confirm before running on a fresh machine.
service_context = get_service_context()

print(service_context)
# isinstance is the idiomatic type check (comparing types with == is flagged by linters).
assert isinstance(service_context, ServiceContext)



ServiceContext(llm_predictor=<llama_index.llm_predictor.base.LLMPredictor object>, prompt_helper=<llama_index.indices.prompt_helper.PromptHelper object>, embed_model=<llama_index.embeddings.openai.OpenAIEmbedding object>, node_parser=<llama_index.node_parser.simple.SimpleNodeParser object>, llama_logger=<llama_index.logger.base.LlamaLogger object>, callback_manager=<llama_index.callbacks.base.CallbackManager object>, chunk_size_limit=512)


In [None]:
# | export

def zip_index_files(data_dir_path: str) -> None:
    """Compresses all JSON index files within a folder into a ZIP archive.

    The archive is written to ``website_index.zip`` inside ``data_dir_path``. It
    contains every ``*.json`` file located directly in that folder, each stored under
    its bare file name and added in sorted order.

    Args:
        data_dir_path: The path of the folder to be compressed.
    """
    target_path = os.path.join(data_dir_path, 'website_index.zip')
    # Escape the folder part so that characters such as "[", "*" or "?" in the
    # directory name are matched literally instead of being treated as wildcards.
    pattern = os.path.join(glob.escape(os.fspath(data_dir_path)), '*.json')
    with zipfile.ZipFile(target_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        # glob returns matches in arbitrary order; sorting keeps the archive layout stable.
        for file_path in sorted(glob.glob(pattern)):
            zipf.write(file_path, arcname=os.path.basename(file_path))

In [None]:
# Work on a copy of the ../data folder (relative to the current working directory)
# inside a temporary directory, so the archive is not written next to the original files.
with TemporaryDirectory() as d:
    data_path = Path(d) / "data"
    shutil.copytree(Path("..") / "data" , data_path)

    zip_index_files(data_path)
    
    # The archive must have been created inside the copied folder.
    actual = [x.name for x in data_path.glob('**/*') if x.is_file()]
    print(actual)
    assert "website_index.zip" in actual

['.DS_Store', 'website_index.zip']


In [None]:
# | export

def unzip_index_files(zip_file_path: str) -> None:
    """Extract a ZIP archive into the directory that contains it.

    Args:
        zip_file_path: The path of the ZIP file to decompress. Its members are
            extracted into the folder part of this path.
    """
    destination, _archive_name = os.path.split(zip_file_path)
    with zipfile.ZipFile(zip_file_path, mode='r') as archive:
        archive.extractall(path=destination)

In [None]:
# Round-trip test: zip a folder of JSON files, delete the originals, unzip the
# archive and check that the files come back with their original content.
with TemporaryDirectory() as d:
    data_path = Path(d) / "data"
    data_path.mkdir(parents=True)
    test_files = ["docstore.json", "index_store.json", "vector_store.json"]
    
    for file in test_files:
        filepath = data_path / file
        with filepath.open("w", encoding ="utf-8") as f:
            f.write("dummy data")

    zip_index_files(data_path)
    
    actual = [x.name for x in data_path.glob('**/*') if x.is_file()]
    assert "website_index.zip" in actual, actual

    # Remove the originals first; otherwise the assertions below would pass even if
    # unzip_index_files extracted nothing, because the files would still be on disk.
    for file in test_files:
        (data_path / file).unlink()
    assert not list(data_path.glob('**/*.json'))
    
    zip_file_path = data_path / "website_index.zip"
    unzip_index_files(zip_file_path)
    
    actual = [x.name for x in data_path.glob('**/*.json') if x.is_file()]
    print(actual)
    assert sorted(actual) == sorted(test_files)
    # The restored files must carry the content that was written before zipping.
    for file in test_files:
        assert (data_path / file).read_text(encoding="utf-8") == "dummy data"

['vector_store.json', 'index_store.json', 'docstore.json']
