In [6]:
import os
from pathlib import Path
from urllib.parse import urljoin, urlparse
from concurrent.futures import ThreadPoolExecutor, as_completed

import requests
from bs4 import BeautifulSoup
from tenacity import retry, stop_after_attempt, wait_fixed


In [9]:


class HTMLDownloader:
    def __init__(self, base_url, output_dir):
        self.base_url = base_url
        self.output_dir = Path(output_dir)
        self.downloaded_urls = set()
        self.output_dir.mkdir(parents=True, exist_ok=True)

    @retry(stop=stop_after_attempt(3), wait=wait_fixed(2))
    def fetch_url(self, url):
        response = requests.get(url)
        response.raise_for_status()
        return response

    def download_page(self, url):
        if url in self.downloaded_urls:
            return

        try:
            response = self.fetch_url(url)
        except requests.exceptions.RequestException as e:
            print(f'Failed to download: {url} due to {e}')
            return

        self.downloaded_urls.add(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Create directory structure based on URL path
        parsed_url = urlparse(url)
        path_parts = parsed_url.path.strip('/').split('/')
        file_name = path_parts[-1] or 'index.html'
        dir_path = self.output_dir / Path(*path_parts[:-1])
        dir_path.mkdir(parents=True, exist_ok=True)
        file_path = dir_path / file_name
        
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(soup.prettify())
        print(f'Downloaded: {url}')

        return [
            urljoin(url, link['href'])
            for link in soup.find_all('a', href=True)
            if urljoin(url, link['href']).startswith(self.base_url) and 
               urljoin(url, link['href']).endswith('.html')
        ]

    def download_all(self):
        to_download = [self.base_url]
        with ThreadPoolExecutor(max_workers=10) as executor:
            while to_download:
                futures = [executor.submit(self.download_page, url) for url in to_download]
                to_download = []
                for future in as_completed(futures):
                    result = future.result()
                    if result:
                        to_download.extend(result)

BASE_URL: str = "https://docs.fastht.ml/"
RAW_DATA_DIR: str = "data/raw-data"

if __name__ == "__main__":
    downloader = HTMLDownloader(BASE_URL, RAW_DATA_DIR)
    downloader.download_all()


Downloaded: https://docs.fastht.ml/
Downloaded: https://docs.fastht.ml/index.htmlDownloaded: https://docs.fastht.ml/index.html
Downloaded: https://docs.fastht.ml/explains/faq.html

Downloaded: https://docs.fastht.ml/tutorials/e2e.html
Downloaded: https://docs.fastht.ml/tutorials/index.html
Downloaded: https://docs.fastht.ml/explains/explaining_xt_components.html
Downloaded: https://docs.fastht.ml/index.html
Downloaded: https://docs.fastht.ml/tutorials/tutorial_for_web_devs.html
Downloaded: https://docs.fastht.ml/tutorials/quickstart_for_web_devs.html
Downloaded: https://docs.fastht.ml/ref/defining_xt_component.html
Downloaded: https://docs.fastht.ml/explains/oauth.html
Downloaded: https://docs.fastht.ml/ref/live_reload.html
Downloaded: https://docs.fastht.ml/explains/routes.html
Downloaded: https://docs.fastht.ml/tutorials/by_example.html
Downloaded: https://docs.fastht.ml/api/xtend.html
Downloaded: https://docs.fastht.ml/api/components.html
Downloaded: https://docs.fastht.ml/ref/handl