In [5]:
import os
import requests
from pathlib import Path
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from tenacity import retry, stop_after_attempt, wait_fixed

In [2]:
# Set the output directory
data_dir = 'data/raw-data'
os.makedirs(data_dir, exist_ok=True)

In [20]:
# Set to keep track of downloaded URLs
downloaded_urls = set()

# Retry configuration
@retry(stop=stop_after_attempt(3), wait=wait_fixed(2))
def fetch_url(url):
    response = requests.get(url)
    response.raise_for_status()
    return response

# Function to download a page
def download_page(url, output_dir):
    if url in downloaded_urls:
        return
    downloaded_urls.add(url)
    
    try:
        response = fetch_url(url)
    except requests.exceptions.RequestException as e:
        print(f'Failed to download: {url} due to {e}')
        return
    
    soup = BeautifulSoup(response.text, 'html.parser')
    # Save the HTML file
    file_name = url.split('/')[-1] or 'index.html'
    file_path = os.path.join(output_dir, file_name)
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(soup.prettify())
    print(f'Downloaded: {url}')
    
    # Find and download all linked pages
    for link in soup.find_all('a', href=True):
        link_url = urljoin(url, link['href'])
        if link_url.startswith(base_url) and link_url.endswith('.html'):
            download_page(link_url, output_dir)

In [21]:
# Base URL
base_url = 'https://docs.fastht.ml/'

# Start downloading from the base URL
download_page(base_url, data_dir)


Downloaded: https://docs.fastht.ml/
Downloaded: https://docs.fastht.ml/index.html
Downloaded: https://docs.fastht.ml/tutorials/index.html
Downloaded: https://docs.fastht.ml/tutorials/by_example.html
Downloaded: https://docs.fastht.ml/tutorials/quickstart_for_web_devs.html
Downloaded: https://docs.fastht.ml/tutorials/e2e.html
Downloaded: https://docs.fastht.ml/tutorials/tutorial_for_web_devs.html
Downloaded: https://docs.fastht.ml/explains/explaining_xt_components.html
Downloaded: https://docs.fastht.ml/explains/faq.html
Downloaded: https://docs.fastht.ml/explains/oauth.html
Downloaded: https://docs.fastht.ml/explains/routes.html
Downloaded: https://docs.fastht.ml/ref/defining_xt_component.html
Downloaded: https://docs.fastht.ml/ref/live_reload.html
Downloaded: https://docs.fastht.ml/api/core.html
Downloaded: https://docs.fastht.ml/api/components.html
Downloaded: https://docs.fastht.ml/api/xtend.html
Downloaded: https://docs.fastht.ml/api/js.html
Downloaded: https://docs.fastht.ml/api/p

#### Using `aiohttp` for Asynchronous Downloads
Using asynchronous requests can significantly speed up the downloading process. The aiohttp library allows you to perform asynchronous HTTP requests. Here’s an example of how you can use it:

In [None]:
import aiohttp
import asyncio

# Set to keep track of downloaded URLs
downloaded_urls = set()

async def fetch(session, url):
    async with session.get(url) as response:
        return await response.text()

async def download_page(session, url, output_dir):
    if url in downloaded_urls:
        return
    downloaded_urls.add(url)
    
    try:
        html = await fetch(session, url)
    except Exception as e:
        print(f'Failed to download: {url} due to {e}')
        return
    
    soup = BeautifulSoup(html, 'html.parser')
    # Save the HTML file
    file_name = url.split('/')[-1] or 'index.html'
    file_path = os.path.join(output_dir, file_name)
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(soup.prettify())
    print(f'Downloaded: {url}')
    
    # Find and download all linked pages
    tasks = []
    for link in soup.find_all('a', href=True):
        link_url = urljoin(url, link['href'])
        if link_url.startswith(base_url) and link_url.endswith('.html'):
            tasks.append(download_page(session, link_url, output_dir))
    await asyncio.gather(*tasks)

async def main():
    async with aiohttp.ClientSession() as session:
        await download_page(session, base_url, EFS_DIR)

# Run the asynchronous download
asyncio.run(main())

### Create a ChromaDB Collection
Next, we’ll use ChromaDB to store the vector embeddings of the downloaded HTML files.

In [8]:
import chromadb
from chromadb.config import Settings
from chromadb.utils import embedding_functions

In [6]:
# Set the directory path
docs_dir = Path(data_dir, "docs.fastht.ml/")

In [3]:
import ray

# Initialize Ray
ray.init()


2024-08-30 16:50:10,466	INFO worker.py:1772 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m


0,1
Python version:,3.11.9
Ray version:,2.33.0
Dashboard:,http://127.0.0.1:8265


In [8]:
# Create the Ray dataset directly from text files
ds = ray.data.read_text([str(path) for path in Path(data_dir).rglob("*.html")])

# Print the number of documents
print(f"{ds.count()} documents")

2024-08-30 17:06:25,687	INFO streaming_executor.py:108 -- Starting execution of Dataset. Full logs are in /tmp/ray/session_2024-08-30_16-50-04_465147_5943/logs/ray-data
2024-08-30 17:06:25,688	INFO streaming_executor.py:109 -- Execution plan of Dataset: InputDataBuffer[Input] -> TaskPoolMapOperator[ReadText]


- ReadText->SplitBlocks(2) 1: 0 bundle [00:00, ? bundle/s]

Running 0: 0 bundle [00:00, ? bundle/s]

28194 documents


In [10]:
first_rows = ds.take(20)
print(first_rows)

2024-08-30 17:21:20,514	INFO streaming_executor.py:108 -- Starting execution of Dataset. Full logs are in /tmp/ray/session_2024-08-30_16-50-04_465147_5943/logs/ray-data
2024-08-30 17:21:20,515	INFO streaming_executor.py:109 -- Execution plan of Dataset: InputDataBuffer[Input] -> TaskPoolMapOperator[ReadText] -> LimitOperator[limit=20]


- ReadText->SplitBlocks(2) 1: 0 bundle [00:00, ? bundle/s]

- limit=20 2: 0 bundle [00:00, ? bundle/s]

Running 0: 0 bundle [00:00, ? bundle/s]

[{'text': '<!DOCTYPE html>'}, {'text': '<html lang="en" xml:lang="en" xmlns="http://www.w3.org/1999/xhtml">'}, {'text': ' <head>'}, {'text': '  <meta charset="utf-8"/>'}, {'text': '  <meta content="quarto-1.5.56" name="generator"/>'}, {'text': '  <meta content="width=device-width, initial-scale=1.0, user-scalable=yes" name="viewport"/>'}, {'text': '  <meta content="Learn the foundations of FastHTML by creating your own blogging system from scratch." name="description"/>'}, {'text': '  <title>'}, {'text': '   BYO Blog – fasthtml'}, {'text': '  </title>'}, {'text': '  <style>'}, {'text': '   code{white-space: pre-wrap;}'}, {'text': 'span.smallcaps{font-variant: small-caps;}'}, {'text': 'div.columns{display: flex; gap: min(4vw, 1.5em);}'}, {'text': 'div.column{flex: auto; overflow-x: auto;}'}, {'text': 'div.hanging-indent{margin-left: 1.5em; text-indent: -1.5em;}'}, {'text': 'ul.task-list{list-style: none;}'}, {'text': 'ul.task-list li input[type="checkbox"] {'}, {'text': '  width: 0.8em;

In [11]:
df = ds.to_pandas()

2024-08-30 17:23:52,537	INFO streaming_executor.py:108 -- Starting execution of Dataset. Full logs are in /tmp/ray/session_2024-08-30_16-50-04_465147_5943/logs/ray-data
2024-08-30 17:23:52,538	INFO streaming_executor.py:109 -- Execution plan of Dataset: InputDataBuffer[Input] -> TaskPoolMapOperator[ReadText]


- ReadText->SplitBlocks(2) 1: 0 bundle [00:00, ? bundle/s]

Running 0: 0 bundle [00:00, ? bundle/s]

In [12]:
# Shutdown Ray
ray.shutdown()

In [18]:
df[0:100]

Unnamed: 0,text
0,<!DOCTYPE html>
1,"<html lang=""en"" xml:lang=""en"" xmlns=""http://ww..."
2,<head>
3,"<meta charset=""utf-8""/>"
4,"<meta content=""quarto-1.5.56"" name=""generato..."
...,...
95,"""s"""
96,"],"
97,"""show-item-context"": false,"
98,"""language"": {"


In [19]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def scrape_website(url, base_url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Extract text content
    content = soup.get_text()
    
    # Find all links to other pages
    links = soup.find_all('a', href=True)
    internal_links = [urljoin(base_url, link['href']) for link in links if link['href'].startswith('/') or base_url in link['href']]
    
    return content, internal_links

# Usage
base_url = "https://docs.fastht.ml/"
content, links = scrape_website(base_url, base_url)

In [20]:
content

'\n\n\n\n\n\nFastHTML – fasthtml\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nHome\n\n\n\nLearn\n\n\n\n\n \n\n\n\n\n \n\n\n\n\n \n\n\n \n\n\n\n\n\n\nGet Started\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nGet Started\n\n\n\n\n\nTutorials\n\n\n\n\n\n\n\n\nFastHTML By Example\n\n\n\n\n\nWeb Devs Quickstart\n\n\n\n\n\nJS App Walkthrough\n\n\n\n\n\nBYO Blog\n\n\n\n\n\n\n\nExplanations\n\n\n\n\n\n\n\n\nft Components\n\n\n\n\n\nFAQ\n\n\n\n\n\nOAuth\n\n\n\n\n\nRoutes\n\n\n\n\n\n\n\nReference\n\n\n\n\n\n\n\n\nCustom Components\n\n\n\n\n\nLive Reloading\n\n\n\n\n\n\n\nSource\n\n\n\n\n\n\n\n\nCore\n\n\n\n\n\nComponents\n\n\n\n\n\nComponent extensions\n\n\n\n\n\nJavascript examples\n\n\n\n\n\nPico.css components\n\n\n\n\n\nOAuth\n\n\n\n\n\nCommand Line Tools\n\n\n\n\n\nfastapp\n\n\n\n\n\n\n\n\n\n\n\nOn this page\n\nInstallation\nUsage\nNext Steps\n\nReport an issueOther FormatsCommonMark\n\n\n\n\n\nFastHTML\n\n\n\n    The fastest, most powerful way to create an HTML a

In [None]:
import logging
import os
import time
from urllib.parse import urlparse
from getpass import getpass

import treq
from scrapy.crawler import CrawlerProcess
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from slugify import slugify

DOCS_URL = "https://docs.fastht.ml/"
logger = logging.getLogger(__name__)

os.environ['JINA_API_KEY'] = getpass('Enter your JINA API Key: ')

def urljoin(*args):
  """
  Joins given arguments into an url. Trailing but not leading slashes are
  stripped for each argument.
  """

  return "/".join(map(lambda x: str(x).rstrip('/'), args))

class MarkdownPipeline:

  def create_directory_from_url_with_slug(self, url):
    parsed_url = urlparse(url)
    path_segments = parsed_url.path.strip('/').split('/')
    directory_path = './docs/' + self.collection
    for segment in path_segments[:-1]:
      directory_path = os.path.join(directory_path, segment)
      os.makedirs(directory_path, exist_ok=True)
    filename = slugify(path_segments[-1])
    return os.path.join(directory_path, filename)

  def open_spider(self, spider):
    self.collection = spider.domain.title().replace('.', '')
    os.makedirs(f'./docs/{self.collection}', exist_ok=True)

  async def process_item(self, item, spider):
    response = await treq.get('https://r.jina.ai/' + item.get('url'),
                              headers={
                                  'Content-Type':
                                  'text/plain',
                                  "Authorization":
                                  f"Bearer {os.environ['JINA_API_KEY']}"
                              })

    content = await response.text()
    url = item.get('url')

    directory = self.create_directory_from_url_with_slug(url)

    with open(directory + '.md', 'w') as f:
      f.write(content)

    return item

  def close_spider(self, spider):
    self.client.close()


class PagingIncremental(CrawlSpider):
  name = "docs"
  custom_settings = {
      'DOWNLOAD_DELAY': '0',
      'FEED_EXPORT_ENCODING': 'utf-8',
      'DEPTH_LIMIT': '0',
      'AUTOTHROTTLE_ENABLED': 'True',
      'AUTOTHROTTLE_START_DELAY': '1',
      'AUTOTHROTTLE_MAX_DELAY': '3',
      "AUTOTHROTTLE_TARGET_CONCURRENCY": '1'
  }
  rules = (Rule(LinkExtractor(allow=r""), callback='parse', follow=True), )

  def __init__(self, url, *args, **kwargs):
    super().__init__(*args, **kwargs)
    # Visit all found sublinks
    print(url)
    self.domain = urlparse(url).hostname
    self.domain_name = self.domain.split('.')[1]
    self.allowed_domains = [self.domain]

    self.start_urls = [url]

  def parse(self, response):

    item = {}
    item["url"] = response.url
    time.sleep(.1)
    yield item


def process_docs(url):
  process = CrawlerProcess({
      'USER_AGENT': 'Mozilla/5.0',
      'ITEM_PIPELINES': {
          '__main__.MarkdownPipeline': 1,
      },
  })
    
  process.crawl(PagingIncremental, url=url)
  process.start(stop_after_crawl=True)


if __name__ == "__main__":
  process_docs(DOCS_URL)