In [5]:
import logging
import os
import time
from urllib.parse import urlparse
from getpass import getpass

import treq
from scrapy.crawler import CrawlerProcess
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from slugify import slugify

In [6]:
# Root URL of the documentation site to crawl.
DOCS_URL = "https://docs.fastht.ml/"
logger = logging.getLogger(__name__)

# The key is read from the environment when it is already set and prompted
# for otherwise, so it never has to be written into the notebook. Never paste
# a real key into a cell or a comment: anything in the file gets shared with it.
# NOTE: 200 requests per min w/ Jina API key
if not os.environ.get('JINA_API_KEY'):
  os.environ['JINA_API_KEY'] = getpass('Enter your JINA API Key: ')

Enter your JINA API Key:  ········


In [7]:
def urljoin(*args):
  """
  Build a URL by joining the string form of every argument with "/".

  Trailing slashes are removed from each piece before joining; leading
  slashes are left untouched.
  """
  pieces = [str(part).rstrip('/') for part in args]
  return "/".join(pieces)

In [8]:
class MarkdownPipeline:
  """
  Scrapy item pipeline that stores every crawled page as a Markdown file.

  For each item, the page URL is requested through ``https://r.jina.ai/``
  and the returned text is written to ``./docs/<collection>/<url path>.md``.
  """

  # File name used when the URL path has no usable last segment (site root).
  DEFAULT_FILENAME = 'index'

  def create_directory_from_url_with_slug(self, url):
    """
    Map ``url`` to an output path (without extension) and create its folder.

    Args:
      url: Absolute URL of the crawled page.

    Returns:
      A path below ``./docs/<collection>`` that mirrors the URL path and
      ends with the slugified last path segment.
    """
    parsed_url = urlparse(url)
    path_segments = parsed_url.path.strip('/').split('/')
    directory_path = os.path.join('./docs/' + self.collection,
                                  *path_segments[:-1])
    os.makedirs(directory_path, exist_ok=True)
    # The site root has an empty path; without a fallback the file would be
    # written as a hidden file literally named ".md".
    filename = slugify(path_segments[-1]) or self.DEFAULT_FILENAME
    return os.path.join(directory_path, filename)

  def open_spider(self, spider):
    """
    Derive the collection folder name from the spider's domain and create it.

    ``spider.domain`` is title-cased and its dots are removed, e.g.
    ``docs.fastht.ml`` becomes ``DocsFasthtMl``.
    """
    self.collection = spider.domain.title().replace('.', '')
    os.makedirs(f'./docs/{self.collection}', exist_ok=True)

  async def process_item(self, item, spider):
    """
    Fetch the Markdown rendering of ``item['url']`` and write it to disk.

    Args:
      item: Mapping with a ``url`` key.
      spider: The running spider; its logger reports failed conversions.

    Returns:
      The unchanged ``item`` so later pipelines still receive it.
    """
    url = item.get('url')
    response = await treq.get('https://r.jina.ai/' + url,
                              headers={
                                  'Content-Type':
                                  'text/plain',
                                  "Authorization":
                                  f"Bearer {os.environ['JINA_API_KEY']}"
                              })
    content = await response.text()

    if response.code >= 400:
      # An error body (rate limit, auth failure, ...) must not be saved as
      # if it were the page's documentation.
      spider.logger.warning('Jina returned HTTP %s for %s; page not saved',
                            response.code, url)
      return item

    target = self.create_directory_from_url_with_slug(url)

    # Explicit encoding: the platform default may be unable to encode every
    # character of the page.
    with open(target + '.md', 'w', encoding='utf-8') as f:
      f.write(content)

    return item

  def close_spider(self, spider):
    """
    Close the HTTP client, if one was ever attached to the pipeline.

    Nothing in this class assigns ``self.client`` (requests go through the
    module-level ``treq.get``), so the attribute is looked up defensively
    instead of raising ``AttributeError`` at shutdown.
    """
    client = getattr(self, 'client', None)
    if client is not None:
      client.close()

In [9]:
class PagingIncremental(CrawlSpider):
  """
  Crawl spider that follows links within one host and yields each page URL.

  Every followed page produces an item of the form ``{"url": <page url>}``.
  """
  name = "docs"
  custom_settings = {
      # Minimum gap between requests. This takes over from the blocking
      # ``time.sleep(.1)`` that used to sit in the callback and stalled the
      # whole reactor on every response.
      'DOWNLOAD_DELAY': 0.1,
      'FEED_EXPORT_ENCODING': 'utf-8',
      'DEPTH_LIMIT': 0,
      'AUTOTHROTTLE_ENABLED': True,
      'AUTOTHROTTLE_START_DELAY': 1,
      'AUTOTHROTTLE_MAX_DELAY': 3,
      'AUTOTHROTTLE_TARGET_CONCURRENCY': 1,
  }
  # NOTE(review): Scrapy's documentation advises against using ``parse`` as a
  # rule callback in a CrawlSpider; the name is kept so the public method
  # stays unchanged -- confirm against the Scrapy version in use.
  rules = (Rule(LinkExtractor(allow=r""), callback='parse', follow=True), )

  def __init__(self, url, *args, **kwargs):
    """
    Configure the spider to start at ``url`` and stay on its host.

    Args:
      url: Absolute start URL; its host name becomes the only allowed domain.

    Raises:
      ValueError: If no host name can be extracted from ``url``.
    """
    super().__init__(*args, **kwargs)
    hostname = urlparse(url).hostname
    if not hostname:
      raise ValueError(f"Cannot determine a host name from URL: {url!r}")
    self.logger.info("Starting crawl at %s", url)

    self.domain = hostname
    labels = hostname.split('.')
    # Second label as before; single-label hosts (e.g. "localhost") fall
    # back to the only label instead of raising IndexError.
    self.domain_name = labels[1] if len(labels) > 1 else labels[0]
    # Visit all found sublinks, but only on the start URL's host.
    self.allowed_domains = [self.domain]

    self.start_urls = [url]

  def parse(self, response):
    """Yield one item carrying the URL of the crawled page."""
    yield {"url": response.url}


def process_docs(url):
  """
  Crawl ``url`` with the ``PagingIncremental`` spider and Markdown pipeline.

  Args:
    url: Start URL handed to the spider.
  """
  settings = {
      'USER_AGENT': 'Mozilla/5.0',
      'ITEM_PIPELINES': {
          '__main__.MarkdownPipeline': 1,
      },
  }
  crawler_process = CrawlerProcess(settings)
  crawler_process.crawl(PagingIncremental, url=url)
  crawler_process.start(stop_after_crawl=True)

In [10]:
# Start the crawl only when this notebook/script is the entry point.
if __name__ == '__main__':
  process_docs(url=DOCS_URL)

2024-08-30 22:03:17 [scrapy.utils.log] INFO: Scrapy 2.11.2 started (bot: scrapybot)
2024-08-30 22:03:17 [scrapy.utils.log] INFO: Versions: lxml 5.3.0.0, libxml2 2.12.7, cssselect 1.2.0, parsel 1.9.1, w3lib 2.2.1, Twisted 24.7.0, Python 3.11.9 | packaged by conda-forge | (main, Apr 19 2024, 18:36:13) [GCC 12.3.0], pyOpenSSL 24.2.1 (OpenSSL 3.3.1 4 Jun 2024), cryptography 43.0.0, Platform Linux-5.15.153.1-microsoft-standard-WSL2-x86_64-with-glibc2.35
2024-08-30 22:03:17 [scrapy.addons] INFO: Enabled addons:
[]


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
  return cls(crawler)

2024-08-30 22:03:17 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.epollreactor.EPollReactor
2024-08-30 22:03:17 [scrapy.extensions.telnet] INFO: Telnet Password: 8b16eb2e35f438ba
2024-08-30 22:03:18 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetCo

https://docs.fastht.ml/


2024-08-30 22:03:18 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://docs.fastht.ml/> (referer: None)
2024-08-30 22:03:18 [scrapy.downloadermiddlewares.offsite] DEBUG: Filtered offsite request to 'fastht.ml': <GET https://fastht.ml>
2024-08-30 22:03:18 [scrapy.core.engine] DEBUG: Signal handler scrapy.downloadermiddlewares.offsite.OffsiteMiddleware.request_scheduled dropped request <GET https://fastht.ml> before it reached the scheduler.
2024-08-30 22:03:18 [scrapy.downloadermiddlewares.offsite] DEBUG: Filtered offsite request to 'about.fastht.ml': <GET https://about.fastht.ml>
2024-08-30 22:03:18 [scrapy.core.engine] DEBUG: Signal handler scrapy.downloadermiddlewares.offsite.OffsiteMiddleware.request_scheduled dropped request <GET https://about.fastht.ml> before it reached the scheduler.
2024-08-30 22:03:18 [scrapy.downloadermiddlewares.offsite] DEBUG: Filtered offsite request to 'github.com': <GET https://github.com/answerdotai/fasthtml>
2024-08-30 22:03:18 [scrapy.core.engine]