In [1]:
import re
from urllib.parse import urlparse

import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.exceptions import CloseSpider


class HtmlDocumentCrawler(scrapy.Spider):
    """Spider that writes the raw body of each visited page to a local file.

    Starting from ``seed_url`` it follows every ``<a href>`` link it finds,
    optionally bounded by a page budget (``max_pages``) and a link depth
    (``max_depth``). Files are written to the current working directory.
    """

    name = 'html_document_crawler'

    def __init__(self, seed_url=None, max_pages=None, max_depth=None, *args, **kwargs):
        """Configure the crawl.

        Args:
            seed_url: URL the crawl starts from; no requests are made if falsy.
            max_pages: Maximum number of pages to save. Any falsy value
                (``None``, ``0``, ``''``) means "no limit".
            max_depth: Maximum number of link hops from the seed. Any falsy
                value (``None``, ``0``, ``''``) means "no limit".
        """
        super().__init__(*args, **kwargs)
        self.start_urls = [seed_url] if seed_url else []
        # Limits are passed through int() so string values are accepted too.
        self.max_pages = int(max_pages) if max_pages else None
        self.max_depth = int(max_depth) if max_depth else None
        self.pages_crawled = 0

    def start_requests(self):
        """Yield one depth-0 request per seed URL."""
        for url in self.start_urls:
            yield scrapy.Request(url, self.parse, meta={'depth': 0})

    @staticmethod
    def _filename_for(url):
        """Return a filesystem-safe ``page_<slug>.html`` name derived from *url*.

        The slug is built from host, path and query so that distinct pages map
        to distinct files; taking a single path segment made most pages of a
        site collide on the same name and overwrite each other.
        """
        parts = urlparse(url)
        raw = f'{parts.netloc}{parts.path}'
        if parts.query:
            raw = f'{raw}_{parts.query}'
        # Collapse anything that is not safe in a filename into underscores.
        slug = re.sub(r'[^A-Za-z0-9._-]+', '_', raw).strip('_') or 'index'
        # Keep names well below common filesystem length limits.
        return f'page_{slug[:150]}.html'

    def _page_budget_spent(self):
        """Return True once ``max_pages`` pages have been saved."""
        return self.max_pages is not None and self.pages_crawled >= self.max_pages

    def parse(self, response):
        """Save the response body to disk and follow the links it contains.

        Yields:
            Follow-up requests for every ``<a href>`` on the page, as long as
            the depth limit allows it.

        Raises:
            CloseSpider: when this page exhausts the ``max_pages`` budget.
        """
        # Responses for requests that were queued before the budget ran out
        # still arrive here; drop them so no more than max_pages are saved.
        if self._page_budget_spent():
            return

        with open(self._filename_for(response.url), 'wb') as f:
            f.write(response.body)
        self.pages_crawled += 1

        # Stop the whole crawl instead of merely not following links from this
        # page; otherwise the already-scheduled requests keep downloading.
        if self._page_budget_spent():
            raise CloseSpider('max_pages reached')

        depth = response.meta['depth']
        if self.max_depth is None or depth < self.max_depth:
            for link in response.css('a::attr(href)').getall():
                yield response.follow(link, callback=self.parse, meta={'depth': depth + 1})


# --- Crawl configuration: edit these values before running the cell ---
seed_url = "https://www.history.com"  # URL the crawl starts from
max_pages = 10  # maximum number of pages to crawl
max_depth = 1  # maximum link depth to follow from the seed

# Build the Scrapy process, overriding only the User-Agent header.
process = CrawlerProcess(
    settings={'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'},
)

# Schedule HtmlDocumentCrawler with the configuration above, then run it.
process.crawl(
    HtmlDocumentCrawler,
    seed_url=seed_url,
    max_pages=max_pages,
    max_depth=max_depth,
)
process.start()


2024-03-21 20:07:45 [scrapy.utils.log] INFO: Scrapy 2.6.2 started (bot: scrapybot)
2024-03-21 20:07:45 [scrapy.utils.log] INFO: Versions: lxml 4.9.1.0, libxml2 2.9.14, cssselect 1.1.0, parsel 1.6.0, w3lib 1.21.0, Twisted 22.2.0, Python 3.9.13 (main, Aug 25 2022, 23:51:50) [MSC v.1916 64 bit (AMD64)], pyOpenSSL 22.0.0 (OpenSSL 1.1.1q  5 Jul 2022), cryptography 37.0.1, Platform Windows-10-10.0.19045-SP0
2024-03-21 20:07:45 [scrapy.crawler] INFO: Overridden settings:
{'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'}
2024-03-21 20:07:45 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2024-03-21 20:07:45 [scrapy.extensions.telnet] INFO: Telnet Password: 0229f998230127f0
2024-03-21 20:07:45 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.logstats.LogStats']
2024-03-21 20:07:45 [scrapy.middleware] INFO: Enabled downloader middlewares:
