In [None]:
import os
import uuid
from urllib.parse import urlparse

import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.exceptions import CloseSpider
from scrapy.http import TextResponse

class Crawler(scrapy.Spider):
    """Spider that saves the raw body of every crawled page to disk.

    The crawl starts at ``seed_url``, is restricted to that URL's host, and
    writes each response body to ``save_dir`` under a random UUID filename
    until ``max_pages`` pages have been saved.

    Args:
        seed_url: URL the crawl starts from; its host is the only allowed
            domain.
        max_pages: Maximum number of pages to save (coerced with ``int``).
        max_depth: Maximum link depth to follow from the seed (coerced with
            ``int``). ``0`` means no depth limit, mirroring the semantics of
            Scrapy's ``DEPTH_LIMIT`` setting.
        save_dir: Directory the pages are written to; defaults to
            ``downloaded_pages``.
    """

    name = "web_crawler"

    def __init__(self, seed_url, max_pages, max_depth, save_dir=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.start_urls = [seed_url]

        # Scrapy's offsite filter ignores allowed_domains entries that carry
        # a port, so use the bare hostname; fall back to netloc when the URL
        # has no parseable hostname.
        seed_parts = urlparse(seed_url)
        self.allowed_domains = [seed_parts.hostname or seed_parts.netloc]

        self.max_pages = int(max_pages)
        # Scrapy reads ``custom_settings`` from the class before the spider
        # is instantiated, so assigning it on the instance never applied
        # DEPTH_LIMIT. The limit is stored here and enforced in parse().
        self.max_depth = int(max_depth)
        self.pages_crawled = 0
        self.save_dir = save_dir or 'downloaded_pages'

    def parse(self, response):
        """Save ``response`` to disk and follow its in-scope links.

        Yields follow-up requests for http(s) links while the page is
        shallower than ``max_depth``.

        Raises:
            CloseSpider: once ``max_pages`` pages have been saved, so Scrapy
                stops downloading requests that were already scheduled.
        """
        # Responses that were in flight when the limit was hit are dropped.
        if self.pages_crawled >= self.max_pages:
            return

        os.makedirs(self.save_dir, exist_ok=True)

        filename = f"{uuid.uuid4()}.html"
        filepath = os.path.join(self.save_dir, filename)
        with open(filepath, "wb") as f:
            f.write(response.body)
        print(f"Saved {filename} to folder with {response.url}")
        self.pages_crawled += 1

        if self.pages_crawled >= self.max_pages:
            raise CloseSpider("max_pages_reached")

        # DepthMiddleware records each response's depth in meta (seed = 0);
        # links on a page at depth d lead to pages at depth d + 1.
        if self.max_depth and response.meta.get("depth", 0) >= self.max_depth:
            return

        # Binary responses (images, PDFs, ...) have no selectors to query.
        if not isinstance(response, TextResponse):
            return

        for href in response.css("a::attr(href)").getall():
            if not href or href.startswith("#"):
                continue
            try:
                scheme = urlparse(response.urljoin(href)).scheme
            except ValueError:
                # Malformed href (e.g. broken IPv6 literal); skip it rather
                # than abort the remaining links on this page.
                continue
            # Only http(s) targets are fetchable; mailto:, tel:,
            # javascript: etc. would make Request raise ValueError.
            if scheme in ("http", "https"):
                yield response.follow(href, self.parse)
                

# Arguments forwarded to Crawler.__init__ for this run.
crawl_options = {
    'seed_url': 'https://crawler-test.com',
    'max_pages': 10,
    'max_depth': 1,
    'save_dir': 'downloaded_pages',
}

crawl_process = CrawlerProcess()
crawl_process.crawl(Crawler, **crawl_options)
# start() runs the crawl; per the Scrapy docs it blocks until crawling ends.
crawl_process.start()
