# Deep Crawling

In [8]:
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from IPython.display import Markdown, display, HTML
import pprint

def printmd(content):
    """Display ``content`` wrapped in an IPython ``Markdown`` object."""
    rendered = Markdown(content)
    display(rendered)
    
def printhtml(content):
    """Display ``content`` wrapped in an IPython ``HTML`` object."""
    rendered = HTML(content)
    display(rendered)
    
def pretty_print(content):
    """Pretty-print ``content`` to stdout without sorting dictionary keys."""
    # Same call that pprint.pp makes: pprint with key sorting switched off.
    pprint.pprint(content, sort_dicts=False)
    
# Start URL handed to the crawler in the cells below.
sample_url = "https://loigiaihay.com/de-thi-vao-lop-6-mon-toan-c1387.html"


## Quick example

Here's a code snippet that implements a deep crawl using the `BestFirstCrawlingStrategy`, combined with a filter chain and a keyword relevance scorer:


In [9]:
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
from crawl4ai.deep_crawling import BestFirstCrawlingStrategy
from crawl4ai.deep_crawling.filters import (
    FilterChain,
    DomainFilter,
    URLPatternFilter,
    ContentTypeFilter
)
from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer

In [10]:
# Build the URL filters, the URL scorer and the run configuration for the deep crawl.

# Domain boundaries: only loigiaihay.com is allowed (no blocked_domains list is set).
domain_filter = DomainFilter(allowed_domains=["loigiaihay.com"])

# URL patterns to include
pattern_filter = URLPatternFilter(patterns=["*de-thi-vao-lop-6-mon-toan*"])

# Content type filtering
html_only_filter = ContentTypeFilter(allowed_types=["text/html"])

filter_chain = FilterChain([domain_filter, pattern_filter, html_only_filter])

# Relevance scorer, passed to the strategy as its url_scorer.
# NOTE(review): the recorded run prints "Score: 0.00" for every page, presumably
# because these accented, space-separated phrases never occur in the hyphenated
# ASCII URL slugs -- confirm how KeywordRelevanceScorer matches keywords.
keyword_scorer = KeywordRelevanceScorer(
    keywords=["đề thi vào lớp 6", "môn Toán"],
    weight=0.7,
)

# Crawl strategy: wires the filter chain and the scorer together.
best_first_strategy = BestFirstCrawlingStrategy(
    max_depth=2,
    include_external=False,
    filter_chain=filter_chain,
    url_scorer=keyword_scorer,
)

# Run configuration; stream=True pairs with the `async for` consumption of arun().
config = CrawlerRunConfig(
    deep_crawl_strategy=best_first_strategy,
    scraping_strategy=LXMLWebScrapingStrategy(),
    stream=True,
    verbose=False,
)

# Execute the crawl
results = []
async with AsyncWebCrawler() as crawler:
    async for result in await crawler.arun(sample_url, config=config):
        results.append(result)
        score = result.metadata.get("score", 0)
        depth = result.metadata.get("depth", 0)
        print(f"Depth: {depth} | Score: {score:.2f} | {result.url}")

# Analyze the results: page count, mean score, and a per-depth histogram.
print(f"Crawled {len(results)} high-value pages")
if results:
    total_score = sum(r.metadata.get('score', 0) for r in results)
    print(f"Average score: {total_score / len(results):.2f}")
else:
    # An empty crawl has no average; dividing by len(results) would raise
    # ZeroDivisionError and abort the rest of the cell.
    print("Average score: n/a (no pages crawled)")

# Group by depth (pages whose metadata has no "depth" key are counted under 0)
depth_counts = {}
for result in results:
    depth = result.metadata.get("depth", 0)
    depth_counts[depth] = depth_counts.get(depth, 0) + 1

print("Pages crawled by depth:")
for depth, count in sorted(depth_counts.items()):
    print(f"  Depth {depth}: {count} pages")


Depth: 0 | Score: 0.00 | https://loigiaihay.com/de-thi-vao-lop-6-mon-toan-c1387.html
Depth: 1 | Score: 0.00 | https://loigiaihay.com/de-thi-vao-lop-6-mon-toan-thanh-pho-hai-phong-e40295.html
Depth: 1 | Score: 0.00 | https://loigiaihay.com/de-thi-vao-lop-6-mon-toan-tinh-ninh-binh-e40029.html
Depth: 1 | Score: 0.00 | https://loigiaihay.com/de-thi-vao-lop-6-mon-toan-tinh-thanh-hoa-e40016.html
Depth: 1 | Score: 0.00 | https://loigiaihay.com/de-thi-vao-lop-6-mon-toan-thanh-pho-thu-duc-nam-2025-co-dap-an-a185401.html
Depth: 1 | Score: 0.00 | https://loigiaihay.com/de-thi-vao-lop-6-mon-toan-truong-amsterdam-2023-co-dap-an-a142600.html
Depth: 1 | Score: 0.00 | https://loigiaihay.com/de-thi-vao-lop-6-mon-toan-tinh-hung-yen-e40178.html
Depth: 1 | Score: 0.00 | https://loigiaihay.com/de-thi-vao-lop-6-mon-toan-truong-amsterdam-2020-co-dap-an-a135035.html
Depth: 1 | Score: 0.00 | https://loigiaihay.com/de-thi-vao-lop-6-mon-toan-tinh-phu-tho-e40035.html
Depth: 1 | Score: 0.00 | https://loigiaihay.co

In [11]:
# Number of distinct internal hrefs on `result` that contain the exam-page slug.
# `result` is whatever page the previous cell's loops left bound to that name.
len({
    link['href']
    for link in result.links['internal']
    if "de-thi-vao-lop-6-mon-toan" in link['href']
})

77