# Simple Crawling

In [None]:
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BrowserConfig
from IPython.display import Markdown, display, HTML
import pprint

def printmd(content):
    """Show ``content`` in the cell output, rendered as Markdown."""
    markdown_obj = Markdown(content)
    display(markdown_obj)
    
def printhtml(content):
    """Show ``content`` in the cell output, rendered as HTML."""
    html_obj = HTML(content)
    display(html_obj)
    
def pretty_print(content):
    """Pretty-print ``content`` to stdout, keeping dict keys in insertion order."""
    # pprint.pp is pprint.pprint with sort_dicts defaulting to False.
    pprint.pprint(content, sort_dicts=False)
    
# Page crawled by the examples below (a loigiaihay.com exam article).
sample_url = (
    "https://loigiaihay.com/"
    "de-thi-vao-lop-6-mon-toan-truong-cau-giay-nam-2023-a142098.html"
)


## Basic Usage

In [4]:
browser_config = BrowserConfig()
run_config = CrawlerRunConfig()

async with AsyncWebCrawler(config=browser_config) as crawler:
    result = await crawler.arun(
        url=sample_url,
        config=run_config
    )


In [5]:
# Render the first 1000 characters of the generated Markdown as rich output.
printmd(result.markdown[:1000])

[ ![](https://loigiaihay.com/themes/style/images/logolgh.png) ](https://loigiaihay.com/ "loigiaihay.com")
  * [Lớp 12](https://loigiaihay.com/lop-12.html)
    * ![](https://loigiaihay.com/themes/style/images/i-van.png) Ngữ văn 12
      * [Soạn văn - Kết nối tri thức](https://loigiaihay.com/soan-van-12-ket-noi-tri-thuc-c1737.html "Soạn văn - Kết nối tri thức")
      * [Soạn văn - Cánh diều](https://loigiaihay.com/soan-van-12-canh-dieu-c1738.html "Soạn văn - Cánh diều")
      * [Soạn văn - Chân trời sáng tạo](https://loigiaihay.com/soan-van-12-chan-troi-sang-tao-c1739.html "Soạn văn - Chân trời sáng tạo")
      * [Tác giả tác phẩm](https://loigiaihay.com/tac-gia-tac-pham-lop-12-c1851.html "Tác giả tác phẩm")
      * [Tóm tắt, bố cục Văn - Kết nối tri thức](https://loigiaihay.com/tom-tat-bo-cuc-van-12-ket-noi-tri-thuc-c1848.html "Tóm tắt, bố cục Văn - Kết nối tri thức")
      * [Tóm tắt, bố cục Văn - Cánh diều](https://loigiaihay.com/tom-tat-bo-cuc-van-12-canh-dieu-c1849.html "Tóm tắt, bố

In [6]:
# Pretty-print the first 1000 characters of the HTML stored on the result.
pretty_print(result.html[:1000])

('<!DOCTYPE html><html lang="vi"><head><meta http-equiv="origin-trial" '
 'content="A7vZI3v+Gz7JfuRolKNM4Aff6zaGuT7X0mf3wtoZTnKv6497cVMnhy03KDqX7kBz/q/iidW7srW31oQbBt4VhgoAAACUeyJvcmlnaW4iOiJodHRwczovL3d3dy5nb29nbGUuY29tOjQ0MyIsImZlYXR1cmUiOiJEaXNhYmxlVGhpcmRQYXJ0eVN0b3JhZ2VQYXJ0aXRpb25pbmczIiwiZXhwaXJ5IjoxNzU3OTgwODAwLCJpc1N1YmRvbWFpbiI6dHJ1ZSwiaXNUaGlyZFBhcnR5Ijp0cnVlfQ==">\n'
 '    <meta http-equiv="Content-Type" content="text/html; charset=utf-8">\n'
 '<title>Đề thi vào lớp 6 môn Toán trường Cầu Giấy năm 2023 | Đề thi vào lớp 6 '
 'môn Toán</title>\n'
 '<meta name="description" content="Tổng hợp đề thi vào lớp 6 môn Toán trường '
 'Cầu Giấy các năm có đáp án và lời giải chi tiết giúp học sinh ôn thi lớp 5 '
 'lên 6 hiệu quả ">\n'
 '<meta name="keywords" content=",Đề thi vào lớp 6 môn Toán">\n'
 '<meta name="robots" content="index,follow"><meta property="og:site_name" '
 'content="Loigiaihay.com">\n'
 '<meta property="fb:app_id" content="1188256281282988">\n'
 '<meta property="og:ty

In [7]:
# Same preview for cleaned_html; in the output below most tags appear without attributes.
pretty_print(result.cleaned_html[:1000])

('\n'
 '\n'
 '\n'
 '<div>\n'
 '<div>\n'
 '<div>\n'
 '<div>\n'
 '<a href="/" title="loigiaihay.com">\n'
 '<img class="header-logo__img" height="58" '
 'src="/themes/style/images/logolgh.png" width="360"/>\n'
 '</a>\n'
 '</div>\n'
 '<nav>\n'
 '<ul>\n'
 '<li>\n'
 '<a href="/lop-12.html">Lớp 12</a>\n'
 '<ul>\n'
 '<div>\n'
 '<li>\n'
 '<a title="Ngữ văn 12"><img src="/themes/style/images/i-van.png"/> <span>Ngữ '
 'văn 12</span></a>\n'
 '<ul>\n'
 '<li><a href="/soan-van-12-ket-noi-tri-thuc-c1737.html" title="Soạn văn - Kết '
 'nối tri thức">Soạn văn - Kết nối tri thức</a></li><li><a '
 'href="/soan-van-12-canh-dieu-c1738.html" title="Soạn văn - Cánh diều">Soạn '
 'văn - Cánh diều</a></li><li><a '
 'href="/soan-van-12-chan-troi-sang-tao-c1739.html" title="Soạn văn - Chân '
 'trời sáng tạo">Soạn văn - Chân trời sáng tạo</a></li><li><a '
 'href="/tac-gia-tac-pham-lop-12-c1851.html" title="Tác giả tác phẩm">Tác giả '
 'tác phẩm</a></li><li><a '
 'href="/tom-tat-bo-cuc-van-12-ket-noi-tri-thuc-c184

In [8]:
# fit_markdown comes back empty for this run (see the '' below) — presumably because
# no content filter was configured for Markdown generation; TODO confirm in the crawl4ai docs.
pretty_print(result.markdown.fit_markdown)

''


In [9]:
# Pretty-printed (unrendered) view of the first 100 characters of the raw Markdown.
pretty_print(result.markdown.raw_markdown[:100])

('[ ![](https://loigiaihay.com/themes/style/images/logolgh.png) '
 '](https://loigiaihay.com/ "loigiaihay.')


In [10]:
# The first 100 characters of the raw Markdown, this time rendered as Markdown.
printmd(result.markdown.raw_markdown[:100])

[ ![](https://loigiaihay.com/themes/style/images/logolgh.png) ](https://loigiaihay.com/ "loigiaihay.

In [11]:
# Length of the raw Markdown produced by the default-config crawl.
len(result.markdown.raw_markdown)

181729

## Adding basic options

In [12]:
run_config = CrawlerRunConfig(
    word_count_threshold=10,        # Minimum words per content block
    exclude_external_links=True,    # Remove external links
    remove_overlay_elements=True,   # Remove popups/modals
    process_iframes=True           # Process iframe content
)

async with AsyncWebCrawler(config=browser_config) as crawler:
    result2 = await crawler.arun(
        url=sample_url,
        config=run_config
    )

In [13]:
# Raw Markdown length: default crawl (result) vs. crawl with the extra options (result2).
len(result.markdown.raw_markdown), len(result2.markdown.raw_markdown)

(181729, 178655)

In [14]:
# 'src' of the first image entry recorded by the default crawl.
result.media['images'][0]['src']

'https://img.loigiaihay.com/picture/2023/0728/20_15.png'

In [15]:
# Check that this image URL also appears verbatim in the raw Markdown.
result.media['images'][0]['src'] in result.markdown.raw_markdown

True

In [16]:
# First image 'src' from the option-enabled crawl, for comparison with the default crawl.
result2.media['images'][0]['src']

'https://img.loigiaihay.com/picture/2023/0728/20_15.png'

In [17]:
# Link categories exposed by each crawl result.
result.links.keys(), result2.links.keys()

(dict_keys(['internal', 'external']), dict_keys(['internal', 'external']))

In [18]:
# First entry filed under 'external' by the default crawl; in the output below its
# href is a javascript: pseudo-link with an empty base_domain, not a real off-site URL.
result.links['external'][0]

{'href': 'javascript:void(0)',
 'text': '×',
 'title': 'Xóa nội dung trong hộp tìm kiếm',
 'base_domain': ''}

In [19]:
# result2 was crawled with exclude_external_links=True, and its external list is empty.
result2.links['external']

[]

In [20]:
# Number of internal link entries collected by the option-enabled crawl.
len(result2.links['internal'])

939

In [25]:
# Uncomment to list every internal link of result2 (939 entries, so the output is long):
# [link for link in result2.links['internal']]

In [26]:
# Count the distinct internal hrefs whose URL contains the exam-page slug.
len({
    entry['href']
    for entry in result2.links['internal']
    if "de-thi-vao-lop-6-mon-toan" in entry['href']
})

80

## Complete example

In [21]:
import asyncio
from crawl4ai import AsyncWebCrawler
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig, CacheMode

async def main(url="https://example.com"):
    """Crawl ``url`` with content filtering enabled and print a short report.

    Args:
        url: Address of the page to crawl. Defaults to ``https://example.com``,
            so calling ``main()`` with no arguments behaves as before.

    Returns:
        None. On success, prints the first 500 characters of the Markdown,
        then the ``src`` of every image and the ``href`` of every internal
        link on the result. On failure, prints the result's error message.
    """
    browser_config = BrowserConfig(verbose=True)
    run_config = CrawlerRunConfig(
        # Content filtering
        word_count_threshold=10,
        excluded_tags=['form', 'header'],
        exclude_external_links=True,

        # Content processing
        process_iframes=True,
        remove_overlay_elements=True,

        # Cache control
        cache_mode=CacheMode.ENABLED  # Use cache if available
    )

    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(
            url=url,
            config=run_config
        )

        if result.success:
            # Print clean content
            print("Content:", result.markdown[:500])  # First 500 chars

            # Process images
            for image in result.media["images"]:
                print(f"Found image: {image['src']}")

            # Process links
            for link in result.links["internal"]:
                print(f"Internal link: {link['href']}")

        else:
            print(f"Crawl failed: {result.error_message}")


In [22]:
# Run the complete example with a top-level await in the notebook cell.
await main()

Content: # Example Domain
This domain is for use in illustrative examples in documents. You may use this domain in literature without prior coordination or asking for permission.

