In [2]:
import subprocess
import sys
import json
from pathlib import Path

# Source of the standalone crawler script. It is written to disk and run in a
# separate interpreter; its contract with this cell is that the LAST line it
# prints to stdout is a single-line JSON object shaped like
#   {"data": [{"text": ..., "meta": {"url": ..., "meta": {"title": ...}}}]}
# Failed crawls are reported on stderr so stdout stays logs + JSON only.
# NOTE: everything between the triple quotes is a runtime string. It must not
# contain a triple-double-quote, and the newline escape is doubled so that
# the generated script (not this cell) interprets it.
script_content = """
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
import asyncio
import json
import sys

urls_to_crawl = [
    "https://docs.llamaindex.ai/en/stable/understanding/",
]

config = CrawlerRunConfig(
    cache_mode=CacheMode.BYPASS,
    page_timeout=80000,
    word_count_threshold=50,
)


def extract_title(result, text):
    # Prefer the page metadata; metadata may be missing entirely, so guard it.
    title = (result.metadata or {}).get("title") or ""
    if title:
        return title
    # Fall back to the first markdown heading in the crawled text.
    for line in text.split("\\n"):
        if line.startswith("#"):
            return line.strip("#").strip()
    return ""


async def crawl_website():
    data_res = {"data": []}

    async with AsyncWebCrawler(headless=True, verbose=True) as crawler:
        results = await crawler.arun_many(urls_to_crawl, config=config)

        for result in results:
            if not result.success:
                # Do not drop failures silently: an empty "data" list would
                # otherwise be indistinguishable from a site with no content.
                print(
                    f"Failed to crawl {result.url}: {result.error_message}",
                    file=sys.stderr,
                )
                continue

            text = result.markdown.raw_markdown if result.markdown else ""
            data_res["data"].append(
                {
                    "text": text,
                    "meta": {
                        "url": result.url,
                        "meta": {"title": extract_title(result, text)},
                    },
                }
            )

    return data_res

if __name__ == "__main__":
    data_res = asyncio.run(crawl_website())
    print(json.dumps(data_res))
"""

# Write the crawler script to a temporary file in the working directory.
# The encoding is pinned so the file content does not depend on the locale.
script_path = Path("temp_crawler.py")
script_path.write_text(script_content, encoding="utf-8")

# Run the script in a separate interpreter and capture everything it prints.
try:
    result = subprocess.run(
        # -X utf8 puts the child interpreter in UTF-8 mode, so the crawler's
        # log glyphs are emitted as UTF-8 regardless of the console code page.
        [sys.executable, "-X", "utf8", str(script_path)],
        capture_output=True,
        text=True,
        # Decode with the same encoding the child writes; without this the
        # locale encoding is used and non-ASCII log output turns to mojibake.
        encoding="utf-8",
        errors="replace",
        timeout=120,
    )

    print("STDOUT:")
    print(result.stdout)
    print("\nSTDERR:")
    print(result.stderr)
    print(f"\nReturn code: {result.returncode}")

    if result.returncode == 0 and result.stdout.strip():
        # The crawler logs verbosely to stdout and prints its JSON payload
        # last, so scan from the end and take the first line that parses.
        lines = result.stdout.strip().split("\n")
        for line in reversed(lines):
            try:
                parsed = json.loads(line)
            except json.JSONDecodeError:
                continue
            # The payload is a JSON object; ignore log lines that merely
            # happen to be valid JSON scalars (e.g. a bare number).
            if not isinstance(parsed, dict):
                continue
            data_res = parsed
            print("\nParsed data:")
            print(data_res)
            break
        else:
            # Reached only when no line yielded a JSON object.
            print("No JSON payload found in script output")
    else:
        print("Script failed or produced no output")
finally:
    # Clean up the temporary script even if the run timed out or raised.
    if script_path.exists():
        script_path.unlink()


STDOUT:
[INIT].... → Crawl4AI 0.7.4 
[FETCH]... ↓ https://docs.llamaindex.ai/en/stable/understanding/               
| ✓ | ⏱: 1.23s 
[SCRAPE].. ◆ https://docs.llamaindex.ai/en/stable/understanding/               
| ✓ | ⏱: 0.23s 
[COMPLETE] ● https://docs.llamaindex.ai/en/stable/understanding/               
| ✓ | ⏱: 1.46s 
{"data": [{"text": "[Skip to content](https://developers.llamaindex.ai/python/framework/understanding/#_top)\n[ ![](https://developers.llamaindex.ai/python/_astro/llamaindex-light.BJap_D_H.svg) ![](https://developers.llamaindex.ai/python/_astro/llamaindex-dark.BpGWuI4l.svg) LlamaIndex Python Documentation  ](https://developers.llamaindex.ai/)\nSearch ` `Ctrl``K` `\nCancel \nClear\n[TypeScript](https://developers.llamaindex.ai/typescript/framework/)\n[Twitter](https://x.com/llama_index)[LinkedIn](https://www.linkedin.com/company/llamaindex)[Bluesky](https://bsky.app/profile/llamaindex.bsky.social)[GitHub](https://github.com/run-llama/llama_index/)\