In [None]:
!pip install aiohttp beautifulsoup4 pandas plotly

In [None]:
import os

# Create the project directory layout; directories that already exist are left untouched.
folders = ['crawler', 'tests', 'tests/fixtures', 'outputs']
for folder in folders:
    os.makedirs(folder, exist_ok=True)

# Placeholder files that make up the project skeleton.
files = [
    'crawler/__init__.py',
    'crawler/main.py',
    'crawler/fetcher.py',
    'crawler/parser.py',
    'crawler/storage.py',
    'crawler/utils.py',
    'crawler/robots.py',
    'tests/__init__.py',
    'tests/test_parser.py',
    'requirements.txt',
    'README.md'
]

for file in files:
    # Only seed files that are missing: opening with 'w' unconditionally would
    # truncate real code every time this cell is re-run.
    if os.path.exists(file):
        continue
    with open(file, 'w', encoding='utf-8') as f:
        f.write('# ' + file + '\n')

In [None]:
# Import all dependencies used by the notebook
import asyncio
import aiohttp
import aiosqlite
import pandas as pd
import plotly.express as px
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import hashlib
import json
import csv
from datetime import datetime
import nest_asyncio

# Allow async code to run inside Jupyter
nest_asyncio.apply()

In [None]:
class AsyncCrawler:
    """Small asynchronous crawler for the toscrape.com practice sites.

    Attributes:
        site: Site key given by the caller; only "quotes" has a parser.
        base_url: Root URL of the site. Any key other than "quotes" maps to
            the books site.
        concurrency: Maximum number of pages requested at the same time.
        max_pages: Upper bound on the number of listing pages to request.
        seen_urls: URLs that have already been requested by this instance.
        data: Accumulated records produced by the parser.
    """

    def __init__(self, site, concurrency=5, max_pages=50):
        self.site = site
        self.base_url = "https://quotes.toscrape.com/" if site == "quotes" else "https://books.toscrape.com/"
        self.concurrency = concurrency
        self.max_pages = max_pages
        self.seen_urls = set()
        self.data = []

    async def fetch_url(self, url, session):
        """Download ``url`` with ``session`` and return the response body.

        Args:
            url: Absolute URL to request.
            session: Object exposing ``get(url)`` as an async context manager
                (an ``aiohttp.ClientSession`` in normal use).

        Returns:
            The response text, or ``None`` if the request failed or the server
            answered with an HTTP error status.
        """
        try:
            async with session.get(url) as response:
                # Treat 4xx/5xx as failures so error pages are never parsed as content
                response.raise_for_status()
                return await response.text()
        except Exception as e:
            print(f"Error fetching {url}: {e}")
            return None

    async def parse_quotes(self, html, url):
        """Extract quote records from one listing page.

        Args:
            html: Page markup, or ``None``/empty when the fetch failed.
            url: Address the markup came from; stored on every record.

        Returns:
            A list of dicts with keys ``text``, ``author``, ``tags`` and
            ``page_url``. Empty when there is no markup or no quotes.
        """
        if not html:
            # fetch_url returns None on failure; there is nothing to parse
            return []

        soup = BeautifulSoup(html, 'html.parser')
        quotes = []
        for quote in soup.select('.quote'):
            text_node = quote.select_one('.text')
            author_node = quote.select_one('.author')
            if text_node is None or author_node is None:
                # Incomplete markup: skip the entry instead of crashing on None
                continue
            quotes.append({
                'text': text_node.get_text(),
                'author': author_node.get_text(),
                'tags': [tag.get_text() for tag in quote.select('.tag')],
                'page_url': url
            })
        return quotes

    async def run(self):
        """Crawl the site's listing pages and collect the parsed records.

        Pages are requested in batches of ``concurrency``. The crawl stops
        after ``max_pages`` pages, or after the first batch containing a page
        that yields no records (end of pagination or a failed download).
        URLs already in ``seen_urls`` are not requested again.

        Returns:
            ``self.data``, extended with the records found during this call.
        """
        parsers = {'quotes': self.parse_quotes}
        parse = parsers.get(self.site)
        if parse is None:
            print(f"No parser available for site '{self.site}'; nothing crawled")
            return self.data

        # The quotes site exposes its listing as page/1/, page/2/, ...
        page_urls = [
            urljoin(self.base_url, f"page/{number}/")
            for number in range(1, self.max_pages + 1)
        ]
        pending = [page for page in page_urls if page not in self.seen_urls]
        batch_size = max(1, self.concurrency)

        async def crawl(page, session):
            html = await self.fetch_url(page, session)
            return await parse(html, page)

        async with aiohttp.ClientSession() as session:
            for start in range(0, len(pending), batch_size):
                batch = pending[start:start + batch_size]
                self.seen_urls.update(batch)
                # gather preserves input order, so records stay in page order
                results = await asyncio.gather(*(crawl(page, session) for page in batch))
                for records in results:
                    self.data.extend(records)
                if not all(results):
                    # An empty page means pagination ended (or a fetch failed)
                    break

        return self.data

In [None]:
# Run the crawler
async def main():
    """Crawl the quotes site, write the records to ``outputs/`` and return them."""
    records = await AsyncCrawler('quotes', concurrency=3, max_pages=10).run()

    # Persist the same records as CSV and as JSON Lines
    frame = pd.DataFrame(records)
    frame.to_csv('outputs/data.csv', index=False)
    frame.to_json('outputs/data.jsonl', orient='records', lines=True)

    record_count = len(records)
    print(f"爬取完成！共获取 {record_count} 条数据")

    return records

# Execute the crawl (uses top-level await in the notebook cell)
data = await main()

In [None]:
# Data analysis
# Naming the columns keeps the frame well-formed even when the crawl returned no records.
df = pd.DataFrame(data, columns=['text', 'author', 'tags', 'page_url'])

# Summary statistics
stats = {
    # Each row is one quote, so pages are counted from the distinct source URLs
    'total_pages': int(df['page_url'].nunique()),
    'total_quotes': len(df),
    'unique_authors': int(df['author'].nunique()),
    'total_tags': int(sum(len(tags) for tags in df['tags']))
}

# Visualization
# Build explicit columns so the axis labels do not depend on how the installed
# pandas version names the value_counts() result.
top_authors = (
    df['author']
    .value_counts()
    .head(10)
    .rename_axis('Author')
    .reset_index(name='Quote Count')
)
if top_authors.empty:
    print("No quotes collected; skipping the Top 10 Authors chart")
else:
    fig = px.bar(top_authors, x='Author', y='Quote Count', title='Top 10 Authors')
    fig.show()

# Save the statistics
with open('outputs/stats.json', 'w', encoding='utf-8') as f:
    json.dump(stats, f, indent=2)

In [None]:
# NOTE(review): `some_async_function` is not defined in the visible cells — presumably a
# placeholder for any coroutine function; as written this cell raises NameError. TODO confirm.

# Option 1: use await
result = await some_async_function()

# Option 2: use asyncio.run() (in a separate cell)
result = asyncio.run(some_async_function())

# Option 3: use IPython's %autoawait magic command
%autoawait asyncio