In [None]:
import time

def crawl_page(url):
    print('crawling {}'.format(url))
    sleep_time = int(url.split('_')[-1])
    time.sleep(sleep_time)
    print('OK {}'.format(url))

def main(urls):
    for url in urls:
        crawl_page(url)

%time main(['url_1', 'url_2', 'url_3', 'url_4'])

In [None]:
import asyncio
import nest_asyncio

nest_asyncio.apply()

async def crawl_page(url):
    print('crawling {}'.format(url))
    sleep_time = int(url.split('_')[-1])
    await asyncio.sleep(sleep_time)
    print('OK {}'.format(url))

async def main(urls):
    for url in urls:
        await crawl_page(url)  # 仍然是同步执行

%time asyncio.run(main(['url_1', 'url_2', 'url_3', 'url_4']))

crawling url_1
OK url_1
crawling url_2
OK url_2
crawling url_3
OK url_3
crawling url_4
OK url_4
CPU times: total: 0 ns
Wall time: 10 s


In [7]:
print(crawl_page)
print(main)

<function crawl_page at 0x00000240C259C0E0>
<function main at 0x00000240C259CA40>


In [14]:
import asyncio

async def crawl_page(url):
    print('crawling {}'.format(url))
    sleep_time = int(url.split('_')[-1])
    await asyncio.sleep(sleep_time)
    print('OK {}'.format(url))

async def main(urls):
    tasks = [asyncio.create_task(crawl_page(url)) for url in urls]  # 通过任务实现异步并发，这里已经开始执行
    for task in tasks:
       await task  # 这里是为了等待任务结束

%time asyncio.run(main(['url_1', 'url_2', 'url_3', 'url_4']))

crawling url_1
crawling url_2
crawling url_3
crawling url_4
OK url_1
OK url_2
OK url_3
OK url_4
CPU times: total: 15.6 ms
Wall time: 4.02 s


In [17]:
import asyncio

async def crawl_page(url):
    print('crawling {}'.format(url))
    sleep_time = int(url.split('_')[-1])
    await asyncio.sleep(sleep_time)
    print('OK {}'.format(url))

async def main(urls):
    tasks = [asyncio.create_task(crawl_page(url)) for url in urls]
    await asyncio.gather(*tasks)  # 等待所有任务结束，*tasks 解包列表

%time asyncio.run(main(['url_1', 'url_2', 'url_3', 'url_4']))

crawling url_1
crawling url_2
crawling url_3
crawling url_4
OK url_1
OK url_2
OK url_3
OK url_4
CPU times: total: 0 ns
Wall time: 4.01 s


In [28]:
import asyncio

async def worker1():
    print('work1 start')
    await asyncio.sleep(1)
    print('work1 done')

async def worker2():
    print('work2 start')
    await asyncio.sleep(2)
    print('work2 done')

async def main():
    await worker1()
    print('awaited worker1')
    await worker2()
    print('awaited worker2')

%time asyncio.run(main())

work1 start
work1 done
awaited worker1
work2 start
work2 done
awaited worker2
CPU times: total: 0 ns
Wall time: 3.01 s


In [30]:
import asyncio

async def worker1():
    print('work1 start')
    await asyncio.sleep(1)
    print('work1 done')

async def worker2():
    print('work2 start')
    await asyncio.sleep(2)
    print('work2 done')

async def main():
    task1 = asyncio.create_task(worker1())
    task2 = asyncio.create_task(worker2())
    print('before awaited')
    # await asyncio.gather(task1, task2)
    # print('awaited tasks')
    await task1
    print('awaited task1')
    await task2
    print('awaited task2')

%time asyncio.run(main())

before awaited
work1 start
work2 start
work1 done
awaited task1
work2 done
awaited task2
CPU times: total: 0 ns
Wall time: 2.01 s


In [35]:
import asyncio

async def worker1():
    await asyncio.sleep(1)
    return 1

async def worker2():
    await asyncio.sleep(2)
    return 2 / 0

async def worker3():
    await asyncio.sleep(3)
    return 3

async def main():
    task1 = asyncio.create_task(worker1())
    task2 = asyncio.create_task(worker2())
    task3 = asyncio.create_task(worker3())

    await asyncio.sleep(2)
    task3.cancel()

    res = await asyncio.gather(task1, task2, task3, return_exceptions=True)
    print(res)

%time asyncio.run(main())

[1, ZeroDivisionError('division by zero'), CancelledError('')]
CPU times: total: 0 ns
Wall time: 2.01 s


In [12]:
import asyncio
import random

import nest_asyncio
nest_asyncio.apply()

async def consumer(queue, id):
    while True:
        val = await queue.get()
        print('{} get a val: {}'.format(id, val))
        await asyncio.sleep(1)

async def producer(queue, id):
    for i in range(5):
        val = random.randint(1, 10)
        await queue.put(val)
        print('{} put a val: {}'.format(id, val))
        await asyncio.sleep(1)

async def main():
    queue = asyncio.Queue()

    consumer1 = asyncio.create_task(consumer(queue, 'consumer1'))
    consumer2 = asyncio.create_task(consumer(queue, 'consumer2'))

    producer1 = asyncio.create_task(producer(queue, 'producer1'))
    producer2 = asyncio.create_task(producer(queue, 'producer2'))

    await asyncio.sleep(10)
    consumer1.cancel()
    consumer2.cancel()

    await asyncio.gather(consumer1, consumer2, producer1, producer2, return_exceptions=True)

%time asyncio.run(main())

producer1 put a val: 2
producer2 put a val: 7
consumer1 get a val: 2
consumer2 get a val: 7
producer1 put a val: 5
producer2 put a val: 10
consumer2 get a val: 5
consumer1 get a val: 10
producer1 put a val: 1
producer2 put a val: 7
consumer1 get a val: 1
consumer2 get a val: 7
producer1 put a val: 7
producer2 put a val: 10
consumer2 get a val: 7
consumer1 get a val: 10
producer1 put a val: 5
producer2 put a val: 7
consumer1 get a val: 5
consumer2 get a val: 7
CPU times: total: 0 ns
Wall time: 10 s


In [8]:
import requests
from bs4 import BeautifulSoup

def main():
    url = 'https://movie.douban.com/cinema/later/beijing/'
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
        "Accept-Language": "zh-CN,zh;q=0.9",
    }
    init_page = requests.get(url, headers=headers).content
    init_soup = BeautifulSoup(init_page, 'lxml')

    all_movies = init_soup.find('div', id='showing-soon')
    for each_movie in all_movies.find_all('div'):
        all_a_tag = each_movie.find_all('a')
        all_li_tag = each_movie.find_all('li')

        if len(all_a_tag) < 2:
            continue

        movie_name = all_a_tag[1].text
        url_to_fetch = all_a_tag[1]['href']
        movie_date = all_li_tag[0].text

        response_item = requests.get(url_to_fetch, headers=headers).content
        soup_item = BeautifulSoup(response_item, 'lxml')
        img_tag = soup_item.find('img')

        print('{} {} {}'.format(movie_name, movie_date, img_tag['src']))

%time main()

玛丽和麦克斯 08月08日 https://img9.doubanio.com/view/photo/s_ratio_poster/public/p2923412445.jpg
预告片 08月08日 https://img9.doubanio.com/img/trailer/small/2495163856.jpg
疯狂动物城 08月08日 https://img9.doubanio.com/view/photo/s_ratio_poster/public/p2923817996.jpg
预告片 08月08日 https://img9.doubanio.com/img/trailer/small/2462634814.jpg
东极岛 08月08日 https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2922389852.jpg
预告片 08月08日 https://img9.doubanio.com/img/trailer/small/2390325336.jpg
奇遇 08月08日 https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2920978322.jpg
预告片 08月08日 https://img9.doubanio.com/img/trailer/small/2390325336.jpg
不再退缩 08月10日 https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2923675268.jpg
预告片 08月10日 https://img9.doubanio.com/img/trailer/small/2461853865.jpg
绑架毛乎乎 08月11日 https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2923831879.jpg
预告片 08月11日 https://img9.doubanio.com/img/trailer/small/2484329694.jpg
妖怪森林 08月11日 https://img9.doubanio.com/view/photo/s_ratio_

In [14]:
import asyncio
import aiohttp

from bs4 import BeautifulSoup

import nest_asyncio
nest_asyncio.apply()

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
    "Accept-Language": "zh-CN,zh;q=0.9",
}

async def fetch_content(url):
    async with aiohttp.ClientSession(
        headers=headers, connector=aiohttp.TCPConnector(ssl=False)
    ) as session:
        async with session.get(url) as response:
            return await response.text()

async def main():
    url = 'https://movie.douban.com/cinema/later/beijing/'
    init_page = await fetch_content(url)
    init_soup = BeautifulSoup(init_page, 'lxml')

    movie_names, movie_dates, urls_to_fetch = [], [], []

    all_movies = init_soup.find('div', id='showing-soon')
    for each_movie in all_movies.find_all('div'):
        all_a_tag = each_movie.find_all('a')
        all_li_tag = each_movie.find_all('li')

        if len(all_a_tag) < 2:
            continue

        movie_names.append(all_a_tag[1].text)
        urls_to_fetch.append(all_a_tag[1]['href'])
        movie_dates.append(all_li_tag[0].text)
    
    tasks = [fetch_content(url) for url in urls_to_fetch]
    pages = await asyncio.gather(*tasks)

    for movie_name, movie_date, page in zip(movie_names, movie_dates, pages):
        soup_item = BeautifulSoup(page, 'lxml')
        img_tag = soup_item.find('img')

        print('{} {} {}'.format(movie_name, movie_date, img_tag['src']))

%time asyncio.run(main())
        

玛丽和麦克斯 08月08日 https://img9.doubanio.com/view/photo/s_ratio_poster/public/p2923412445.jpg
预告片 08月08日 https://img9.doubanio.com/img/trailer/small/2461853865.jpg
疯狂动物城 08月08日 https://img9.doubanio.com/view/photo/s_ratio_poster/public/p2923817996.jpg
预告片 08月08日 https://img3.doubanio.com/img/trailer/small/2463006147.jpg
东极岛 08月08日 https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2922389852.jpg
预告片 08月08日 https://img1.doubanio.com/img/trailer/small/2460441348.jpg
奇遇 08月08日 https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2920978322.jpg
预告片 08月08日 https://img3.doubanio.com/img/trailer/small/2462894412.jpg
不再退缩 08月10日 https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2923675268.jpg
预告片 08月10日 https://img3.doubanio.com/img/trailer/small/2460142687.jpg
绑架毛乎乎 08月11日 https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2923831879.jpg
预告片 08月11日 https://img1.doubanio.com/img/trailer/small/2461591569.jpg
妖怪森林 08月11日 https://img9.doubanio.com/view/photo/s_ratio_