迭代器

In [1]:
# python 中一切皆对象，对象的抽象是类，对象的集合就是容器
# 列表，元组，字典，集合 所有的容器都是可迭代的（iterable）

# 迭代器（iterator）提供了一个next的方法。调用这个方法后，你要么得到这个容器的下一个对象，要不得到一个stopiteration的错误

# 可迭代对象，通过iter()函数返回一个迭代器，再通过next()函数就可以实现遍历。

In [10]:
# 判断一个对象是否可迭代
from collections import Iterable
params = [
    1234,
    '1234',
    [1, 2, 3, 4],
    set([1, 2, 3, 4]),
    {1:1, 2:2, 3:3, 4:4},
    (1, 2, 3, 4)
]
for param in params:
    print('{} is iterable? {}'.format(param, isinstance(param, Iterable)))

1234 is iterable? False
1234 is iterable? True
[1, 2, 3, 4] is iterable? True
{1, 2, 3, 4} is iterable? True
{1: 1, 2: 2, 3: 3, 4: 4} is iterable? True
(1, 2, 3, 4) is iterable? True


生成器

In [11]:
# 生成器是懒人版本的迭代器
# 再调用next()函数的时候，才会生成下一个变量

# 不会占用大量内存，只有再被使用的时候才会调用
# 初始化时，不需要运行一次生成操作，耗时短
# initailize a generator
(i for i in range(1000000))

<generator object <genexpr> at 0x000002625DFF17C8>

In [12]:
# 验证数学恒等式 (1 + 2 + 3 + ... + n)^2 = 1^3 + 2^3 + 3^3 +...+ n^3
def generator(k):
    i = 1
    while True:
        yield i ** k
        i += 1
gen_1 = generator(1)
gen_3 = generator(3)

print(gen_1)
print(gen_3)

def get_sum(n):
    sum_1, sum_3 = 0, 0
    for i in range(n):
        next_1 = next(gen_1)
        next_3 = next(gen_3)
        print('next_1 = {}, next_3 = {}'.format(next_1, next_3))
        sum_1 += next_1
        sum_3 += next_3
    print(sum_1 * sum_1, sum_3) 

get_sum(8)

<generator object generator at 0x000002625E148848>
<generator object generator at 0x000002625E58B2C8>
next_1 = 1, next_3 = 1
next_1 = 2, next_3 = 8
next_1 = 3, next_3 = 27
next_1 = 4, next_3 = 64
next_1 = 5, next_3 = 125
next_1 = 6, next_3 = 216
next_1 = 7, next_3 = 343
next_1 = 8, next_3 = 512
1296 1296


In [14]:
def index_generator(L, target):
    for i, num in enumerate(L):
        if num == target:
            yield i

print(list(index_generator([1, 6, 2, 4, 5, 2, 8, 6, 3, 2], 2)))

[2, 5, 9]


In [18]:
# 给定两个序列，判定第一个是不是第二个的子序列
def is_subsequence(a, b):
    b = iter(b)
    return all(i in b for i in a)

print(is_subsequence([1, 3, 5], [1, 2, 3, 4, 5]))
print(is_subsequence([1, 4, 3], [1, 2, 3, 4, 5]))

True
False


In [19]:
b = (i for i in range(5))
print(2 in b)
print(4 in b)
print(3 in b)

True
True
False


In [23]:
# all() 判断一个迭代器的元素是否全部为True
print(all([True, False, True]))
print(all([True, True, True]))

False
True


协程：实现并发的一种方式

In [24]:
# 爬虫例子

# 顺序执行 每个页面分别用了1秒到4秒，一共用了10秒
import time

def crawl_page(url):
    print('crawing {}'.format(url))
    sleep_time = int(url.split('_')[-1])
    time.sleep(sleep_time)
    print('OK {}'.format(url))

def main(urls):
    for url in urls:
        crawl_page(url)

%time main(['url_1', 'url_2', 'url_3', 'url_4'])

crawing url_1
OK url_1
crawing url_2
OK url_2
crawing url_3
OK url_3
crawing url_4
OK url_4
Wall time: 10 s


In [32]:
# 使用协程
import asyncio

async def crawl_page_async(url):
    print('crawing {}'.format(url))
    sleep_time = int(url.split('_')[-1])
    await asyncio.sleep(sleep_time)
    print('OK {}'.format(url))

async def main_2(urls):
    for url in urls:
        await crawl_page_async(url)

%time main_2(['url_1', 'url_2', 'url_3', 'url_4'])

Wall time: 0 ns


<coroutine object main at 0x000002625EF5F5C8>

In [35]:
# 协程的重要概念 Task
import asyncio

async def crawl_page_async_3(url):
    print('crawing {}'.format(url))
    sleep_time = int(url.split('_')[-1])
    await asyncio.sleep(sleep_time)
    print('OK {}'.format(url))

async def main_3(urls):
    tasks = [asyncio.create_task(crawl_page_async_3(url)) for url in urls]
    for task in tasks:
        await task

%time main_3(['url_1', 'url_2', 'url_3', 'url_4'])

Wall time: 0 ns


<coroutine object main_3 at 0x000002625BCB9DC8>

In [36]:
# task 的另一种做法
# *tasks 解包列表，将列表变成了函数的参数
import asyncio

async def main_4(urls):
    tasks = [asyncio.create_task(crawl_page_async(url)) for url in urls]
    await asyncio.gather(*tasks)

%time main_4(['url_1', 'url_2', 'url_3', 'url_4'])

Wall time: 0 ns


<coroutine object main_4 at 0x000002625E1CC2C8>

In [37]:
# 用协程实现经典的生产者消费者模型
import asyncio
import random

async def consumer(queue, id):
    while True:
        val = await queue.get()
        print('{} get a val: {}'.format(id, val))
        await asyncio.sleep(1)

async def producer(queue, id):
    for i in range(5):
        val = random.randint(1, 10)
        await queue.put(val)
        print('{} put a val: {}'.format(id, val))
        await asyncio.sleep(1)

async def main():
    queue = asyncio.Queue()

    consumer_1 = asyncio.create_task(consumer(queue, 'consumer_1'))
    consumer_2 = asyncio.create_task(consumer(queue, 'consumer_2'))

    producer_1 = asyncio.create_task(producer(queue, 'producer_1'))
    producer_2 = asyncio.create_task(producer(queue, 'producer_2'))

    await asyncio.sleep(10)
    consumer_1.cancel()
    consumer_2.cancel()

    await asyncio.gather(consumer_1, consumer_2, producer_1, producer_2)

%time main()

Wall time: 0 ns


<coroutine object main at 0x000002625F4C5348>

In [1]:
# 并发：通过线程和任务之间互相切换的方式实现，但同一时刻，只允许有一个线程或任务执行。
# 通常应用于I/O操作频繁的场景，比如从网上下载多个文件，I/O操作的时间可能会比CPU运行处理的时间长的多

# 并行：多个进程完全同步同时的执行
# 更多应用于CPU heavy的场景，比如MapReduce中的并行计算，为了加快运行速度，一般会用多台机器、多个处理器来完成

# Python 中之所以同一时刻只允许一个线程运行，其实是由于全局解释器锁的存在。但是对 I/O 操作而言，当其被 block 的时候，全局解释器锁便会被释放，使其他线程继续执行。

In [4]:
import concurrent.futures
import requests
import threading
import time

def download_one(url):
    resp = requests.get(url)
    print('Read {} from {}'.format(len(resp.content), url))

def download_all(sites):
    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        executor.map(download_one, sites)

def main():
    sites = [
        'https://en.wikipedia.org/wiki/Portal:Arts',
        'https://en.wikipedia.org/wiki/Portal:Arts',
        'https://en.wikipedia.org/wiki/Portal:History',
        'https://en.wikipedia.org/wiki/Portal:Society',
        'https://en.wikipedia.org/wiki/Portal:Biography',
        'https://en.wikipedia.org/wiki/Portal:Mathematics',
        'https://en.wikipedia.org/wiki/Portal:Technology',
        'https://en.wikipedia.org/wiki/Portal:Geography',
        'https://en.wikipedia.org/wiki/Portal:Science',
        'https://en.wikipedia.org/wiki/Computer_science',
        'https://en.wikipedia.org/wiki/Python_(programming_language)',
        'https://en.wikipedia.org/wiki/Java_(programming_language)',
        'https://en.wikipedia.org/wiki/PHP',
        'https://en.wikipedia.org/wiki/Node.js',
        'https://en.wikipedia.org/wiki/The_C_Programming_Language',
        'https://en.wikipedia.org/wiki/Go_(programming_language)'
    ]
    start_time = time.perf_counter()
    download_all(sites)
    end_time = time.perf_counter()
    print('Download {} sites in {} seconds'.format(len(sites), end_time -           start_time))

main()


Download 16 sites in 84.19856350000009 seconds
