# 推导与生成
## 1 列表推导
- 列表推导式代码简洁（适用于没有太多嵌套的简单结构），用列表推导式取代map与filter
- 控制列表推导的逻辑表达式不要超过两个(列表推导不要嵌套超过两层)，否则使用for更直观
- 赋值表达式消除推导中的重复代码（其实赋值表达式作用就是用在赋值+if的场景下的）
注意：[]是列表推导式，()是生成器推导式，不是set
- 不要让函数直接返回列表，应该让它逐个生成
## 2 迭代器
- 谨慎的去迭代函数收到的参数
    - 迭代器或生成器在抛出StopIteration时，不会在第二次迭代，也不会报错，容易查不到错误来源
    - 使用类包裹，并实现__iter__方法去避免iter被多次迭代

## 3 生成器
- 生成器改写数据量大的列表推导
使用(),即生成式推导式来改写数据量大的列表推导式，特别是中间计算结果
- yield from连接多个生成器（生成器的传递，yield from比传统的for来传递快）
后面会发现async/await更好用，哈哈哈
- send可以向生成器发送数据，但是别这么做，不如用迭代器向生成器发数据，更不如使用async函数
- 不要通过throw变换生成器的状态，也就是不要通过throw来传达信息
- 考虑使用itertools来拼装迭代器与生成器
内置还是香的
- 

In [5]:
# 用列表推导式取代map与filter

# 原始for/in结构
a = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
squares = []
for x in a:
    squares.append(x**2)
print(squares)

# 使用列表推导，更简洁了
squares = [x**2 for x in a]  # List comprehension
print(squares)

# map配合lambda
alt = map(lambda x: x ** 2, a)
# print(list(alt))
assert list(alt) == squares, f'{alt} {squares}'

[1, 4, 9, 16, 25, 36, 49, 64, 81, 100]
[1, 4, 9, 16, 25, 36, 49, 64, 81, 100]
[4, 16, 36, 64, 100]


In [6]:
# 可以对比一下，列表推导式最简洁
even_squares = [x**2 for x in a if x % 2 == 0]
print(even_squares)

alt = map(lambda x: x**2, filter(lambda x: x % 2 == 0, a))
assert even_squares == list(alt)

even_squares_dict = {x: x**2 for x in a if x % 2 == 0}
threes_cubed_set = {x**3 for x in a if x % 3 == 0}
print(even_squares_dict)
print(threes_cubed_set)

alt_dict = dict(map(lambda x: (x, x**2),
				filter(lambda x: x % 2 == 0, a)))
alt_set = set(map(lambda x: x**3,
	          filter(lambda x: x % 3 == 0, a)))
assert even_squares_dict == alt_dict
assert threes_cubed_set == alt_set

[4, 16, 36, 64, 100]
{2: 4, 4: 16, 6: 36, 8: 64, 10: 100}
{216, 729, 27}


In [10]:
# 复杂列表推导，不如使用for简洁
matrix = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
flat = [x for row in matrix for x in row]
print(flat)

squared = [[x**2 for x in row] for row in matrix]
print(squared)

# 当列表越来越复杂，推导的逻辑越来越复杂，建议放弃使用列表推导，因为代码可读性太差
my_lists = [
    [[1, 2, 3], [4, 5, 6]],
    [[7, 8, 9], [10, 11, 12]],
]
flat = [x for sublist1 in my_lists
        for sublist2 in sublist1
        for x in sublist2]
print(flat)

# 不如for来的直观
flat = []
for sublist1 in my_lists:
    for sublist2 in sublist1:
        flat.extend(sublist2)
print(flat)

[1, 2, 3, 4, 5, 6, 7, 8, 9]
[[1, 4, 9], [16, 25, 36], [49, 64, 81]]
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]


In [11]:
# 列表中两个if可以并列，和用and连接效果一样，但是最好不要省略
a = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
b = [x for x in a if x > 4 if x % 2 == 0]
c = [x for x in a if x > 4 and x % 2 == 0]
print(b)
print(c)
assert b and c
assert b == c

# 列表推导中每个for可以带各自的if
matrix = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
filtered = [[x for x in row if x % 3 == 0]
            for row in matrix if sum(row) >= 10]
print(filtered)

[6, 8, 10]
[6, 8, 10]
[[6], [9]]


In [18]:
# 赋值表达式消除推导中的重复代码
stock = {
    'nails': 125,
    'screws': 35,
    'wingnuts': 8,
    'washers': 24,
}

order = ['screws', 'wingnuts', 'clips']

def get_batches(count, size):
    return count // size

# 常规写法
result = {}
for name in order:
  count = stock.get(name, 0)
  batches = get_batches(count, 8)
  if batches:
    result[name] = batches

print(result)

# 使用带赋值表达式的列表推导（列表推导不光能推导列表，字典、集合都可以）
# 但是还是不够简洁，get_batches算了两次
found = {name: get_batches(stock.get(name, 0), 8)
         for name in order
         if get_batches(stock.get(name, 0), 8)}
print(found)

# 在if中定义一个变量类接收get_batches函数的返回值
found = {name: batches for name in order
         if (batches := get_batches(stock.get(name, 0), 8))}
assert found == {'screws': 4, 'wingnuts': 1}, found

{'screws': 4, 'wingnuts': 1}
{'screws': 4, 'wingnuts': 1}


In [17]:
# 一个错误示范，新定义的接收函数返回值的值需要谨慎考虑逻辑先后关系
import logging
try:
    result = {name: (tenth := count // 10)
              for name, count in stock.items() if tenth > 0}
except:
    logging.exception('Expected')
else:
    assert False

AssertionError: 

In [22]:
# 将赋值表达式写在先计算的条件语句中
result = {name: tenth for name, count in stock.items()
          if (tenth := count // 10) > 0}
print(result)

# 如果推导逻辑不带条件，而表示新值又使用了赋值表达式，则赋值表达式左边的值会泄露到推导式外面的作用域里
# 所以建议赋值表达式只用在推导式的条件中
half = [(last := count // 2) for count in stock.values()]
print(f'Last item of {half} is {last}')

# 普通的for循环也是一样会泄露
for count in stock.values():  # Leaks loop variable
    pass
print(f'Last item of {list(stock.values())} is {count}')

# 赋值表达式也可用来写生成器表达式，[]是列表推导式，()是生成器推导式
found = ((name, batches) for name in order
         if (batches := get_batches(stock.get(name, 0), 8)))
print(next(found))
print(next(found))

{'nails': 12, 'screws': 3, 'washers': 2}
Last item of [62, 17, 4, 12] is 12
Last item of [125, 35, 8, 24] is 24
('screws', 4)
('wingnuts', 1)


In [35]:
# 避免永久生成临时文件，运行完直接清理临时文件
# Write all output to a temporary directory
import atexit
import gc
import io
import os
import tempfile

TEST_DIR = tempfile.TemporaryDirectory()
atexit.register(TEST_DIR.cleanup)

# Make sure Windows processes exit cleanly
OLD_CWD = os.getcwd()
atexit.register(lambda: os.chdir(OLD_CWD))
os.chdir(TEST_DIR.name)

def close_open_files():
    everything = gc.get_objects()
    for obj in everything:
        if isinstance(obj, io.IOBase):
            obj.close()

atexit.register(close_open_files)

<function __main__.close_open_files()>

In [26]:
# 函数直接返回结果列表
# 有两个缺点
# - 需要内部关注list细节
# - 如果结果很多，或者是长度任意的输入（流式输入），则内存容易耗尽
def index_words(text):
    result = []
    if text:
        result.append(0)
    for index, letter in enumerate(text):
        if letter == ' ':
            result.append(index + 1)
    return result

address = 'Four score and seven years ago...'
address = 'Four score and seven years ago our fathers brought forth on this continent a new nation, conceived in liberty, and dedicated to the proposition that all men are created equal.'
result = index_words(address)
print(result[:10])

# 使用yield生成器按需生成
def index_words_iter(text):
    if text:
        yield 0
    for index, letter in enumerate(text):
        if letter == ' ':
            yield index + 1

it = index_words_iter(address)
print(next(it))
print(next(it))


# 解决问题二：不用内次都读锁数据，也不用每次计算所有输出
def index_file(handle):
    offset = 0
    for line in handle:
        if line:
            yield offset
        for letter in line:
            offset += 1
            if letter == ' ':
                yield offset

address_lines = """Four score and seven years
ago our fathers brought forth on this
continent a new nation, conceived in liberty,
and dedicated to the proposition that all men
are created equal."""

with open('address.txt', 'w') as f:
    f.write(address_lines)

import itertools
with open('address.txt', 'r') as f:
    it = index_file(f)
    results = itertools.islice(it, 0, 10)
    print(list(results))

[0, 5, 11, 15, 21, 27, 31, 35, 43, 51]
0
5
[0, 5, 11, 15, 21, 27, 31, 35, 43, 51]


In [33]:
# 迭代收到的参数要谨慎

def normalize(numbers):
    total = sum(numbers)
    result = []
    for value in numbers:
        percent = 100 * value / total
        result.append(percent)
    return result

# 正常列表没有问题
visits = [15, 35, 80]
percentages = normalize(visits)
print(percentages)
assert sum(percentages) == 100.0

path = 'my_numbers.txt'
with open(path, 'w') as f:
    for i in (15, 35, 80):
        f.write('%d\n' % i)

def read_visits(data_path):
    with open(data_path) as f:
        for line in f:
            yield int(line)

# 读进来的数据是迭代器，用了sum会导致迭代器迭代完毕，后一个for也是迭代过程，不起作用了
it = read_visits('my_numbers.txt')
percentages = normalize(it)
print(percentages) # 返回[]，因为

# 使用类包装一下，重写__iter__方法
class ReadVisits:
    def __init__(self, data_path):
        self.data_path = data_path

    def __iter__(self):
        with open(self.data_path) as f:
            for line in f:
                yield int(line)

visits = ReadVisits(path)
percentages = normalize(visits)
print(percentages)
assert sum(percentages) == 100.0

# 调用的函数最好检查一下是不是普通的迭代器，如果是则抛异常
# 利用特性：普通迭代器使用iter包裹，会返回本身
def normalize_defensive(numbers):
    if iter(numbers) is numbers:  # An iterator -- bad!
        raise TypeError('Must supply a container')
    total = sum(numbers)
    result = []
    for value in numbers:
        percent = 100 * value / total
        result.append(percent)
    return result

visits = [15, 35, 80]
normalize_defensive(visits)  # No error

it = iter(visits)
try:
    normalize_defensive(it)
except TypeError:
    pass
else:
    assert False

# 使用collections.abc中的Iterator也可以实现上面的功能
from collections.abc import Iterator 

def normalize_defensive(numbers):
    if isinstance(numbers, Iterator):  # Another way to check
        raise TypeError('Must supply a container')
    total = sum(numbers)
    result = []
    for value in numbers:
        percent = 100 * value / total
        result.append(percent)
    return result

visits = [15, 35, 80]
normalize_defensive(visits)  # No error

it = iter(visits)
try:
    normalize_defensive(it)
except TypeError:
    pass
else:
    assert False

visits = [15, 35, 80]
percentages = normalize_defensive(visits)
assert sum(percentages) == 100.0

visits = ReadVisits(path)
percentages = normalize_defensive(visits)
assert sum(percentages) == 100.0

[11.538461538461538, 26.923076923076923, 61.53846153846154]
[]
[11.538461538461538, 26.923076923076923, 61.53846153846154]


In [34]:
# 普通迭代器就会抛异常 
try:
    visits = [15, 35, 80]
    it = iter(visits)
    normalize_defensive(it)
except:
    logging.exception('Expected')
else:
    assert False

ERROR:root:Expected
Traceback (most recent call last):
  File "C:\Users\SAT\AppData\Local\Temp\ipykernel_19684\2871425507.py", line 5, in <module>
    normalize_defensive(it)
  File "C:\Users\SAT\AppData\Local\Temp\ipykernel_19684\3947129963.py", line 75, in normalize_defensive
    raise TypeError('Must supply a container')
TypeError: Must supply a container


In [36]:
# 生成器改写数据量大的列表推导
import random

with open('my_file.txt', 'w') as f:
    for _ in range(10):
        f.write('a' * random.randint(0, 100))
        f.write('\n')

value = [len(x) for x in open('my_file.txt')]
print(value)

it = (len(x) for x in open('my_file.txt'))
print(it)
print(next(it))
print(next(it))

# 推荐使用，特别是中间变量
roots = ((x, x**0.5) for x in it) # 生成器推导式
print(next(roots))

[60, 37, 20, 18, 88, 87, 47, 76, 13, 42]
<generator object <genexpr> at 0x000002580EB20BA0>
60
37
(20, 4.47213595499958)


In [38]:
# yield from连接多个生成器

# yield from比for快
import timeit

def child():
    for i in range(1_000_000):
        yield i

# 正常传递生成器
def slow():
    for i in child():
        yield i

# 使用yield from传递生成器
def fast():
    yield from child()

baseline = timeit.timeit(
    stmt='for _ in slow(): pass',
    globals=globals(),
    number=50)
print(f'Manual nesting {baseline:.2f}s')

comparison = timeit.timeit(
    stmt='for _ in fast(): pass',
    globals=globals(),
    number=50)
print(f'Composed nesting {comparison:.2f}s')

reduction = -(comparison - baseline) / baseline
print(f'{reduction:.1%} less time')

Manual nesting 4.82s
Composed nesting 4.97s
-3.1% less time


In [1]:
# itertools
import itertools

it = itertools.chain([1, 2, 3], [4, 5, 6])
print(list(it))

it = itertools.repeat('hello', 3)
print(list(it))

it = itertools.cycle([1, 2])
result = [next(it) for _ in range (10)]
print(result)

it1, it2, it3 = itertools.tee(['first', 'second'], 3)
print(list(it1))
print(list(it2))
print(list(it3))

keys = ['one', 'two', 'three']
values = [1, 2]
it = itertools.zip_longest(keys, values, fillvalue='nope')
longest = list(it)
print('zip_longest:', longest)

values = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
first_five = itertools.islice(values, 5)
print('First five: ', list(first_five))
middle_odds = itertools.islice(values, 2, 8, 2)
print('Middle odds:', list(middle_odds))

values = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
less_than_seven = lambda x: x < 7
it = itertools.takewhile(less_than_seven, values)
print(list(it))

values = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
less_than_seven = lambda x: x < 7
it = itertools.dropwhile(less_than_seven, values)
print(list(it))


values = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
evens = lambda x: x % 2 == 0
filter_false_result = itertools.filterfalse(evens, values)
print('Filter false:', list(filter_false_result))

values = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
sum_reduce = itertools.accumulate(values)
print('Sum:   ', list(sum_reduce))
def sum_modulo_20(first, second):
    output = first + second
    return output % 20

modulo_reduce = itertools.accumulate(values, sum_modulo_20)
print('Modulo:', list(modulo_reduce))

single = itertools.product([1, 2], repeat=2)
print('Single:  ', list(single))

multiple = itertools.product([1, 2], ['a', 'b'])
print('Multiple:', list(multiple))

from pprint import pprint
it = itertools.permutations([1, 2, 3, 4], 2)
# original_print = print
# print = pprint
print(list(it))
# print = original_print

it = itertools.combinations([1, 2, 3, 4], 2)
print(list(it))

it = itertools.combinations_with_replacement([1, 2, 3, 4], 2)
# original_print = print
# print = pprint
print(list(it))
# print = original_print

[1, 2, 3, 4, 5, 6]
['hello', 'hello', 'hello']
[1, 2, 1, 2, 1, 2, 1, 2, 1, 2]
['first', 'second']
['first', 'second']
['first', 'second']
zip_longest: [('one', 1), ('two', 2), ('three', 'nope')]
First five:  [1, 2, 3, 4, 5]
Middle odds: [3, 5, 7]
[1, 2, 3, 4, 5, 6]
[7, 8, 9, 10]
Filter false: [1, 3, 5, 7, 9]
Sum:    [1, 3, 6, 10, 15, 21, 28, 36, 45, 55]
Modulo: [1, 3, 6, 10, 15, 1, 8, 16, 5, 15]
Single:   [(1, 1), (1, 2), (2, 1), (2, 2)]
Multiple: [(1, 'a'), (1, 'b'), (2, 'a'), (2, 'b')]
[(1, 2), (1, 3), (1, 4), (2, 1), (2, 3), (2, 4), (3, 1), (3, 2), (3, 4), (4, 1), (4, 2), (4, 3)]
[(1, 2), (1, 3), (1, 4), (2, 3), (2, 4), (3, 4)]
[(1, 1), (1, 2), (1, 3), (1, 4), (2, 2), (2, 3), (2, 4), (3, 3), (3, 4), (4, 4)]
