In [1]:
def list_range(start, stop, step=1):
    numbers = []
    while start < stop:
        numbers.append(start)
        start += step
    
    return numbers

In [2]:
def iterator_range(start, stop, step=1):
    while start < stop:
        yield start
        start += step

In [3]:
%timeit list_range(1,10000)

920 µs ± 41.1 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [4]:
%timeit iterator_range(1,10000)

190 ns ± 10.2 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


#### Result
TODO: There was a big difference in processing time between list and iterator.

<hr>

#### Problem
What are the differences between list and generator in terms of memory usage?

In [5]:
# del [list_of_numbers]

In [6]:
list_of_numbers = range(1, 10000000)

In [7]:
# list object
%memit len([n for n in list_of_numbers if n % 3 == 0])

peak memory: 159.46 MiB, increment: 112.68 MiB


In [8]:
#  generator object
%memit sum((1 for n in list_of_numbers if n % 3 == 0))

peak memory: 54.18 MiB, increment: 0.00 MiB


#### Result
The memory impact of the generator version is far less than that of the list comprehension. Please check the value of memory increment.

## Iterators for Infinite Series

In [9]:
def fibonacci():
    i, j = 0, 1
    while True:
        yield j
        i, j = j, i + j

#### Problem
- fibbonacci function
- the number is less than 5000
- tne number is odd

To find the number of integers that satisfy the above conditions, we compare the following three different functions.

In [10]:
def fibonacci_naive():
    i, j = 0, 1
    count = 0
    while j <= 5000:
        if j % 2:
            count += 1
        i, j = j, i + j
    return count

def fibonacci_transform():
    count = 0
    for f in fibonacci():
        if f > 5000:
            break
        count += 1
    return count

from itertools import islice
def fibonacci_succinct():
    is_odd = lambda x: x % 2
    first_5000 = islice(fibonacci(), 0, 5000)
    return sum((1 for elem in first_5000 if is_odd(elem)))

In [11]:
%timeit fibonacci_naive()
%memit fibonacci_naive()

1.77 µs ± 59.9 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)
peak memory: 54.23 MiB, increment: 0.01 MiB


In [12]:
%timeit fibonacci_transform()
%memit fibonacci_transform()

2.12 µs ± 123 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
peak memory: 54.24 MiB, increment: 0.00 MiB


In [13]:
%timeit fibonacci_succinct()
%memit fibonacci_succinct()

3.97 ms ± 170 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
peak memory: 54.24 MiB, increment: 0.00 MiB


#### Result
- All of these methods have similar runtime properties
- fibonacci_transform() is much more verbose than fibonacci_succinct(), but s it will be easy for another developer to debug and understand.
- One of good points in fibonacci_transform() can be separated into two phases: generating data and transforming data.

## Lazy Generator Evaluation

#### Example 5-2. Lazily reading data

In [14]:
from random import normalvariate, random
from itertools import count

# Function to generate a random number every second
def read_fake_data():
    for i in count():
        sigma = random() * 10
        yield (i, normalvariate(0, sigma))

#### Example 5-3. Grouping our data

In [15]:
from datetime import date
from itertools import groupby

# The output is a generator that produces tuples whose items are the key for the group by day
def day_grouper(iterable):
    key = lambda timestamp_value : date.fromtimestamp(timestamp_value[0])
    return groupby(iterable, key)

#### Example 5-4. Generator-based anomaly detection

In [16]:
import math

# anomaly detection
# When a value 6 sigma away from that day’s mean is anomaly.
def check_anomaly(day_grouped_data):
    # We find the mean, standard deviation and maximum values for the day.
    # Using a single pass mean/standard deviation algorithm allows us to only
    # read through the day's data once.
    (day, day_data) = day_grouped_data
    n = 0
    mean = 0
    M2 = 0
    max_value = 0

    for timestamp, value in day_data:
        n += 1
        delta = value - mean
        mean = mean + delta/n
        M2 += delta*(value - mean)
        max_value = max(max_value, value)
    
    variance = M2/(n - 1)
    standard_deviation = math.sqrt(variance)
    
    # Here is the actual check of whether that day's data is anomalous.  If it
    # is, we return the value of the day, otherwise we return false
    if max_value > mean + 6 * standard_deviation:
        return True
    return False

#### Example 5-5. Chaining together our generators

In [17]:
from itertools import filterfalse

data = read_fake_data()

data_day = day_grouper(data)
anomalous_dates = filter(check_anomaly, data_day)

first_anomalous_date, first_anomalous_data = next(anomalous_dates)
print("The first anomalous date is: ", first_anomalous_date)

ten_anomalous_date = islice(anomalous_dates, 0, 10)
print("The next ten anomalous date is: \n")
for anomalous_date in list(ten_anomalous_date):
    print(anomalous_date[0])

The first anomalous date is:  1970-01-02
The next ten anomalous date is: 

1970-01-03
1970-01-04
1970-01-05
1970-01-08
1970-01-09
1970-01-10
1970-01-12
1970-01-13
1970-01-15
1970-01-16


In [18]:
from datetime import datetime

def rolling_window_grouper(data, window_size=3600):
    window = tuple(islice(data, window_size))
    while True:
        current_datetime = datetime.fromtimestamp(window[0][0])
        yield (current_datetime, window)
        window = window[1:] + (next(data),)

In [19]:
data = read_fake_data()

data_day = rolling_window_grouper(data)
anomalous_dates = filter(check_anomaly, data_day)

first_anomalous_date, first_anomalous_data = next(anomalous_dates)
print("The first anomalous date is: ", first_anomalous_date)
second_anomalous_date, second_anomalous_data = next(anomalous_dates)
print("The second anomalous date is: ", second_anomalous_date)

The first anomalous date is:  1970-01-02 08:08:38
The second anomalous date is:  1970-01-02 08:08:41
