In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Introduction to iterators
 - iterable is an object that can return an iterator, while an iterator is an object that keeps state and produces the next value when you call next() on it. 
 - We can iterate over a list using a for loop
 - iterable: list, strings, dictionaries, file connections
 - Iterators vs. Iterables
     - Iterable:
         - Examples: List, string, dictionaries, file connections
         - An object with an associated iter() method
         - Applying iter() to an iterable creates an iterator
     - Iterator
         - Produces next value with next()

In [1]:
word = 'Da'
it = iter(word)
next(it)

'D'

In [2]:
next(it)

'a'

In [3]:
next(it)

StopIteration: 

## Iterating at once with * 

In [5]:
word = 'Data'
it = iter(word)
print(*it)

D a t a


In [7]:
# No more values to go through! You cannnot do it agian as there are no more values to iterate through!
print(*it)




## Iterating over dictionaries

In [8]:
pythonistas = {'hugo': 'bowne - anderson', 'francis': 'castro'}
for key, value in pythonistas.items():
    print(key,value)

hugo bowne - anderson
francis castro


## Iterating over file connections

In [None]:
file = open('file.txt')
it =iter(file)
print(next(it))

## Using enumerate()
- enumerate() returns an enumerate object that produces a sequence of tuples, and each of the tuples is an index-value pair.

In [9]:
avengers = ['hawkeye','iron man', 'thor', 'quicksilver']
e = enumerate(avengers)
print(type(e))

<class 'enumerate'>


In [12]:
# We can use the fuction list to return this enumerate object into a list of tuples 
    # and print it to see what it contains!

avengers = ['hawkeye','iron man', 'thor', 'quicksilver']
e = enumerate(avengers)
e_list = list(e)
print(e_list)

[(0, 'hawkeye'), (1, 'iron man'), (2, 'thor'), (3, 'quicksilver')]


In [13]:
# Enumerate() and unpack 
avengers = ['hawkeye','iron man', 'thor', 'quicksilver']
for index, value in enumerate(avengers):
    print(index, value)

0 hawkeye
1 iron man
2 thor
3 quicksilver


In [14]:
avengers = ['hawkeye','iron man', 'thor', 'quicksilver']
for index, value in enumerate(avengers, start = 10):
    print(index, value)

10 hawkeye
11 iron man
12 thor
13 quicksilver


In [22]:
"""
Create a list of tuples from mutants and assign the result to mutant_list. 
Make sure you generate the tuples using enumerate() and turn the result from it into a list using list().
"""

# Create a list of strings: mutants
mutants = ['charles xavier', 
            'bobby drake', 
            'kurt wagner', 
            'max eisenhardt', 
            'kitty pryde']

# Create a list of tuples: mutant_list
mutant_list = list(enumerate(mutants))

# Print the list of tuples
print(mutant_list)


# Unpack and print the tuple pairs
for index1, value1 in enumerate(mutants):
    print(index1, value1)

[(0, 'charles xavier'), (1, 'bobby drake'), (2, 'kurt wagner'), (3, 'max eisenhardt'), (4, 'kitty pryde')]
0 charles xavier
1 bobby drake
2 kurt wagner
3 max eisenhardt
4 kitty pryde


## zip()
 - accepts an arbitrary number of iterables and returns an iterator of tuples

In [15]:
# zipping them together creates a zip object which is an iterator of tuples
avengers = ['hawkeye','iron man', 'thor', 'quicksilver']
names = ['barton', 'stark','odinson','maximoff']
z = zip(avengers, names)
print(type(z))

<class 'zip'>


In [16]:
# We can turn this zip object into a list and print the list 
z_list = list(z)
print(z_list)

[('hawkeye', 'barton'), ('iron man', 'stark'), ('thor', 'odinson'), ('quicksilver', 'maximoff')]


## zip() and unpack

In [17]:
avengers = ['hawkeye','iron man', 'thor', 'quicksilver']
names = ['barton', 'stark','odinson','maximoff']
for z1, z2 in zip(avengers, names):
    print(z1, z2)

hawkeye barton
iron man stark
thor odinson
quicksilver maximoff


## Print zip with *

In [20]:
avengers = ['hawkeye','iron man', 'thor', 'quicksilver']
names = ['barton', 'stark','odinson','maximoff']
z = zip(avengers, names)
print(*z) # splat operator to print all the elements!

('hawkeye', 'barton') ('iron man', 'stark') ('thor', 'odinson') ('quicksilver', 'maximoff')


## Using iterate to load large files into memory
 - load data in chunks (chuck_size)

In [None]:
# Iterating over data
import pandas as pd 
result = []
for chunk in pd.read_csv('data.csv', chunksize = 1000):
    result.append(sum(chunk['x'])) # x is the intersted column
total = sum(result)
print(total)

In [None]:
# Iterating over data
import pandas as pd 
result = 0
for chunk in pd.read_csv('data.csv', chunksize = 1000):
    total += sum(chunk['x'])
print(total)

In [None]:
# Initialize an empty dictionary: counts_dict
counts_dict = {}

# Iterate over the file chunk by chunk
for chunk in pd.read_csv('tweets.csv', chunksize=10):

    # Iterate over the column in DataFrame
    for entry in chunk['lang']:
        if entry in counts_dict.keys():
            counts_dict[entry] += 1
        else:
            counts_dict[entry] = 1

# Print the populated dictionary
print(counts_dict)

In [None]:
# Define count_entries()
def count_entries(csv_file, c_size, colname):
    """Return a dictionary with counts of
    occurrences as value for each key."""
    
    # Initialize an empty dictionary: counts_dict
    counts_dict = {}

    # Iterate over the file chunk by chunk
    for chunk in pd.read_csv(csv_file, chunksize = c_size):

        # Iterate over the column in DataFrame
        for entry in chunk[colname]:
            if entry in counts_dict.keys():
                counts_dict[entry] += 1
            else:
                counts_dict[entry] = 1

    # Return counts_dict
    return counts_dict

# Call count_entries(): result_counts
result_counts = count_entries('tweets.csv', 10, 'lang')

# Print result_counts
print(result_counts)

## List comprehension 
 - create list from other lists, DataFrame columns
 - Collapse for loops for building lists into a single line (More efficient than using a for loop)
 - Components
     - Iterable
     - Iterator variable (represent members of iterable)
     - output expression

In [24]:
# Populate a list with a for loop 
nums = [12, 8, 21, 3, 16]
new_nums = []
for num in nums: 
    new_nums.append(num +1)
print(new_nums)

[13, 9, 22, 4, 17]


In [25]:
new_nums = [num + 1 for num in nums]
print(new_nums)

[13, 9, 22, 4, 17]


In [32]:
# Nested loops
pairs_1 = []
for num1 in range(0, 2):
    for num2 in range(6, 8):
        pairs_1.append(num1, num2)
print(pairs_1)

TypeError: append() takes exactly one argument (2 given)

In [34]:
pairs_2 = [(num1, num2) for num1 in range(0,2) for num2 in range(6,8)]
print(pairs_2)

[(0, 6), (0, 7), (1, 6), (1, 7)]


In [None]:
## Using list comprehension to create matrix below: 
"""
 5 x 5 matrix with values 0 to 4 in each row can be written as:

matrix = [[0, 1, 2, 3, 4],
          [0, 1, 2, 3, 4],
          [0, 1, 2, 3, 4],
          [0, 1, 2, 3, 4],
          [0, 1, 2, 3, 4]]
          
1. In the inner list comprehension - that is, the output expression of the nested list comprehension 
    - create a list of values from 0 to 4 using range(). Use col as the iterator variable.
2. In the iterable part of your nested list comprehension, use range() to count 5 rows 
    - that is, create a list of values from 0 to 4. Use row as the iterator variable; 
        note that you won't be needing this variable to create values in the list of lists.
"""

In [35]:
# Create a 5 x 5 matrix using a list of lists: matrix
matrix = [[col for col in range(5)] for row in range(5)]

# Print the matrix
for row in matrix:
    print(row)

[0, 1, 2, 3, 4]
[0, 1, 2, 3, 4]
[0, 1, 2, 3, 4]
[0, 1, 2, 3, 4]
[0, 1, 2, 3, 4]


## Conditionals in comprehensions

In [36]:
[num **2 for num in range(10) if num % 2 ==0]

[0, 4, 16, 36, 64]

In [1]:
[num ** 2 if num % 2 == 0 else 0 for num in range(10)]

[0, 0, 4, 0, 16, 0, 36, 0, 64, 0]

## Dict comprehensions

In [2]:
pos_neg = {num: -num for num in range(9)}
print(pos_neg)

{0: 0, 1: -1, 2: -2, 3: -3, 4: -4, 5: -5, 6: -6, 7: -7, 8: -8}


In [3]:
print(type(pos_neg))

<class 'dict'>


## Generator expressions

In [4]:
# List comprehension
[2 * num for num in range(10)] 

[0, 2, 4, 6, 8, 10, 12, 14, 16, 18]

In [5]:
# Generator expression
(2 * num for num in range(10)) ## Helps in very large file: it does not yet create the entire list!

<generator object <genexpr> at 0x7f9742b94190>

In [12]:
# Create generator object: result
result = (num for num in range(0,11)) # Equals range(11)

# Print the first 5 values
print(next(result))
print(next(result))
print(next(result))
print(next(result))
print(next(result))

# Print the rest of the values
for value in result: ## You do not have to specified then you can get the value!
    print(value)

0
1
2
3
4
5
6
7
8
9
10


## Generator functions
- produces generator objects when called
- Defined like a regular function - def
- Yields a sequence of values instead of returning a single value
- Generates a value with yield keyword

In [7]:
# Build a generator function 
## The while loop is true until i equals n and then the generator ceases to yield values

def num_sequence(n):
    """ Generate values from 0 to n."""
    i = 0
    while i < n:
        yield i 
        i += 1

print(num_sequence(5))  

<generator object num_sequence at 0x7f9742bd6040>


In [8]:
result = num_sequence(5)
print(result)

<generator object num_sequence at 0x7f9742bd6200>


In [9]:
for item in result:
    print(item)

0
1
2
3
4


In [11]:
for item in result:
    print(*item)

In [13]:
# Create a list of strings
lannister = ['cersei', 'jaime', 'tywin', 'tyrion', 'joffrey']

# Define generator function get_lengths
def get_lengths(input_list):
    """Generator function that yields the
    length of the strings in input_list."""

    # Yield the length of a string
    for person in input_list:
        yield len(person)

# Print the values generated by get_lengths()
for value in get_lengths(lannister):
    print(value)

6
5
5
6
7


## Writing a generator to load data in chunks
- This concept of lazy evaluation is useful when you have to deal with very large datasets because it lets you generate values in an efficient manner by yielding only chunks of data at a time instead of the whole thing at once.

In [None]:
"""
In this exercise, you will define a generator function read_large_file() 
    that produces a generator object which yields a single line from a file each time next() is called on it. 
The csv file 'world_dev_ind.csv' is in your current directory for your use.

Note that when you open a connection to a file, the resulting file object is already a generator! 
So out in the wild, you won't have to explicitly create generator objects in cases such as this. 
However, for pedagogical reasons, we are having you practice how to do this here with the read_large_file() function. 
"""

# Define read_large_file()
def read_large_file(file_object):
    """A generator function to read a large file lazily."""

    # Loop indefinitely until the end of the file
    while True:

        # Read a line from the file: data
        data = file_object.readline()

        # Break if this is the end of the file
        if not data:
            break

        # Yield the line of data
        yield data

# Open a connection to the file
with open('world_dev_ind.csv') as file:

    # Create a generator object for the file: gen_file
    gen_file = read_large_file(file)

    # Print the first three lines of the file
    print(next(gen_file))
    print(next(gen_file))
    print(next(gen_file))

In [None]:
"""
Now let's use your generator function to process the World Bank dataset like you did previously. 
You will process the file line by line, 
    to create a dictionary of the counts of how many times each country appears in a column in the dataset. 
For this exercise, however, you won't process just 1000 rows of data, you'll process the entire dataset!
"""

# Initialize an empty dictionary: counts_dict
counts_dict = {}

# Open a connection to the file
with open('world_dev_ind.csv', 'r') as file:

    # Iterate over the generator from read_large_file()
    for line in read_large_file(file):

        row = line.split(',')
        first_col = row[0]

        if first_col in counts_dict.keys():
            counts_dict[first_col] += 1
        else:
            counts_dict[first_col] = 1

# Print            
print(counts_dict)


In [None]:
# Code from previous exercise
urb_pop_reader = pd.read_csv('ind_pop_data.csv', chunksize=1000)
df_urb_pop = next(urb_pop_reader)
df_pop_ceb = df_urb_pop[df_urb_pop['CountryCode'] == 'CEB']
pops = zip(df_pop_ceb['Total Population'], 
           df_pop_ceb['Urban population (% of total)'])
pops_list = list(pops)

# Use list comprehension to create new DataFrame column 'Total Urban Population'
df_pop_ceb['Total Urban Population'] = [
    int(pops_list[index]  *  (pops_list[index+1] * 0.01)) for index in pops_list
]

# Plot urban population data
df_pop_ceb.plot(kind='scatter', x='Year', y='Total Urban Population')
plt.show()