<a href="https://colab.research.google.com/github/andresrivera125/colab-books/blob/main/08-Python-data-science-toolbox-%20part-2/08-Python-data-science-toolbox-part-2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Chapter 01 - Using iterators in PythonLand



## Introduction to iterators

- Iterable

An object that implements the `__iter__()` method (the method the built-in `iter()` calls). This method must return an iterator object.

- Iterator

An object that keeps state and produces the next value each time you call `next()` on it.

In [None]:
# The four Flash alter egos, as a plain list: flash
flash = ['jay garrick', 'barry allen', 'wally west', 'bart allen']

# A for loop obtains an iterator from the list behind the scenes
for name in flash:
    print(name)


# Obtain the iterator explicitly this time: superhero
superhero = iter(flash)

# Each next() call hands back one item; one call per list element
for _ in range(len(flash)):
    print(next(superhero))

In [None]:
# Wrap range(3) in an explicit iterator: small_value
small_value = iter(range(3))

# Drain small_value by hand, one next() call per value
for _ in range(3):
    print(next(small_value))

# The same three values, this time letting a for loop drive the iteration
for number in range(3):
    print(number)


# A range is lazy, so even range(10 ** 100) can be iterated: googol
googol = iter(range(10 ** 100))

# Only the five values requested here are ever produced
for _ in range(5):
    print(next(googol))

In [None]:
# A range covering 10 through 20 inclusive: values
values = range(10, 21)

# Printing a range shows its bounds, not its items
print(values)

# Materialise the range into a list: values_list
values_list = [*values]
print(values_list)

# sum() accepts any iterable, so the range can be passed directly: values_sum
values_sum = sum(values)
print(values_sum)

## Playing with iterators

In [None]:
# enumerate() pairs every item with a running index
# The list to enumerate: mutants
mutants = [
    'charles xavier',
    'bobby drake',
    'kurt wagner',
    'max eisenhardt',
    'kitty pryde',
]

# Collect the (index, name) pairs into a list: mutant_list
mutant_list = list(enumerate(mutants))
print(mutant_list)

# Unpack each pair directly in the loop header (indices start at 0)
for position, name in enumerate(mutants):
    print(position, name)

# Same loop, but counting from 1 instead of 0
for position, name in enumerate(mutants, 1):
    print(position, name)

In [None]:
# Zip
# zip() needs its inputs to exist: define the two companion lists here so the
# cell runs on a fresh kernel. Items line up by position with `mutants`.
aliases = ['prof x', 'iceman', 'nightcrawler', 'magneto', 'shadowcat']
powers = ['telepathy', 'thermokinesis', 'teleportation', 'magnetokinesis', 'intangibility']

# Create a list of tuples: mutant_data
mutant_data = list(zip(mutants, aliases, powers))

# Print the list of tuples
print(mutant_data)

# Create a zip object using the three lists: mutant_zip
mutant_zip = zip(mutants, aliases, powers)

# Print the zip object (shows the lazy iterator itself, not its contents)
print(mutant_zip)

# Unpack the zip object and print the tuple values
for mutant, alias, power in mutant_zip:
    print(mutant, alias, power)

In [None]:
# Create a zip object from mutants and powers: z1
# (both names must already be defined in the session)
z1 = zip(mutants, powers)

# Print the tuples in z1 by unpacking with *
# Without the star, print() would show only the zip object itself.
print(*z1)

# Unpacking exhausted the iterator, so re-create the zip object: z1
z1 = zip(mutants, powers)

# 'Unzip' the tuples in z1 by unpacking with * and zip(): result1, result2
result1, result2 = zip(*z1)

# Check if unpacked tuples are equivalent to original sequences.
# zip() always yields tuples, so convert the originals to tuples before
# comparing; a tuple never compares equal to a list.
print(result1 == tuple(mutants))
print(result2 == tuple(powers))

## Using iterators to load large files into memory


In [None]:
# Read a large file piece by piece instead of loading it all at once
import pandas as pd

# One partial sum of column 'x' for every 1000-row chunk: result
result = [sum(chunk['x']) for chunk in pd.read_csv('data.csv', chunksize=1000)]

# Adding up the partial sums gives the total for the whole file
print(sum(result))

In [None]:
# Alternative: keep one running total rather than a list of partial sums
import pandas as pd

# Fold the sum of column 'x' of every 1000-row chunk into one number: total
total = sum(sum(chunk['x']) for chunk in pd.read_csv('data.csv', chunksize=1000))

print(total)

In [None]:
# Start from an empty tally: counts_dict
counts_dict = {}

# Walk through tweets.csv ten rows at a time
for df in pd.read_csv('tweets.csv', chunksize=10):

    # Add one to the count of every value found in the 'lang' column
    for label in df['lang']:
        counts_dict[label] = counts_dict.get(label, 0) + 1

# Show the finished tally
print(counts_dict)

In [None]:
# Reusable version of the chunked counting loop
def count_entries(csv_file, c_size, colname):
    """Count how often each value occurs in one column of a CSV file.

    The file is read with pandas in chunks of c_size rows, so the whole
    file is never held in memory at once.

    Args:
        csv_file: Path or file-like object handed to pd.read_csv.
        c_size: Number of rows per chunk.
        colname: Name of the column whose values are counted.

    Returns:
        A dict mapping each value seen in the column to its number of
        occurrences, in order of first appearance.
    """
    tally = {}

    for chunk in pd.read_csv(csv_file, chunksize=c_size):
        for value in chunk[colname]:
            # dict.get supplies 0 the first time a value shows up
            tally[value] = tally.get(value, 0) + 1

    return tally

# Tally the 'lang' column of tweets.csv, ten rows per chunk: result_counts
result_counts = count_entries(csv_file='tweets.csv', c_size=10, colname='lang')

# Show the tally
print(result_counts)

# Chapter 02 - List comprehensions and generators

## List comprehensions

In [None]:
# Populate a list with a for loop

nums = [12, 8, 21, 3, 16]
new_nums = []

# Append each incremented value one at a time
for value in nums:
    new_nums.append(value + 1)

print("New_nums using a for loop")
print(new_nums)

In [1]:
# Build the same list in a single expression with a list comprehension

nums = [12, 8, 21, 3, 16]
# [<output expression> for <variable> in <iterable>]
new_nums = [value + 1 for value in nums]

print("New_nums using a list comprehension")
print(new_nums)

New_nums using a list comprehension
[13, 9, 22, 4, 17]


In [4]:
# Nested loops

pairs_1 = []

# The outer loop picks the first member, the inner loop the second
for first in range(2):
    for second in range(6, 8):
        pairs_1.append((first, second))

print(pairs_1)

[(0, 6), (0, 7), (1, 6), (1, 7)]


In [5]:
# Nested loops written as a list comprehension
# The for clauses appear in the same order as the equivalent nested loops

pairs_2 = [
    (first, second)
    for first in range(2)
    for second in range(6, 8)
]
print(pairs_2)

[(0, 6), (0, 7), (1, 6), (1, 7)]


In [6]:
# Show the first character of each word in a list
doctor = ['house', 'cuddy', 'chase', 'thirteen', 'wilson']
# Index 0 of a string is its first character
print([name[0] for name in doctor])

['h', 'c', 'c', 't', 'w']


In [8]:
# A list comprehension producing the squares of
# the integers 0 through 9

print([value ** 2 for value in range(10)])

[0, 1, 4, 9, 16, 25, 36, 49, 64, 81]


In [9]:
# A matrix represented as a list whose items are themselves lists
# Build a 5 x 5 matrix, every row holding 0..4: matrix
matrix = [list(range(5)) for _ in range(5)]

# Print the rows of the matrix
print([row for row in matrix])

[[0, 1, 2, 3, 4], [0, 1, 2, 3, 4], [0, 1, 2, 3, 4], [0, 1, 2, 3, 4], [0, 1, 2, 3, 4]]


## Advanced comprehensions

In [10]:
# Two places a condition can go inside a comprehension

# 1. After the iterable: acts as a filter, odd numbers are skipped
print([value ** 2 for value in range(10) if value % 2 == 0])

# 2. In the output expression: every number yields an item,
#    odd numbers contribute 0 instead of their square
print([value ** 2 if value % 2 == 0 else 0 for value in range(10)])

[0, 4, 16, 36, 64]
[0, 0, 4, 0, 16, 0, 36, 0, 64, 0]


### Dict comprehensions

In [11]:
# Dict comprehensions build dictionaries
# Curly braces {} replace the brackets [], and the output is key: value

# Map each of 0..8 to its negation: pos_neg
pos_neg = {value: -value for value in range(9)}
print(pos_neg)

{0: 0, 1: -1, 2: -2, 3: -3, 4: -4, 5: -5, 6: -6, 7: -7, 8: -8}


In [12]:
# The nine-minus-two walkers, as a list of strings: fellowship
fellowship = ['frodo', 'samwise', 'merry', 'aragorn', 'legolas', 'boromir', 'gimli']

# Filter with a condition on the iterable: keep only the members
# whose name is at least 7 characters long: new_fellowship
new_fellowship = [name for name in fellowship if len(name) >= 7]

# Show the filtered list
print(new_fellowship)

['samwise', 'aragorn', 'legolas', 'boromir']


In [None]:
# The list of names to work on: fellowship
fellowship = ['frodo', 'samwise', 'merry', 'aragorn', 'legolas', 'boromir', 'gimli']

# Condition on the output expression: names of 7+ characters are kept,
# shorter ones are replaced by an empty string: new_fellowship
new_fellowship = [name if len(name) >= 7 else '' for name in fellowship]

# Show the result; it has the same length as the input list
print(new_fellowship)

- Create a dict comprehension where the key is a string in `fellowship` and the value is the length of that string. Remember to use the syntax `<key> : <value>` in the output expression of the comprehension to create the entries of the dictionary. Use `member` as the iterator variable.

In [13]:
# The names that become the dictionary keys: fellowship
fellowship = ['frodo', 'samwise', 'merry', 'aragorn', 'legolas', 'boromir', 'gimli']

# Dict comprehension mapping each name to its length: new_fellowship
new_fellowship = {
    member: len(member)
    for member in fellowship
}

# Show the resulting dictionary
print(new_fellowship)

{'frodo': 5, 'samwise': 7, 'merry': 5, 'aragorn': 7, 'legolas': 7, 'boromir': 7, 'gimli': 5}


## Introduction to generator expressions

In [14]:
# Brackets: a list comprehension, every value is computed right away
list_a = [num * 2 for num in range(10)]
print(list_a)
print(type(list_a))

# Parentheses: a generator expression, nothing is computed yet
# Printing it shows the generator object, not the values
generator = (num * 2 for num in range(10))
print(generator)
print(type(generator))

[0, 2, 4, 6, 8, 10, 12, 14, 16, 18]
<class 'list'>
<generator object <genexpr> at 0x7fb7da7246d0>
<class 'generator'>


### List comprehensions vs generators

- A list comprehension returns a list
- A generator expression returns a generator object
- Both can be iterated over

In [18]:
# Printing the values held by a generator

result = (value for value in range(6))
# Iterating over the generator is what actually produces its values
print(list(result))

[0, 1, 2, 3, 4, 5]


In [20]:
# Lazy evaluation: a value is only computed when next() asks for it
result = (value for value in range(6))

# Six next() calls, one per value the generator can produce
for _ in range(6):
    print(next(result))

0
1
2
3
4
5


- Generators don't evaluate the whole expression immediately. That makes them great for working with large sequences, because the elements are generated on the fly, one at a time.

Let's see an example:

In [None]:
# Using a list comprehension
# WARNING: do not actually run this cell. A list comprehension builds every
# element eagerly, and range(10**1000000) has far more elements than any
# machine can store, so the cell cannot finish (expect a MemoryError or a
# stalled kernel). It is kept only as the counter-example to the lazy
# generator version of the same range.
large_list = [num for num in range(10**1000000)]

In [None]:
# The same enormous range, wrapped in a generator expression instead
# Creating it is cheap: no element is produced until something asks for one
large_generator = (value for value in range(10 ** 1000000))

### Generator functions

- Produces generator objects when called
- Defined like a regular function - def
- Yields a sequence of values instead of returning a single value.
- Generates a value with **yield** keyword.

In [22]:
# A generator function: written with def, but hands values out with yield

def num_sequence(n):
    """Yield 0, 1, 2, ... for as long as the value stays below n.

    The upper bound n itself is not produced.
    """
    current = 0
    while True:
        if not current < n:
            return
        yield current
        current += 1

# Calling the function returns a generator object; no body code has run yet
result = num_sequence(5)
print(type(result))
# Iterating over the generator is what produces the values
print(list(result))

<class 'generator'>
[0, 1, 2, 3, 4]


In [None]:
# Generator over the integers 0 through 30: result
result = (value for value in range(31))

# Take the first 5 values by hand with next()
for _ in range(5):
    print(next(result))

# A for loop resumes where next() stopped and prints everything that is left
for remaining in result:
    print(remaining)

In [None]:
# House members, as a list of strings: lannister
lannister = ['cersei', 'jaime', 'tywin', 'tyrion', 'joffrey']

# Generator expression producing the length of each name: lengths
lengths = (len(name) for name in lannister)

# Each pass of the loop pulls one length out of the generator
for name_length in lengths:
    print(name_length)


In [None]:
# House members, as a list of strings: lannister
lannister = ['cersei', 'jaime', 'tywin', 'tyrion', 'joffrey']

# Generator-function counterpart of a generator expression over len()
def get_lengths(input_list):
    """Yield len(item) for every item of input_list, in order.

    Being a generator function, it produces the lengths one at a
    time as the caller iterates, rather than building a list.
    """
    for name in input_list:
        # Hand back one length, then pause until the next one is requested
        yield len(name)

# Iterate over the generator returned by get_lengths() and print each length
for name_length in get_lengths(lannister):
    print(name_length)


In [None]:
# Load the tweets explicitly: df
# Without this, `df` would only be the loop variable left behind by an
# earlier chunked read of tweets.csv, i.e. just the final 10-row chunk.
df = pd.read_csv('tweets.csv')

# Extract the created_at column from df: tweet_time
tweet_time = df["created_at"]

# Extract the clock time: tweet_clock_time
# Characters 11-18 of each entry are kept.
# NOTE(review): presumably the HH:MM:SS part of the timestamp; confirm
# against the actual created_at format.
tweet_clock_time = [entry[11:19] for entry in tweet_time]

# Print the extracted times
print(tweet_clock_time)


In [None]:
# Pull the created_at column out of df: tweet_time
# NOTE(review): `df` is not created in this cell; it must already exist
# in the session. Confirm which frame is intended.
tweet_time = df['created_at']

# Keep characters 11-18 of every entry whose characters 17-18 equal '19'
# (presumably the seconds field; verify against the created_at format):
# tweet_clock_time
tweet_clock_time = [
    stamp[11:19]
    for stamp in tweet_time
    if stamp[17:19] == '19'
]

# Show the selected times
print(tweet_clock_time)


# Bringing it all together

## Welcome to the case study!

In [24]:
# zip() pairs up the items of several iterables, position by position
avengers = ["hawkeye", "iron man", "thor", "quicksilver"]
names = ["barton", "stark", "odinson", "maximoff"]

# The result is a lazy zip object, not a list: z
z = zip(avengers, names)
print(type(z))

# Converting to a list consumes the zip object and reveals the pairs
# NOTE(review): the saved output shows a TypeError at this point, which is
# what happens when the name `list` has been rebound earlier in the session;
# presumably stale kernel state. Confirm by rerunning on a fresh kernel.
print(list(z))

<class 'zip'>


TypeError: ignored