In [1]:
import matplotlib.pyplot as plt
from collections import defaultdict, Counter
import re as regex

from linear_algebra import *
from statistics import *

# Data containers in Python

## Lists

In [4]:
integer_list = [1, 2, 3]
heterogeneous_list = ["string", 0.1, True]
list_of_lists = [ integer_list, heterogeneous_list, [] ]
list_length = len(integer_list) # equals 3
list_sum = sum(integer_list) # equals 6

In Python 3, `range` is a "generator object", and does not return a List anymore (as in Python 2).

You need to convert it in a list:

In [140]:
x = range(10) # is a generator object [0, 1, ..., 9]. MUST BE CONVERTED IN LIST IN PYTHON 3

x = list(x) # is the list [0, 1, ..., 9]

zero = x[0] # equals 0, lists are 0-indexed
one = x[1] # equals 1
nine = x[-1] # equals 9, 'Pythonic' for last element
eight = x[-2] # equals 8, 'Pythonic' for next-to-last element
x[0] = -1 # now x is [-1, 1, 2, 3, ..., 9]

Slicing:

In [14]:
first_three = x[:3] # [-1, 1, 2]
three_to_end = x[3:] # [3, 4, ..., 9]
one_to_four = x[1:5] # [1, 2, 3, 4]
last_three = x[-3:] # [7, 8, 9]
without_first_and_last = x[1:-1] # [1, 2, ..., 8]
copy_of_x = x[:] # [-1, 1, 2, ..., 9]

List membership:

In [16]:
1 in [1, 2, 3] # True

True

Concatenate list:

In [17]:
x = [1, 2, 3]
x.extend([4, 5, 6]) # x is now [1,2,3,4,5,6]
print (x)

y = [1, 2, 3]
y.append(0) # x is now [1, 2, 3, 0]
print (y)

[1, 2, 3, 4, 5, 6]
[1, 2, 3, 0]


## Tuples

Same as lists, but immutable. Tuples are a convenient way to return multiple values from functions. 

In [4]:
my_list = [1, 2]
my_tuple = (3, 4)
other_tuple = 3, 4
print (my_list[0])
print (my_tuple[0])

1
3


In [7]:
my_tuple[0] = 5 #Tuples are immutable. Cannot assign

TypeError: 'tuple' object does not support item assignment

In [35]:
def sum_and_product(x, y):
    return (x + y),(x * y)

print (sum_and_product(3, 3))

(6, 9)


### Enumerate

`Enumerate()` is used to iterate enumerables, making use of both the iterated element and its index. 

In [86]:
text = "Lorem ips"

print (enumerate(text))

for char, index in enumerate(text):
    print (char)
    print (index)

<enumerate object at 0x000001FFF8C025A0>
0
L
1
o
2
r
3
e
4
m
5
 
6
i
7
p
8
s


## Dictionaries

Dictionary keys must be immutable; in particular, you cannot use lists as keys. If
you need a multipart key, you should use a tuple or figure out a way to turn the key
into a string.

In [46]:
empty_dict = {} # Pythonic
#empty_dict2 = dict() # less Pythonic. WORKS ONLY IN PYTHON 2

grades = { "Joel" : 80, "Tim" : 95 } # dictionary literal
print (grades["Joel"])

dictionary = {1: 5, 2: 6}
print (dictionary[1])

80
5


** Dictionary data retrieval**

Dictionary membership using `in`.

Dictionaries have a `get` method that returns a default value (instead of raising an
exception) when you look up a key that’s not in the dictionary

In [52]:
print("Joel" in grades) # True
print("Kate" in grades) # False

print(grades.get("Joel", 0)) # equals 80
print(grades.get("Kate", 0)) # equals 0
print(grades.get("No One")) # default default is None

True
False
80
0
None


**Get all data in dictionary**

You can check for the existence of a key using `in`:

In [55]:
tweet = {
"user" : "joelgrus",
"text" : "Data Science is Awesome",
"retweet_count" : 100,
"hashtags" : ["#data", "#science", "#datascience", "#awesome", "#yolo"]
}

tweet_keys = tweet.keys() # list of keys
tweet_values = tweet.values() # list of values
tweet_items = tweet.items() # list of (key, value) tuples
"user" in tweet_keys # True, but uses a slow list in
"user" in tweet # more Pythonic, uses faster dict in
"joelgrus" in tweet_values # True

True

**Dictionary assigment:**

In [53]:
grades["Tim"] = 99 # replaces the old value
grades["Kate"] = 100 # adds a third entry
num_students = len(grades) # equals 3

### Default dictionary

A
defaultdict is like a regular dictionary, except that when you try to look up a key it
doesn’t contain, it first adds a value for it using a zero-argument function you provided
when you created it. In order to use defaultdicts, you have to import them
from collections

In [76]:
from collections import defaultdict
word_counts = defaultdict(int) # int() produces 0

document = "Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book."
document = document.split() #document is a list now

for word in document:
    word_counts[word] += 1
    
print (word_counts)

defaultdict(<class 'int'>, {'Lorem': 2, 'Ipsum': 2, 'is': 1, 'simply': 1, 'dummy': 2, 'text': 2, 'of': 2, 'the': 3, 'printing': 1, 'and': 2, 'typesetting': 1, 'industry.': 1, 'has': 1, 'been': 1, "industry's": 1, 'standard': 1, 'ever': 1, 'since': 1, '1500s,': 1, 'when': 1, 'an': 1, 'unknown': 1, 'printer': 1, 'took': 1, 'a': 2, 'galley': 1, 'type': 2, 'scrambled': 1, 'it': 1, 'to': 1, 'make': 1, 'specimen': 1, 'book.': 1})


### Counter

A Counter turns a sequence of values into a defaultdict(int)-like object mapping
keys to counts. Useful to create **histograms**.

The returned defaultdict is already ordered from highest to lowest value.

A Counter instance has a `most_common(int)` method, useful to get a limited view of most frequent occurrencies.

In [116]:
from collections import Counter
c = Counter([0, 1, 2, 0]) # c is (basically) { 0 : 2, 1 : 1, 2 : 1 }

In [117]:
word_counts = Counter(document)
print (word_counts)

Counter({'the': 3, 'Lorem': 2, 'Ipsum': 2, 'dummy': 2, 'text': 2, 'of': 2, 'and': 2, 'a': 2, 'type': 2, 'is': 1, 'simply': 1, 'printing': 1, 'typesetting': 1, 'industry.': 1, 'has': 1, 'been': 1, "industry's": 1, 'standard': 1, 'ever': 1, 'since': 1, '1500s,': 1, 'when': 1, 'an': 1, 'unknown': 1, 'printer': 1, 'took': 1, 'galley': 1, 'scrambled': 1, 'it': 1, 'to': 1, 'make': 1, 'specimen': 1, 'book.': 1})


In [118]:
# print the 5 most common words and their counts
for word, count in word_counts.most_common(5):
    print (word, count)

the 3
Lorem 2
Ipsum 2
dummy 2
text 2


## Sets

Represents a collection of distinct elements. 

`in` is a very fast operation on sets.
If we have a large collection of items that we want to use for a membership test, a set
is more appropriate than a list.

In [119]:
s = set()
s.add(1) # s is now { 1 }
s.add(2) # s is now { 1, 2 }
s.add(2) # s is still { 1, 2 }

x = len(s) # equals 2

print( 2 in s )# equals True
print(3 in s) # equals False

True
False


Convert List in Set:

In [120]:
document = "Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book."
document = document.split() #document is a list now

words_set = set(document)

print (len(document)) #all the words
print (len(words_set)) #all unique words

43
33


In [121]:
print ("Lorem" in words_set) #very fast check

True


In [122]:
from collections import Counter

print(Counter(words_set))

Counter({'and': 1, 'industry.': 1, '1500s,': 1, 'the': 1, 'since': 1, 'type': 1, 'to': 1, 'Ipsum': 1, 'printer': 1, 'dummy': 1, 'make': 1, 'has': 1, 'of': 1, 'when': 1, 'an': 1, 'specimen': 1, 'simply': 1, 'typesetting': 1, 'scrambled': 1, 'text': 1, 'galley': 1, 'printing': 1, 'unknown': 1, 'is': 1, 'been': 1, 'ever': 1, 'took': 1, 'a': 1, 'Lorem': 1, 'standard': 1, 'it': 1, "industry's": 1, 'book.': 1})


# Data management

## List sorting

Careful for differences btw `List.Sort()` (change in place, returns `None`) and `Sorted(List)` (returns a List).

The `list.sort()` method is only defined for lists. 

In contrast, the `sorted()` function accepts any iterable.

In [123]:
numList = [4,2,1,3]
print (numList.sort()) #returns None
print (numList) #numList has been sorted

numList_reverse = sorted(numList,reverse=True) #returns sorted List
print (numList_reverse)

None
[1, 2, 3, 4]
[4, 3, 2, 1]


Instead of comparing the elements themselves, you can compare the
results of a **function** that you specify with `key`

In [135]:
# sort the list by absolute value from largest to smallest
x = sorted([-4,1,-2,3], key=abs, reverse=True) # is [-4,3,-2,1]
print (x)

[-4, 3, -2, 1]


In [50]:
# ONLY WORKS IN PYTHON 2    
# sort the words and counts from highest count to lowest
wc = sorted(word_counts.items(), 
            key=lambda (word, count): count, 
            reverse=True)


SyntaxError: invalid syntax (<ipython-input-50-a0368f8b8f45>, line 4)

## Dictionary sorting

Standard Dictionary objects are not sorted and so do not guarantee or preserve any ordering.

`sorted()` can be used on them to obtain a **list** of the _ordered keys_ or of the _ordered values_.

**Pretty useless. Use ordered dictionary instead.**

In [131]:
d = {'b': 2, 'a': 3, 'c': 1, 'd': 4}

listOfOrderedKeys = sorted(d)
print(listOfOrderedKeys)

listOfOrderedValues = sorted(d.values())
print(listOfOrderedValues)

['a', 'b', 'c', 'd']
[1, 2, 3, 4]


### Ordered dictionary

Ordered dictionaries are just like regular dictionaries but they remember the order that items were inserted. When iterating over an ordered dictionary, the items are returned in the order their keys were first added.

Since an ordered dictionary remembers its insertion order, it can be used in conjunction with sorting to make a sorted dictionary.

In [132]:
d = {"b":12, "a": 4, "c":2}

from collections import OrderedDict

# dictionary sorted by key
od_byKey = OrderedDict(sorted(d.items(), key=lambda t: t[0]))
print (od_byKey)

# dictionary sorted by value
od_byValue = OrderedDict(sorted(d.items(), key=lambda t: t[1]))
print(od_byValue)

OrderedDict([('a', 4), ('b', 12), ('c', 2)])
OrderedDict([('c', 2), ('a', 4), ('b', 12)])


## List comprehensions

Generate lists, or transform a list into another list, by choosing only certain
elements, or by transforming elements, or both

In [142]:
even_numbers = [x for x in range(5) if x % 2 == 0] # [0, 2, 4]
squares = [x * x for x in range(5)] # [0, 1, 4, 9, 16]
even_squares = [x * x for x in even_numbers] # [0, 4, 16]

print (even_numbers)
print(even_squares)

[0, 2, 4]
[0, 4, 16]


If you don’t need the value from the list, it’s **conventional** to use an **underscore** as the
variable:

In [146]:
zeroes = [0 for _ in even_numbers] # has the same length as even_numbers

print (zeroes)

[0, 0, 0]


### Generate combinations

In [8]:
pairs = [(x, y)
        for x in range(4)
        for y in range(4)] #pairs is a list of tuples

print (pairs)

[(0, 0), (0, 1), (0, 2), (0, 3), (1, 0), (1, 1), (1, 2), (1, 3), (2, 0), (2, 1), (2, 2), (2, 3), (3, 0), (3, 1), (3, 2), (3, 3)]


In [14]:
increasing_pairs = [(x, y) # only pairs with x < y,
                    for x in range(4) # range(lo, hi) equals
                    for y in range(x + 1, 4)] # [lo, lo + 1, ..., hi - 1]
    
print (increasing_pairs)

[(0, 1), (0, 2), (0, 3), (1, 2), (1, 3), (2, 3)]


### Generate random data/lists

In [18]:
import random
four_uniform_randoms = [random.random() for _ in range(4)]

print (four_uniform_randoms)

[0.5282761314560384, 0.5297452009533989, 0.10210819447884023, 0.8104931650722175]


Random is **deterministic**. Every time **any** `random` method is executed, the seed is changed.

In [49]:
random.seed(10)         # this fixes the seed on 10, so running this cell will always get the same pseudo-random result
print (random.random()) # 0.57140259469
print (random.random()) # 0.4288890

random.seed(10)         # reset the seed to 10. This prevent different random to be generated. 
print (random.random()) # 0.57140259469 again
print (random.random()) # 0.4288890 again

0.5714025946899135
0.4288890546751146
0.5714025946899135
0.4288890546751146


### Pick n' choose (randomly) — generate random lists

In [78]:
random.seed(10) 

x = random.randrange(10) # choose randomly from range(10) = [0, 1, ..., 9]
print (x) #x is 9

y = random.randrange(3, 6) # choose randomly from range(3, 6) = [3, 4, 5]
print(y) #y is 3

z = [random.randrange(3,6) for _ in range(10)] #list comprehension: generate 10 random numbers between 3-5
print(z) #z is [4, 4, 5, 3, 3, 4, 4, 4, 5, 3]

z1 = [random.choice(range(3,6)) for _ in range(10)] # equivalent to the previous, but using "choice"
print (z1)

z_noReplacement = random.sample(range(3,6),2) #choose randomly but with no repetition (no 'replacement')
print (z_noReplacement) #z is [3, 4]

dudes = ["Alice", "Bob", "Charlie", "Gnappo"]
dude = random.choice(dudes)
print (dude) #Charlie


9
3
[4, 4, 5, 3, 3, 4, 4, 4, 5, 3]
[3, 5, 4, 4, 3, 3, 5, 4, 3, 4]
[3, 4]
Gnappo


## Dictionary comprehension

Convert a list into a dictionary with unique keys:

`d = {key: value for (key, value) in iterable}`

In [107]:
square_dict = { x : x * x for x in range(5) }

print (square_dict) # { 0:0, 1:1, 2:4, 3:9, 4:16 }

{0: 0, 1: 1, 2: 4, 3: 9, 4: 16}


## Set comprehension

In [105]:
square_set = { x * x for x in [1, -1] }

print (square_set)  # { 1 }

{1}


## Zip and argument unpacking

In Python 3, `Zip()` is a generator object, exactly like `range()`. You need to convert it into a list.

In [94]:
list1 = ['a', 'b', 'c']
list2 = [1, 2, 3]

zipGenerator = zip(list1, list2) #unlike Python 2, this now returns a Generator object
zipList = list(zipGenerator) #Convert into List (Python3)

print (zipList) # is [('a', 1), ('b', 2), ('c', 3)]



[('a', 1), ('b', 2), ('c', 3)]


_Unzip_ or _argument unpacking_:

In [97]:
print(list(zip(*zipList)))

[('a', 'b', 'c'), (1, 2, 3)]


In [102]:
list1 = ['a', 'b', 'c']
list2 = [1, 2, 3]
list3 = ["alpha","beta","gamma"]

l123 = zip(list1,list2,list3)
print (list(l123))

print(list(zip(*l123))) #doesn't work with 3 lists!! Guess must be tuples

[('a', 1, 'alpha'), ('b', 2, 'beta'), ('c', 3, 'gamma')]
[]


## Regex

In [80]:
import re

print (all([ # all of these are true, because
    not re.match("a", "cat"), # * 'cat' doesn't start with 'a'
    re.search("a", "cat"), # * 'cat' has an 'a' in it
    not re.search("c", "dog"), # * 'dog' doesn't have a 'c' in it
    3 == len(re.split("[ab]", "carbs")), # * split on a or b to ['c','r','s']
    "R-D-" == re.sub("[0-9]", "-", "R2D2") # * replace digits with dashes
])) # prints True

True
