# Python for Data Analysis - Workbook

### Preliminaries

In [5]:
import numpy as np
import pandas as pd

In [6]:
df = pd.read_csv('../Memorial Day DS Project/us_only_v1.0.csv')

## Basics: Built-in Data Structures, Functions, and Files

### Data Structures and sequences

#### Tuples

In [8]:
# a tup is a fixed length, immutable sequence of python obj's. They can be created like this:

tup = 4,5,6

In [9]:
# This one is nested
nested_tup = (4,5,6),(7,8)

In [11]:
# Any sequence or iterator (more on that later) can be converted to a tuple like this:

tuple([4,4,2])
tuple('string')

('s', 't', 'r', 'i', 'n', 'g')

In [12]:
# Just like any C, C++, or aa array, tuples can be accessed with thte [] notation
tup[0]

4

In [15]:
# You can concatenate tuples using the '+' symbol to produce longer tuples
tup + nested_tup

(4, 5, 6, (4, 5, 6), (7, 8))

In [16]:
# But using the '*' character ('*' means multiply) instead of the '+' character has the
# effect of creating that many copies of the tuple

tup * 4

(4, 5, 6, 4, 5, 6, 4, 5, 6, 4, 5, 6)

In [17]:
# Replicate a swap function as in e.g. C

tup = [4,5,6]
a, b = 1, 2

In [21]:
# Common use for variable unpacking is iterating over sequences or lists

seq = [1,2,3], [4,5,6], [7,8,9]

for a,b,c in seq:
    print('a={0} b={1}, c={2}'.format(a,b,c))

a=1 b=2, c=3
a=4 b=5, c=6
a=7 b=8, c=9


In [22]:
# Count the number of occurences of a particular value in a list

a = (1,2,2,2,3,4,2)
a.count(2)

4

#### Lists

In [23]:
# List is variable length and contents can be modified in-place
# Defined using square brackets
# Equivalent to Javascript Array

In [24]:
# List function is commonly used in data processing as a way to create an iterator expression

gen = range(10)

In [25]:
gen

range(0, 10)

In [27]:
# Now we have a list from 0 - 9 (10 values)

list(gen)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [34]:
ex_list = ['foo', 'bar', 'baz']

In [35]:
# Can append or insert elements
ex_list.append('bow')

In [36]:
# Note: inserting/popping elements in a list is computationally expensive
ex_list.insert(0, 'dog')
ex_list

['dog', 'foo', 'bar', 'baz', 'bow']

In [37]:
ex_list.pop(0)
ex_list

['foo', 'bar', 'baz', 'bow']

In [38]:
# Or remove by value
ex_list.remove('baz')
ex_list

['foo', 'bar', 'bow']

In [42]:
# Check to see if a value is included in a specific list
'foo' in ex_list

True

In [43]:
'foo' not in ex_list

False

In [45]:
# Concatenation (expensive)

ex_list2 = ['dog', 'cat', 'mouse']
ex_list + ex_list2

['foo', 'bar', 'bow', 'dog', 'cat', 'mouse']

In [47]:
# Extension (cheaper)

ex_list.extend(ex_list2)
ex_list

['foo', 'bar', 'bow', 'dog', 'cat', 'mouse', 'dog', 'cat', 'mouse']

In [49]:
# Sorting lists

a = [3,4,1]
a.sort()
a

[1, 3, 4]

In [51]:
# Can pass in different keys, or functions that produce a value to search the objects

ex_list.sort(key=len)
ex_list

['foo', 'bar', 'bow', 'dog', 'cat', 'dog', 'cat', 'mouse', 'mouse']

##### Binary Search

In [62]:
# Binary search is implemented through the native "bisect" module

import bisect
c = [1,2,2,2,3,4,7]

In [63]:
bisect.bisect(c,2)

4

In [64]:
bisect.bisect(c,7)

7

In [65]:
# Inserting an element
# Note...list has to be sorted first!

bisect.insort(c,6)
c

[1, 2, 2, 2, 3, 4, 6, 7]

##### Slicing a List

In [66]:
# Slicing - basic form is start:stop to the indexing operator

seq = [7,2,3,7,5,6,0,1]
seq[1:5]

[2, 3, 7, 5]

In [67]:
# Can also assign with a sequence

seq[3:4] = [7,8]
seq

[7, 2, 3, 7, 8, 5, 6, 0, 1]

In [68]:
# Use negative indices to slice the sequence relative to the end

seq[-4:]

[5, 6, 0, 1]

In [69]:
seq[4:]

[8, 5, 6, 0, 1]

In [72]:
# Step size can be defined after a second colon, as below
seq[1:8:2]

[2, 7, 5, 0]

In [73]:
# Here's a handy way to reverse a list

seq[::-1]

[1, 0, 6, 5, 8, 7, 3, 2, 7]

##### Built in Sequence Functions

In [79]:
# Keep track of index of current item when enumerating over a list/dict

# Hard Way:

collection = [1,2,3]

i = 0
print('Hard Way\n')
for item in collection:
    print(item)
    i += 1

# Easy Way

print('\nEasy Way\n')
for i, value in enumerate(collection):
    print(item)

Hard Way

1
2
3

Easy Way

3
3
3


In [128]:
# Example - create some dict that is a mapping between two lists

some_list = ['foo', 'bar', 'baz']
mapping = {}

for i,v in enumerate(some_list):
    mapping[i] = v
    
print(mapping)

# Really easy way to make a dictionary
print(dict(mapping))

{0: 'foo', 1: 'bar', 2: 'baz'}
{0: 'foo', 1: 'bar', 2: 'baz'}


In [85]:
# Sorting

sorted([5,2,1,12,2])

[1, 2, 2, 5, 12]

In [86]:
sorted('horse race')

[' ', 'a', 'c', 'e', 'e', 'h', 'o', 'r', 'r', 's']

In [89]:
# Zipping

seq1 = ['foo', 'bar', 'baz']
seq2 = ['dog', 'cat', 'three']

zipped = zip(seq1, seq2)
list(zipped)

[('foo', 'dog'), ('bar', 'cat'), ('baz', 'three')]

In [92]:
# Combining zip and enumerate for ultimate pythonism


seq1 = ['foo', 'bar', 'baz']
seq2 = ['dog', 'cat', 'three']

for i, (a,b) in enumerate(zip(seq1, seq2)):
    print('{0}: {1}, {2}'.format(i, a, b))

0: foo, dog
1: bar, cat
2: baz, three


In [108]:
# Unzip in a hacky way using '*'

pitchers = [('Nolan', 'Ryan'), ('Roger', 'Clemens'),('Schilling', 'Curt')]
first_names, last_names = zip(*pitchers)
first_names

('Nolan', 'Roger', 'Schilling')

In [112]:
# The above uses the '*' operator to 'unpack' the list. See below example

a,*b = pitchers
print(a)
print(b)

('Nolan', 'Ryan')
[('Roger', 'Clemens'), ('Schilling', 'Curt')]


#### Dicts (Also called key-value pairs or hashmaps)

In [122]:
ex_dict = {'a': 'some value', 'b': [1, 2, 3, 4]}

In [123]:
# Access dict item
ex_dict['a']

'some value'

In [124]:
# Replace item
ex_dict['b'] = [5, 6, 7, 8]
print(ex_dict)

{'a': 'some value', 'b': [5, 6, 7, 8]}


In [125]:
# Add item
ex_dict['fish'] = 'red'
print(ex_dict)

{'a': 'some value', 'b': [5, 6, 7, 8], 'fish': 'red'}


In [126]:
# Check for a key
'b' in ex_dict

True

In [127]:
# Two ways to delete: del which deletes the k/v or pop which returns the value and deletes key

del ex_dict['b']
print(ex_dict.pop('a'))

some value


In [135]:
# Keys and values
print(list(ex_dict.keys()))
print(list(ex_dict.values()))

['fish']
['red']


##### Merging two dicts

In [138]:
d1 = {'red': 1, 'blue': 2}
d2 = {'green': 3, 'purple': 4}
d1.update(d2)
print(d1)

# Note: changes in place

{'red': 1, 'blue': 2, 'green': 3, 'purple': 4}


#### Creating dicts from sequences

In [143]:
# dict is essentially a collection of 2-tuples
# Simple way to create a dict out of two lists

key_list = ['dog', 'cat', 'rabbit']
value_list = [1,2,3]

mapping = {}
for key, value in zip(key_list, value_list):
    mapping[key] = value
    
mapping

{'dog': 1, 'cat': 2, 'rabbit': 3}

##### Setting default values

In [147]:
# Hard way:

if key in some_dict:
    value = some_dict[key]
else:
    value = default

In [151]:
# can replace with get method (or pop):

ex_dict = {'dog':1, 'cat':2}
default = 3
print(ex_dict.get('dog', default))
print(ex_dict.get('baz', default))

1
3


#### Sets

A set is an unordered collection of unique elements

In [152]:
s1 = set([2,3,2,4,1,4,2,1,6])
print(s1)

{1, 2, 3, 4, 6}


In [154]:
# another way to create sets
s2 = {5,6,7,8}

In [156]:
# Supports all the mathetmatical set operations

s1.intersection(s2)

{6}

In [161]:
s1.union(s2)

# or

s1 | s2

{1, 2, 3, 4, 5, 6, 7, 8}

In [162]:
s1.difference(s2)

# or

s1 - s2

{1, 2, 3, 4}

In [163]:
s2.difference(s1)

# or

s2 - s1

{5, 7, 8}

In [164]:
s1.symmetric_difference(s2)

# or

s1 ^ s2

{1, 2, 3, 4, 5, 7, 8}

In [167]:
s1.issubset(s2)

# or

s1 <= s2

False

In [168]:
s1.issuperset(s2)

# or

s1 >= s2

False

In [169]:
# Plus many others

### List/Set/Dict Comprehension

##### List Comprehension

In [174]:
# List comprehension replaces this -->

# result = []
# for val in collection:
#     if condition:
#         result.append(expr)
        
# ...with this -->

# [expr for val in collection if condition]

In [186]:
# Example of list comprehension:

strings = ['a', 'as', 'bat', 'car', 'dove']
[x.upper() for x in strings if len(x) > 2]

# Note that the output is a list

['BAT', 'CAR', 'DOVE']

In [192]:
# Can also do nested list comprehensions

name_list = [['John', 'Emily', 'Valerie', 'Mickels'], ['Maria', 'Juan', 'Pilar', 'Steven']]

# HARD WAY

names_of_interest = []
for names in name_list:
    double_e = [name for name in names if name.count('e') >= 2]
    names_of_interest.extend(double_e)

print(names_of_interest)

# EASY WAY

result = [name for names in name_list for name in names if name.count('e') >= 2]
print(result)

['Valerie', 'Steven']
['Valerie', 'Steven']


##### Set/Dict Comprehension

In [183]:
# Create a set based on lengths of words

strings = ['a', 'as', 'bat', 'car', 'dove']
unique_lengths = {len(x) for x in strings}
print(unique_lengths)

{1, 2, 3, 4}


In [185]:
# Create a location mapping a dictionary
strings = ['a', 'as', 'bat', 'car', 'dove']
loc_mapping = {val : index for index, val in enumerate(strings)}
print(loc_mapping)

{'a': 0, 'as': 1, 'bat': 2, 'car': 3, 'dove': 4}


### Functions

In [194]:
# Note, all functions in Python are represented as objects
# So we can pass in a list of functions to to other functions like this:

def clean_strings(strings, ops):
    result = []
    for value in strings:
        for function in ops:
            value = function(value)
        result.append(value)
    return result

strs = ['Alice', 'Bob  ', 'charlie']
clean_ops = [str.strip, str.upper]

print(clean_strings(strs, clean_ops))

['ALICE', 'BOB', 'CHARLIE']


#### Map

In [197]:
# Applies a function to a sequence of some kind

strs = ['Alice', 'Bob  ', 'charlie']

for string in map(str.upper, strs):
    print(string)

ALICE
BOB  
CHARLIE


#### Lambda Functions

In [204]:
# Lambda functions are the Python version of anonymous functions

def short(x):
    return x*2

lambda_short = lambda x: x*2

In [205]:
# Another practical example

def apply_to_list(some_list, f):
    return [f(x) for x in some_list]

ints = [4, 0, 1, 2,6]
apply_to_list(ints, lambda x: x*2)

[8, 0, 2, 4, 12]

#### Generators

In [207]:
# Python features an iterator protocol, a generic way to make objects iterable
# e.g. iterating over a dict yields the dict keys

some_dict = {'a':1, 'b':2, 'c':3}
for key in some_dict:
    print(key)

a
b
c


In [215]:
# This can be called directly using the 'iter' keyword

some_dict = {'a':1, 'b':2, 'c':3}

dict_iterator = iter(some_dict)

# Can pass into methods expecting a list or list-like object

print(list(dict_iterator))
# print(max(dict_iterator))

['a', 'b', 'c']


#### Generators

In [216]:
# A generator is a concise way to construct a new iterable object
# Generator returns multiple results lazily, pausing until next is requested
# Use the 'yield' keyword instead of 'return' to create a generator

def squares(n=10):
    for i in range(1, n+1):
        yield i ** 2

gen = squares()
gen

<generator object squares at 0x122978570>

In [217]:
# Gets evaluated when you iterate over it

for x in gen:
    print(x, end= ' ')

1 4 9 16 25 36 49 64 81 100 

In [220]:
# Can also use an expression to make generators more concise (similar to list comp)

gen = [x ** 2 for x in range(10)]
for x in gen:
    print(x, end= ' ')

0 1 4 9 16 25 36 49 64 81 

In [221]:
# Can use generators in place of list comprehensions 

sum(x ** 2 for x in range(100))

328350

In [232]:
# Lots of useful generators in itertools here
# https://docs.python.org/3/library/itertools.html

# A couple demonstrated:

In [231]:
# Don't forget to import itertools!
import itertools as it

# Combinations

ex_set = {1, 2, 6, 3, 135,6, 6231, 6, 4}
list(it.combinations(ex_set, 2))

[(1, 2),
 (1, 3),
 (1, 4),
 (1, 6),
 (1, 135),
 (1, 6231),
 (2, 3),
 (2, 4),
 (2, 6),
 (2, 135),
 (2, 6231),
 (3, 4),
 (3, 6),
 (3, 135),
 (3, 6231),
 (4, 6),
 (4, 135),
 (4, 6231),
 (6, 135),
 (6, 6231),
 (135, 6231)]

In [235]:
# Permutations
import itertools as it

ex_set = {1, 2, 6, 3, 135,6, 6231, 6, 4}
list(it.permutations(ex_set, 2))

[(1, 2),
 (1, 3),
 (1, 4),
 (1, 6),
 (1, 135),
 (1, 6231),
 (2, 1),
 (2, 3),
 (2, 4),
 (2, 6),
 (2, 135),
 (2, 6231),
 (3, 1),
 (3, 2),
 (3, 4),
 (3, 6),
 (3, 135),
 (3, 6231),
 (4, 1),
 (4, 2),
 (4, 3),
 (4, 6),
 (4, 135),
 (4, 6231),
 (6, 1),
 (6, 2),
 (6, 3),
 (6, 4),
 (6, 135),
 (6, 6231),
 (135, 1),
 (135, 2),
 (135, 3),
 (135, 4),
 (135, 6),
 (135, 6231),
 (6231, 1),
 (6231, 2),
 (6231, 3),
 (6231, 4),
 (6231, 6),
 (6231, 135)]

### Files and the Operating System

In [257]:
path = './crushp.txt'

lines = [line for line in open(path)]
lines

['This is an example text file\n',
 'I am practicing my VIM skills\n',
 'I am practicing my VIM skills\n',
 'I am practicing my VIM skillz\n']

In [258]:
# Can access by line as with an array

lines[0]

'This is an example text file\n'

In [263]:
# Very important! Like C, must remember to close files or there will be memory leaks

path = './crushp.txt'

f = open(path)
f.close()

In [282]:
# Can also use the "with" statement to close a file at the end of the block

path = './crushp.txt'
lines = [];

with open(path) as f:
    lines = [line for line in open(path)]
print(lines)

print(f.closed)

['This is an example text file\n', 'I am practicing my VIM skills\n', 'I am practicing my VIM skills\n', 'I am practicing my VIM skillz\n']
True


In [288]:
# Alternative way to read lines into a file

path = './crushp.txt'
lines = [];

with open(path) as f:
    lines = f.readlines()
print(lines)

print(f.closed)

['This is an example text file\n', 'I am practicing my VIM skills\n', 'I am practicing my VIM skills\n', 'I am practicing my VIM skillz\n']
True


In [287]:
# Use 'read' and 'write' methods just like you would in C

path = './crushp.txt'

f = open(path)
f.read(10)

# Goto a specific character in the file (by integer)
f.seek(10)

# Where are you in the file?
f.tell()

# close the file
f.close()

f.closed

True

## NumPy

#### ndarray objects are the basis of NumPy computing

In [294]:
# Creates a matrix of dimensions x,y with values selected at random from a standard norm dist.

np.random.randn(2,3)

array([[-0.10043225,  0.69225456, -0.5355994 ],
       [-1.17543133, -0.53292317, -0.31573878]])

In [297]:
# Can do scalar or vector operations

data = np.random.randn(2,3)
print(data)
print(data * 10)
print(data + data)

[[-0.63863108 -1.14957662 -1.68275552]
 [-0.44124116  1.74118678  0.88941171]]
[[ -6.38631084 -11.49576625 -16.82755515]
 [ -4.41241159  17.41186784   8.89411708]]
[[-1.27726217 -2.29915325 -3.36551103]
 [-0.88248232  3.48237357  1.77882342]]


In [302]:
# Note, an ndarray is a generic multi-dim container for data that is all the same type

data = np.random.randn(2,3)

print(data.shape)
print(data.dtype)

(2, 3)
float64
