# Jumping into numpy

**Bernease Herman**  
**DSSG Summer 2015**  
**July 6, 2015**

### Why numpy over base Python?

In [None]:
# Create a function to multiply by 3 (or any scalar)
def multiply_by_scalar(my_list, my_scalar):
    """Return a new list with every element of my_list multiplied by my_scalar.

    Args:
        my_list: Any iterable of numbers (list, tuple, range, ...).
        my_scalar: The multiplier applied to each element.

    Returns:
        A new list of the scaled values; my_list itself is left untouched.
    """
    # Build a fresh list rather than assigning into my_list: in-place
    # assignment fails on immutable sequences such as Python 3's range, and
    # it makes repeated calls (e.g. under %timeit) compound the scaling.
    return [val * my_scalar for val in my_list]

my_list = list(range(1, 6))
multiply_by_scalar(my_list, 3)

In [None]:
import numpy as np

# Do the same thing using numpy
my_array = np.arange(1, 6)
my_array * 3

In [None]:
my_longer_list = range(1000000)
%timeit multiply_by_scalar(my_longer_list, 4)

In [None]:
my_longer_array = np.arange(1000000)
%timeit my_longer_array * 4

## Arrays

### Creating arrays

In [None]:
# Creating array from Python list
creation_list = [4, 5, 9, 1]
np.array(creation_list)

In [None]:
# Creating array with elements from 0 to 9 (the stop value 10 is excluded)
np.arange(10)

In [None]:
# Creating array that counts by fives from 5 to 45 (the stop value 50 is excluded)
np.arange(5, 50, 5)

In [None]:
# Creating array with 100 evenly spaced points from 0 to 1
np.linspace(0, 1, 100)

# Notice that both bounds are inclusive, unlike np.arange()

In [None]:
# Creating array of zeros with 7 elements
np.zeros(7)

In [None]:
# Creating array of ones with 3 elements
np.ones(3)

In [None]:
# How do we create array of threes with 5 elements?
# Surely np.threes() won't work.
np.zeros(5) + 3

In [None]:
# Notice the dots following each number?
# These indicate that they are all some sort of floating point number.

# What if I wanted an integer?
np.zeros(5).astype(np.int64) + 3
#np.zeros(5, dtype=np.int8) + 3

In [None]:
# There's a dict that contains numpy's scalar types
# (note: np.sctypes was removed in NumPy 2.0, so this only works on NumPy 1.x)
np.sctypes

### Reshaping arrays

In [None]:
# Note that numpy arrays can be multidimensional
arr2d = np.array([[1, 2], [3, 4]])

# Same result if reshape array
arr1d = np.arange(1, 5)
arr1d.reshape(2, 2)
# np.reshape(arr1d, (2, 2))

In [None]:
# Inversely, can flatten
arr2d.ravel()

### Indexing and slicing in numpy

In [None]:
# One dimensional indexing and slicing
my_arr = np.arange(25)

In [None]:
my_arr[:10]

In [None]:
my_arr[-7:]

In [None]:
my_arr[10:20]

In [None]:
# Two dimensional indexing and slicing

# Index using a plain Python (non-numpy) nested list
my_list = [[1, 2, 3, 4, 5], [2, 4, 6, 8, 10], [3, 6, 9, 12, 15]]
my_list[1][2]

In [None]:
my_arr = np.array(my_list)
my_arr[1, 2]

In [None]:
my_arr[1:3,]

In [None]:
my_arr[0:1, -2:]

In [None]:
my_arr[:]

In [None]:
index = np.where(my_arr % 2)
my_arr[index]

In [None]:
my_arr[index] = 0
my_arr

## Matrix
These better match what we think of as a matrix in mathematics. For example, the `*` operator performs matrix multiplication rather than element-wise multiplication. Matrices are always 2D.

In [None]:
# Create 2D array from above
arr2d = np.array([[1, 2], [3, 4]])

# Create matrix similar to arr2d
mx2d = np.matrix([[1, 2], [3, 4]])
# np.matrix(arr2d)

In [None]:
# Let's multiply both the array and the matrix form by another operand
multArr = np.arange(5, 9).reshape(2, 2)
multMx = np.matrix(multArr)

# Build each message as a single string so the print call behaves the same
# under Python 2 (print statement) and Python 3 (print function).
print("Array:\n{0}\n".format(arr2d * multArr))
print("Matrix:\n{0}".format(mx2d * multMx))

In [None]:
# 3D numpy arrays work
arr3d = np.zeros((3, 3, 3))

In [None]:
# 3D numpy matrices raise an error (a matrix must be 2D)
mx3d = np.matrix(arr3d)

# Jumping into pandas

In [None]:
import pandas as pd

## Series
Series in pandas are one-dimensional, dict-like objects that provide us with the operations gained from numpy. Like Python dicts, they allow indexing using any hashable type.

### Constructing a series

In [None]:
# Creating series of 1000 elements with random numbers
my_series = pd.Series(np.random.randn(1000))
my_series

In [None]:
# Creating series of mixed type using list
other_series = pd.Series(["a", 7, (1, 2, 3), {"cat": "meow", "dog": "bark"}])
other_series

In [None]:
# Creating an employee time card series with hours worked with day index
timeCard1 = pd.Series([8, 8, 8, 8, 8], index=["Monday", "Tuesday", "Wednesday", "Thursday", "Friday"])
timeCard1

### Indexing a series

In [None]:
# Notice you can still access by 0 .. n-1 position. Using .iloc makes the
# positional lookup explicit instead of relying on the integer fallback of
# [] on a label-indexed Series.
timeCard1.iloc[2]

In [None]:
# But you can also use the index you've specified
timeCard1["Wednesday"]

In [None]:
# Check if a day is in the series using idiomatic Python (`in`).
# A parenthesized single-argument print runs on both Python 2 and Python 3.
print("Friday" in timeCard1)
print("Sunday" in timeCard1)

In [None]:
# Create another series to hold overtime pay multiplier
WEEKDAYS = ["Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"]
overtimeMultiplier = pd.Series([1.5, 1, 1, 1, 1, 1, 1.5], index=WEEKDAYS)
overtimeMultiplier

### Operations on series

In [None]:
# Hourly wage set to $8/hr
hourlyWage = 8.0

weekGrossPay1 = timeCard1 * overtimeMultiplier * hourlyWage
weekGrossPay1

In [None]:
# Let's get rid of those NaNs
weekGrossPay1.fillna(0)

In [None]:
# Let's find the total gross pay for Employee #1
weekGrossPay1.sum()

## Data frames
Think of pandas data frames as feature-enhanced versions of those in R or record arrays in numpy. These can be considered the combination of multiple Series.

### Constructing a data frame

In [None]:
# Let's create a few more employee time cards
timeCard2 = pd.Series([0, 8, 8, 8, 0, 0, 4], index=WEEKDAYS)
timeCard3 = pd.Series([10, 10, 10, 10], index=["Monday", "Tuesday", "Wednesday", "Thursday"])

# Let's combine to create a data frame
allTimeCards = pd.DataFrame({"Barb": timeCard1,
                             "Susan": timeCard2,
                             "Jack": timeCard3})
allTimeCards

In [None]:
% cd /Users/bernease/talks/2015-07-06-pandas-numpy/

In [None]:
!head -n 5 rivera.csv

In [None]:
pd.read_csv("rivera.csv")

### Summarize or view data frame

In [None]:
# We can look at the dtypes of each column
allTimeCards.dtypes

In [None]:
allTimeCards.describe()

In [None]:
# Show the first and last rows. Each message is built as one string so the
# print call behaves the same under Python 2 and Python 3.
print("Head:\n{0}\n".format(allTimeCards.head()))
print("Tail:\n{0}".format(allTimeCards.tail()))

### Indexing and slicing

In [None]:
# Positional (0 .. n-1) row lookup for Saturday.
# .iloc is the explicit positional indexer; .ix no longer exists in pandas.
allTimeCards.iloc[2]

In [None]:
# Positional lookup of Susan's hours on Friday (row 0).
# Susan's column position is resolved by name rather than hard-coded,
# because the column order of a dict-built DataFrame differs across pandas
# versions. .iloc replaces .ix, which no longer exists in pandas.
allTimeCards.iloc[0, allTimeCards.columns.get_loc("Susan")]

In [None]:
# Of course, you can do the same using the names given
allTimeCards["Susan"]["Friday"]

## Resources to continue learning
### Greg Reda's blog
- http://www.gregreda.com/2013/10/26/intro-to-pandas-data-structures/
- http://www.gregreda.com/2013/10/26/working-with-pandas-dataframes/

### numpy and scipy book on DSS bookshelf