# Numpy

First, install `numpy` and load it. 

In [2]:
import numpy as np

### NumPy Arrays
#### Overview
- Support vectorized operations, while lists don't
- In the context of high-level languages like Python, Matlab, and R, the term vectorization describes the use of optimized, pre-compiled code written in a low-level language (e.g. C) to perform mathematical operations over a sequence of data
- cannot be changed in size -- have to create a new array or overwrite existing one
- each has one and only one dtype - all items in each array are of that same dtype
- an equivalent numpy array occupies much less space than a python list
- http://www.numpy.org

In [142]:
#Array Slicing: Accessing Subarrays

#syntax
#x[start:stop:step]

#one dimension array
x = np.arange(10)
print(x)
#find first five elements from array x
x[0:5]
x[:5]
#find a middle subarray 
x[3:7]

#find even numbers from x
x[0::2]

#find odd numbers from x
x[1::2]

[0 1 2 3 4 5 6 7 8 9]


array([1, 3, 5, 7, 9])

In [22]:
#Multidimensional subarrays
np.random.seed(0)
x = np.random.randint(20,size=(4,3))

print(x)
#find first two rows, three columns
x[:2]
x[:2,:3]

#second row

x[1:2]
x[1]

#first column
x[:, 0]

#first two columns, 2nd and 4th rows
x[1::2, :2 ]


[[12 15  0]
 [ 3  3  7]
 [ 9 19 18]
 [ 4  6 12]]


array([[3, 3],
       [4, 6]])

In [30]:
#fancy indexing
#pass an array of indices to access multiple array elements at once

rand = np.random.RandomState([42])
x = rand.randint(100, size=10)

#x =np.random.randint()

print(x)
#access three different elements
print([x[3],x[4],x[6]])

#pass the indice to x
ind = [3,4,6]
print(x[ind])

#pass a two-dimension array of indices
ind1 = np.array([3,4,5,7]).reshape((2,2))
print(ind1)

print(x[ind1])

#in a multi-dimension array
y = np.arange(12).reshape((3, 4))
row = np.array([0, 1, 2])
col = np.array([1, 2, 3])
print(y)
print(y[row, col])

[29 81 87 16 94 41 39 51 80 52]
[16, 94, 39]
[16 94 39]
[[3 4]
 [5 7]]
[[16 94]
 [41 51]]
[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]
[ 1  6 11]


In [32]:
#combined indexing
y[2, [2, 1, 3]]

y[1:,[2, 1, 3]]

array([[ 6,  5,  7],
       [10,  9, 11]])

In [40]:
# using fancy indexing in data science projects
#select random points

#create a two-dimension normal distribution
mean = [0, 0]
cov = [[1, 2],
[2, 5]]
x = rand.multivariate_normal(mean, cov, 100)

#x = np.random.normal()
#print(x)
x.shape
#select 20 random points
#create indices:choosing 20 random indices with no repeats
indices = np.random.choice(100, 20, replace = False)
print(indices)

selection = x[indices]
print(selection)
selection.shape


[52 72  5 18 87 85 76 17 93 33 12 77 55 66 45  9 15  7 37  6]
[[-0.39957106  0.6637373 ]
 [ 0.45772101  2.5619046 ]
 [-0.83792802 -1.39129177]
 [ 0.60288567  0.81661868]
 [-0.31025887  0.17568315]
 [ 1.34464683  3.70308842]
 [ 0.07597745  1.53765941]
 [-0.66941676 -1.04150915]
 [-0.4336274  -0.83795207]
 [-1.72187605 -3.75910665]
 [-1.11332208 -2.81319823]
 [-0.29298523 -2.85517884]
 [-0.31335496  0.05983651]
 [-0.91514822 -1.2916243 ]
 [ 1.56357638  4.91349762]
 [-0.58062407 -2.31928368]
 [ 1.29960622  1.54406033]
 [-1.25622051 -3.97365376]
 [-0.35401992  0.69125164]
 [ 0.67691295  0.3136968 ]]


(20, 2)

In [48]:
#plot
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn; seaborn.set() # for plot styling

plt.scatter(x[:, 0],x[:, 1])

<IPython.core.display.Javascript object>

<matplotlib.collections.PathCollection at 0x7f7e411997c0>

In [46]:
%matplotlib notebook
plt.scatter(x[:, 0],x[:, 1], alpha = 0.3)
plt.scatter(selection[:,0],selection[:,1], c ='blue',s = 50, alpha = 0.4)

<IPython.core.display.Javascript object>

<matplotlib.collections.PathCollection at 0x7f7e3380b4f0>

In [None]:
#common vectorized computation methods
array1 + array2
array1 * array2
np.dot(array1, array2)
np.mean
np.max(array, axis = None) # or '0' for column-wise, or '1' for row-wise
np.maximum(array1, array2)
np.min(array, axis = None) # or '0' for column-wise, or '1' for row-wise
np.minimum(array1, array2)
np.median
np.cumsum
np.sqrt
np.exp
np.add
np.subtract

In [7]:
array1 = np.arange(12).reshape((4,3))
array2 = np.random.randint(1,20,(3,4))

np.dot(array1, array2)
#np.mean(array1)
#print(array1)
#np.max(array1,axis=1)
#np.min(array1, axis = 0)

array([[ 29,  16,  42,   9],
       [149,  97, 147,  36],
       [269, 178, 252,  63],
       [389, 259, 357,  90]])

### Broadcasting
a set of rules for applying binary ufuncs (addition, subtraction, multiplication, etc.) on arrays of different sizes.

- Rule 1: If the two arrays differ in their number of dimensions, the shape of the one with fewer dimensions is padded with ones on its leading (left) side.
- Rule 2: If the shape of the two arrays does not match in any dimension, the array with shape equal to 1 in that dimension is stretched to match the other shape.
- Rule 3: If in any dimension the sizes disagree and neither is equal to 1, an error is raised.

In [6]:
#Broadcasting
a1 = np.arange(3)
a2 = np.ones((3,3),dtype='int')
a3 = np.ones((3,2),dtype='int')

print(a1.shape,'\n',a2.shape,'\n',a3.shape)
a1 + 5

print(a3)
#a3 + a1


(3,) 
 (3, 3) 
 (3, 2)
[[1 1]
 [1 1]
 [1 1]]


In [3]:
x = np.arange(1, 6)
y = np.arange(2, 7)

#np.add(x,y)

#reduce: repeatedly applies a given operation to the elements of an array until only a single result remains.
np.add.reduce(x)
np.multiply.reduce(x)

#store all the intermediate results of the computation
np.add.accumulate(x)

#create a multiplication table
#np.outer : computer outer product of two vectors


array([ 1,  3,  6, 10, 15])

In [12]:
#Working with Boolean Arrays
np.random.seed(1)
x = np.random.randint(1,100,(3,4))

print(x)
x < 6
# how many values less than 6?
np.count_nonzero(x < 6)
np.sum(x < 6)

# how many values less than 6 in each row?
np.sum(x < 6, axis = 1)

# are there any values greater than 8?
np.any(x > 8)

# are all values in each row less than 8?
np.all(x > 8, axis = 1)

# what values are greater than 8?
x[x > 8]


[[38 13 73 10]
 [76  6 80 65]
 [17  2 77 72]]


array([38, 13, 73, 10, 76, 80, 65, 17, 77, 72])

In [15]:
#example

#president_heights.csv

#https://github.com/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/data/president_heights.csv

import pandas as pd
url = 'https://raw.githubusercontent.com/jakevdp/PythonDataScienceHandbook/master/notebooks/data/president_heights.csv'
df = pd.read_csv(url,index_col = 0)
df

Unnamed: 0_level_0,name,height(cm)
order,Unnamed: 1_level_1,Unnamed: 2_level_1
1,George Washington,189
2,John Adams,170
3,Thomas Jefferson,189
4,James Madison,163
5,James Monroe,183
6,John Quincy Adams,171
7,Andrew Jackson,185
8,Martin Van Buren,168
9,William Henry Harrison,173
10,John Tyler,183


In [19]:
height = np.array(df['height(cm)'])

array([189, 170, 189, 163, 183, 171, 185, 168, 173, 183, 173, 173, 175,
       178, 183, 193, 178, 173, 174, 183, 183, 168, 170, 178, 182, 180,
       183, 178, 182, 188, 175, 179, 183, 193, 182, 183, 177, 185, 188,
       188, 182, 185])

In [21]:
height = np.array(df['height(cm)'])

#check basic statistics
print("Mean height: ", height.mean())
print("Standard deviation:", height.std() )
print("Minimum height: ", height.min())
print("Maximum height: ", height.max())

Mean height:  179.73809523809524
Standard deviation: 6.931843442745892
Minimum height:  163
Maximum height:  193


In [30]:
#find heights greater than 165cm and less than 185cm
print(height[ (height > 165) & (height < 185)])

[170 183 171 168 173 183 173 173 175 178 183 178 173 174 183 183 168 170
 178 182 180 183 178 182 175 179 183 182 183 177 182]
[170 163 183 171 168 173 183 173 173 175 178 183 178 173 174 183 183 168
 170 178 182 180 183 178 182 175 179 183 182 183 177 182]


In [35]:
#sorting
np.random.seed(1)
x = np.random.randint(1,100,(3,4))

print(x)
#np.sort()
#sort arrays without modifying the input
np.sort(x)
np.sort(x, axis = 0)

#np.argsort(): returns the indices of the sorted elements
type(np.argsort(x))


[[38 13 73 10]
 [76  6 80 65]
 [17  2 77 72]]


numpy.ndarray

In [45]:
#Numpy structured arrays
#provide efficient storage for compound, heterogeneous data

name = ['Alice', 'Bob', 'Cathy', 'Doug']
age = [25, 45, 37, 19]
weight = [55.0, 85.5, 68.0, 61.5]

# Use a compound data type for structured arrays
data = np.zeros(4, dtype={'names':('name', 'age', 'weight'),
'formats':('U10', 'int', 'float')})

data['name'] = name
data['age'] = age
data['weight'] = weight
data

# Get all names
data['name']

# Get first row of data
data[0]

# Get the name from the last row


# Get names where age is under 30


('Alice', 25, 55.)