# PART II  NumPy, Pandas and Matplotlib

# Chapter 7 - NumPy Programming
## <<  Machine Learning with Python  >>      
###  C. Alex Hu  --  modified @ 2018/04/12  (1st version @ 2018/02/21)  

## <  CONTENTS >

[Intro to NumPy](#Intro to NumPy)
 
- [0. Import Library](#Import Library)
- [1. Creating arrays (產生陣列)](#Creating arrays)
- [2. 陣列檢查](#陣列檢查)
- [3. 陣列轉換 (Reshaping)](#Reshaping)
- [4. 堆疊陣列 (Stack Arrays)](#Stack Arrays)
- [5. 陣列元素選取 與 切片 (Slicing)](#Slicing)
- [6. Fancy indexing](#Fancy indexing)
- [7. Vectorized operations (向量化運算)](#Vectorized operations)
- [8. Broadcasting](#Broadcasting)

<a id='Intro to NumPy'></a>
# Intro to NumPy 
###  -- Python 標準程式庫的延伸，用於 Array (陣列) 與 Matrix (矩陣) 運算
####         

<a id='Import Library'></a>
## 0. Import Library

In [2]:
import numpy as np

<a id='Creating arrays'></a>
##   1.  Creating arrays  (產生陣列)
###   從 列表(lists) 產生陣列       
###   [ 注意 ]: 所有陣列元素的資料型別必須相同 (所以，會進行型別的強制轉換)

In [3]:
data1 = [1, 2, 3, 4, 5]                   # plain Python list
arr1 = np.array(data1)                    # 1d array built from the list
arr1
# type(arr1)                              # uncomment to check the type (numpy.ndarray)

array([1, 2, 3, 4, 5])

In [18]:
data2 = [range(1, 5), range(5, 9)]        # list of two range objects (one per row)
arr2 = np.array(data2)                    # 2d array: each range becomes one row
arr2

array([[1, 2, 3, 4],
       [5, 6, 7, 8]])

In [4]:
arr2.tolist()       # convert the array back to a (nested) Python list

[[1, 2, 3, 4], [5, 6, 7, 8]]

In [5]:
np.zeros((3, 6))

array([[0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.]])

In [4]:
np.ones(10)

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [6]:
np.linspace(0, 1, 5)  # 0 to 1 (inclusive) with 5 points; default = 50 points

array([0.  , 0.25, 0.5 , 0.75, 1.  ])

In [8]:
np.logspace(0, 3, 8)         # 8 logarithmically spaced points from 10^0 to 10^3 (inclusive)

array([   1.        ,    2.6826958 ,    7.19685673,   19.30697729,
         51.79474679,  138.94954944,  372.75937203, 1000.        ])

In [9]:
int_array = np.arange(5)     #  arange is used like range, but it returns an array (ndarray), not a list / range object
int_array

array([0, 1, 2, 3, 4])

In [10]:
float_array = int_array.astype(float)
float_array

array([0., 1., 2., 3., 4.])

<a id='陣列檢查'></a>
## 2. 陣列檢查

In [11]:
int_array.dtype      # int64 in the output below (the original note said int32: the default integer width depends on the platform / NumPy build)

dtype('int64')

In [12]:
float_array.dtype    # float64

dtype('float64')

In [13]:
arr2.ndim            # 2

2

In [14]:
arr2.shape           # (2, 4) - axis 0 is rows, axis 1 is columns

(2, 4)

In [15]:
arr2.size            # 8 - total number of elements

8

In [16]:
len(arr2)            # 2 - size of first dimension (aka axis)

2

<a id='Reshaping'></a>
## 3. 陣列轉換 (Reshaping)

In [13]:
arr = np.arange(10, dtype=float).reshape((2, 5))
arr

array([[0., 1., 2., 3., 4.],
       [5., 6., 7., 8., 9.]])

In [15]:
print(arr.shape)
print(arr.reshape(5, 2))

(2, 5)
[[0. 1.]
 [2. 3.]
 [4. 5.]
 [6. 7.]
 [8. 9.]]


In [16]:
a = np.array([0, 1, 2, 3])  #  Adding a new axis...
a_col = a[:, np.newaxis]    #  shape (4,) -> (4, 1) column; same as a_col = a[:, None]
a_col

array([[0],
       [1],
       [2],
       [3]])

In [17]:
print(a_col.T)

[[0 1 2 3]]


In [19]:
print(arr2.T)

[[1 5]
 [2 6]
 [3 7]
 [4 8]]


In [20]:
arr

array([[0., 1., 2., 3., 4.],
       [5., 6., 7., 8., 9.]])

In [22]:
arr_flt = arr.flatten()  # flatten() returns a flat copy of the original array
arr_flt
arr_flt[0] = 33          # modifies the copy only ...
print(arr_flt)
print(arr)               # ... so arr is unchanged

[33.  1.  2.  3.  4.  5.  6.  7.  8.  9.]
[[0. 1. 2. 3. 4.]
 [5. 6. 7. 8. 9.]]


In [24]:
arr_flt = arr.ravel()    # ravel() returns a view of the original array whenever possible.
arr_flt
arr_flt[0] = 33          # writes through the view ...
print(arr_flt)
print(arr)               # ... so arr[0, 0] is now 33 as well

[33.  1.  2.  3.  4.  5.  6.  7.  8.  9.]
[[33.  1.  2.  3.  4.]
 [ 5.  6.  7.  8.  9.]]


## Q :  What's the difference between flatten() & ravel() ?
###    [ Hint ] :    arr                #  Run & Check the output of arr
###    

<a id='Stack Arrays'></a>
## 4. 堆疊陣列 (Stack Arrays)

In [25]:
#  Stack two 1d arrays along a new first axis
a = np.array([0, 1])
b = np.array([2, 3])
ab = np.stack((a, b))   # a and b become the rows of a 2x2 array
print(ab)   
# ab

[[0 1]
 [2 3]]


In [26]:
ab = np.stack((a, b)).T
print(ab)

[[0 2]
 [1 3]]


In [27]:
print(np.hstack((a[:, None], b[:, None])))    #  same result as ab = np.stack((a, b)).T

[[0 2]
 [1 3]]


<a id='Slicing'></a>
## 5. 陣列元素選取 與 切片 (Slicing)

In [26]:
arr = np.arange(10, dtype=float).reshape((2, 5))
arr

array([[0., 1., 2., 3., 4.],
       [5., 6., 7., 8., 9.]])

In [27]:
arr[0]                     # row 0 (indexing the first axis works like indexing a list of rows)

array([0., 1., 2., 3., 4.])

In [28]:
arr[0, 3]                  # row 0, column 3: returns 3.0

3.0

In [29]:
arr[0][3]                  # alternative syntax

3.0

##  Slicing 語法 : start:stop:step    ( 預設值 :   start = 0 , stop = end of the axis (last element included) , step = 1 )

In [30]:
arr[0, :] # row 0: returns 1d array ([0., 1., 2., 3., 4.])

array([0., 1., 2., 3., 4.])

In [31]:
arr[:, 0] # column 0: returns 1d array ([0., 5.])

array([0., 5.])

In [34]:
arr[:, :2] # columns strictly before index 2 (2 first columns)

array([[0., 1.],
       [5., 6.]])

In [32]:
arr[:, 2:] # columns from index 2 (included) to the end

array([[2., 3., 4.],
       [7., 8., 9.]])

In [36]:
arr = np.arange(10, dtype=float).reshape((2, 5))   #  fresh 2x5 array holding 0.0 .. 9.0
print(arr)
# No need to pre-create arr2 (e.g. with np.array([])); the slice below creates it.
arr2 = arr[:, 1:4]   #  columns between index 1 (included) and 4 (excluded)
print(arr2)

[[0. 1. 2. 3. 4.]
 [5. 6. 7. 8. 9.]]
[[1. 2. 3.]
 [6. 7. 8.]]


In [37]:
arr2[0, 0] = 33   #   Slicing returns a view (not a copy)
print(arr2)
print(arr)

[[33.  2.  3.]
 [ 6.  7.  8.]]
[[ 0. 33.  2.  3.  4.]
 [ 5.  6.  7.  8.  9.]]


In [38]:
print(arr[0, ::-1])    #  reverse order

[ 4.  3.  2. 33.  0.]


<a id='Fancy indexing'></a>
## 6. Fancy indexing 
###  -  passing an array of indices to access multiple array elements at once. 
###  -  Fancy indexing returns a copy not a view.

In [34]:
# Integer array indexing...
arr2 = arr[:, [1,2,3]] # return a copy
print(arr2)
arr2[0, 0] = 44
print(arr2)
print(arr)

[[1. 2. 3.]
 [6. 7. 8.]]
[[44.  2.  3.]
 [ 6.  7.  8.]]
[[0. 1. 2. 3. 4.]
 [5. 6. 7. 8. 9.]]


In [36]:
# Boolean array indexing...
arr2 = arr[arr > 5] # return a copy
print(arr2)
arr2[0] = 44
print(arr2)
print(arr)

[6. 7. 8. 9.]
[44.  7.  8.  9.]
[[0. 1. 2. 3. 4.]
 [5. 6. 7. 8. 9.]]


In [37]:
arr[arr > 5] = 0
print(arr)

[[0. 1. 2. 3. 4.]
 [5. 0. 0. 0. 0.]]


In [38]:
names = np.array(['Bob', 'Joe', 'Will', 'Bob'])
names == 'Bob' # returns a boolean array

array([ True, False, False,  True])

In [39]:
names[names != 'Bob']    # boolean-mask selection :  array(['Joe', 'Will'], dtype='<U4')
##  dtype='<U4' means a little-endian Unicode string of at most 4 characters;
##  ref. to https://docs.scipy.org/doc/numpy-1.13.0/user/basics.rec.html

array(['Joe', 'Will'], dtype='<U4')

In [40]:
(names == 'Bob') | (names == 'Will') # keywords "and/or" don't work with boolean arrays; use the element-wise operators & and | (with parentheses)

array([ True, False,  True,  True])

In [42]:
names[names != 'Bob'] = 'Joe' # assign based on a logical selection
np.unique(names) # set function

array(['Bob', 'Joe'], dtype='<U4')

<a id='Vectorized operations'></a>
## 7. Vectorized operations (向量化運算)

In [47]:
import numpy as np
import time

# Benchmark: dot product of two random vectors, computed once with the
# vectorized np.dot and once with an explicit Python-level loop.
n_samples = 1000000
a = np.random.rand(n_samples)
b = np.random.rand(n_samples)

# time.perf_counter() is the high-resolution clock meant for measuring short
# durations; time.time() is a wall clock that can be too coarse for
# millisecond-scale timings.
t1 = time.perf_counter()
c = np.dot(a, b)
t2 = time.perf_counter()
print(c)
print('Vectorization : ' + str(1000*(t2-t1)) + ' msec')

t1 = time.perf_counter()
c = 0
# Deliberately slow element-by-element accumulation, for comparison only.
for a_i, b_i in zip(a, b):
    c += a_i * b_i
t2 = time.perf_counter()
print(c)
print('For-loop : ' + str(1000*(t2-t1)) + ' msec')

250174.8358738987
Vectorization : 1.0001659393310547 msec
250174.83587388933
For-loop : 668.0212020874023 msec


In [49]:
nums = np.arange(5)
nums * 10 # multiply each element by 10

array([ 0, 10, 20, 30, 40])

In [50]:
nums = np.sqrt(nums) # square root of each element
nums

array([0.        , 1.        , 1.41421356, 1.73205081, 2.        ])

In [52]:
np.ceil(nums) # also floor, rint (round to nearest int)

array([0., 1., 2., 2., 2.])

In [53]:
np.isnan(nums) # checks for NaN

array([False, False, False, False, False])

In [54]:
nums + np.arange(5) # add element-wise

array([0.        , 2.        , 3.41421356, 4.73205081, 6.        ])

In [55]:
np.maximum(nums, np.array([1, -2, 3, -4, 5])) # compare element-wise

array([1.        , 1.        , 3.        , 1.73205081, 5.        ])

In [56]:
# Euclidean (L2) distance between two random 10-element vectors:
# square root of the sum of the squared component-wise differences.
vec1, vec2 = np.random.randn(10), np.random.randn(10)
delta = vec1 - vec2
dist = np.sqrt((delta ** 2).sum())
print('vec1 = \n', vec1)
print('vec2 = \n', vec2)
print('\n dist = ', dist)

vec1 = 
 [-0.00814506  0.97339843  0.34427034 -0.17269653  0.07980665  1.37933797
 -0.12489372 -0.87848399 -0.49736899  0.65721929]
vec2 = 
 [-1.87413471 -0.60974754  1.12023621 -1.53021271 -1.0629231  -0.71155363
  2.14198212  1.01892023 -0.07881979 -1.42278792]

 dist =  5.2298549690341005


In [57]:
# math and stats
rnd = np.random.randn(4, 2) # random normals in 4x2 array
rnd

array([[ 0.05925295, -1.93629537],
       [-0.1281604 ,  0.38920798],
       [-0.16941869, -0.03855922],
       [ 1.2943117 , -0.01675097]])

In [58]:
rnd.mean()

-0.06830150361612652

In [59]:
rnd.std()

0.8358416964700472

In [60]:
rnd.argmin()  # Return the index of the minimum element, counted in the flattened (row-major) array

1

In [61]:
rnd.sum()

-0.5464120289290122

In [62]:
rnd.sum(axis=0) # sum down each column (collapses the rows): one value per column

array([ 1.05598556, -1.60239759])

In [63]:
rnd.sum(axis=1) # sum across each row (collapses the columns): one value per row

array([-1.87704243,  0.26104758, -0.20797791,  1.27756073])

In [64]:
# methods for boolean arrays
(rnd > 0).sum() # counts number of positive values

3

In [65]:
(rnd > 0).any() # checks if any value is True

True

In [66]:
(rnd > 0).all() # checks if all values are True

False

In [71]:
# random numbers
np.random.seed(1223) # Set the seed
np.random.rand(2, 3) # 2 x 3 matrix in [0, 1)

array([[0.5290768 , 0.16335436, 0.991743  ],
       [0.02575707, 0.64526444, 0.13992716]])

In [70]:
np.random.randn(10) # random normals (mean 0, sd 1)

array([ 0.59516425, -0.40590292, -0.93641996, -0.11051006,  0.56014653,
        0.80933459,  0.95579937, -0.42187951,  0.3469358 , -0.20012745])

In [78]:
np.random.randint(0, 2, 10) # 10 randomly picked 0 or 1 (the upper bound 2 is exclusive)

array([0, 1, 0, 1, 0, 0, 1, 0, 1, 0])

<a id='Broadcasting'></a>
## 8. Broadcasting
-  https://docs.scipy.org/doc/numpy-1.13.0/user/basics.broadcasting.html   
- The term broadcasting describes how numpy treats arrays with different shapes during arithmetic operations. 
- Subject to certain constraints, the smaller array is “broadcast” across the larger array so that they have compatible shapes. 
- Broadcasting provides a means of vectorizing array operations so that looping occurs in C instead of Python. It does this without making needless copies of data and usually leads to efficient algorithm implementations. 
- There are, however, cases where broadcasting is a bad idea because it leads to inefficient use of memory that slows computation.

- NumPy operations are usually done on pairs of arrays on an element-by-element basis. In the simplest case, the two arrays must have exactly the same shape, as in the following example:

In [66]:
a = np.array([1.0, 2.0, 3.0])
b = np.array([2.0, 2.0, 2.0])
a * b

array([2., 4., 6.])

In [67]:
a = np.array([1.0, 2.0, 3.0])
b = 2.0
a * b

array([2., 4., 6.])

## [ Rules of Broadcasting ]
### Starting with the trailing (rightmost) axis and working backward, NumPy compares the dimensions of the two arrays pairwise.
### • If the two dimensions are equal, NumPy moves on to the next axis.
### • If one of the operands has size 1 along that axis, it is stretched to match the size of the other operand.
### • If the two dimensions differ and neither of them is 1, the arrays are incompatible and a ValueError is raised.
### • When one of the shapes runs out of dimensions (because it has fewer dimensions than the other shape), NumPy uses 1 in the comparison until the other shape’s dimensions run out as well.
###    

In [68]:
a = np.array([[ 0, 0, 0],
              [10, 10, 10],
              [20, 20, 20],
              [30, 30, 30]])
print('a =\n', a)
np.shape(a)

a =
 [[ 0  0  0]
 [10 10 10]
 [20 20 20]
 [30 30 30]]


(4, 3)

In [69]:
b = np.array([0, 1, 2])
print('b =\n', b)
np.shape(b)

b =
 [0 1 2]


(3,)

In [70]:
print(a + b)
np.shape(a+b)

[[ 0  1  2]
 [10 11 12]
 [20 21 22]
 [30 31 32]]


(4, 3)

In [71]:
print(a.T)
np.shape(a.T)

[[ 0 10 20 30]
 [ 0 10 20 30]
 [ 0 10 20 30]]


(3, 4)

In [72]:
# Expected to fail: shapes (3, 4) and (3,) are compared from the trailing axis
# (4 vs 3); the sizes differ and neither is 1, so NumPy raises ValueError.
print(a.T + b)
np.shape(a.T+b)

ValueError: operands could not be broadcast together with shapes (3,4) (3,) 

In [73]:
c = np.array([[[ 0, 1, 2], [10, 11, 12], [20, 21, 22], [30, 31, 32]],
            [[ 40, 41, 42], [50, 51, 52], [60, 61, 62], [70, 71, 72]]])
print('c =\n', c)
np.shape(c)

c =
 [[[ 0  1  2]
  [10 11 12]
  [20 21 22]
  [30 31 32]]

 [[40 41 42]
  [50 51 52]
  [60 61 62]
  [70 71 72]]]


(2, 4, 3)

In [74]:
print(c + a)
np.shape(c+a)

[[[  0   1   2]
  [ 20  21  22]
  [ 40  41  42]
  [ 60  61  62]]

 [[ 40  41  42]
  [ 60  61  62]
  [ 80  81  82]
  [100 101 102]]]


(2, 4, 3)

In [75]:
print(c + b)
np.shape(c+b)

[[[ 0  2  4]
  [10 12 14]
  [20 22 24]
  [30 32 34]]

 [[40 42 44]
  [50 52 54]
  [60 62 64]
  [70 72 74]]]


(2, 4, 3)