# Advanced Numpy
Miscellaneous Topics (doesn't need to be read linearly)

In [1]:
import numpy as np

### A.1 ndarray Object Internals

ndarray internally consists of the following:
    * A pointer to data - a block of data in RAM or in a memory-mapped file
    * The data type or dtype, describing fixed-size value cells in the array
    * A tuple indicating the array's shape
    * A tuple of strides, integers indicating the number of bytes to "step" in order to advance one element along a dimension

In [3]:
# for example a 10 x 5 array would have shape (10, 5)
np.ones((10, 5)).shape

(10, 5)

### A.2 Advanced Array Manipulation

#### Reshaping Arrays

In [5]:
arr = np.arange(8)
arr

array([0, 1, 2, 3, 4, 5, 6, 7])

In [6]:
arr.reshape((4,2))  # default row major order (order = 'C')

array([[0, 1],
       [2, 3],
       [4, 5],
       [6, 7]])

In [7]:
arr.reshape((4,2)).reshape((2, 4))

array([[0, 1, 2, 3],
       [4, 5, 6, 7]])

In [20]:
# a shape dimension can be -1, in which case the value will be inferred from the data
arr = np.arange(15)
arr.reshape((5, -1))

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11],
       [12, 13, 14]])

In [17]:
# since an array's shape attribute is a tuple, it can be passed to reshape too
other_arr = np.ones((3, 5))
other_arr.shape

(3, 5)

In [18]:
arr.reshape(other_arr.shape)

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14]])

In [21]:
# the opposite operation of reshape for n-dimension is known as flattening or raveling
arr = np.arange(15).reshape((5, 3))
arr

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11],
       [12, 13, 14]])

In [22]:
# ravel does not produce a copy of the values if the values in the result
# were contiguous in the original array
arr.ravel()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

In [23]:
# flatten behaves like ravel but it always returns copy of the data
arr.flatten()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

#### Concatenating and Splitting Arrays

In [25]:
# np.concatenate takes a sequence (tuple, list, etc.) of arrays and joins them in order
# along the input axis
arr1 = np.array([[1, 2, 3], [4, 5, 6]])
arr2 = np.array([[7, 8, 9], [10, 11, 12]])
np.concatenate([arr1, arr2], axis=0)

array([[ 1,  2,  3],
       [ 4,  5,  6],
       [ 7,  8,  9],
       [10, 11, 12]])

In [26]:
np.concatenate([arr1, arr2], axis=1)

array([[ 1,  2,  3,  7,  8,  9],
       [ 4,  5,  6, 10, 11, 12]])

In [28]:
# vstack and hstack do the same and are convenient
np.vstack((arr1, arr2))

array([[ 1,  2,  3],
       [ 4,  5,  6],
       [ 7,  8,  9],
       [10, 11, 12]])

In [29]:
np.hstack((arr1, arr2))

array([[ 1,  2,  3,  7,  8,  9],
       [ 4,  5,  6, 10, 11, 12]])

In [30]:
# split, on the other hand, slices apart an array into multiple arrays along an axis
arr = np.random.randn(5, 2)
arr

array([[ 1.22666963,  0.50609973],
       [-1.90598359,  0.70789374],
       [ 0.59110259, -0.16301987],
       [ 1.39209543, -1.55159112],
       [-0.43884466, -0.08773551]])

In [64]:
first, second, third = np.split(arr, [1, 3])

In [65]:
first

array([[ 1.22666963,  0.50609973]])

In [66]:
second

array([[-1.90598359,  0.70789374],
       [ 0.59110259, -0.16301987]])

In [67]:
third

array([[ 1.39209543, -1.55159112],
       [-0.43884466, -0.08773551]])

In [68]:
# the values [1, 3] indicate the indices at which to split the array into pieces

#### Stacking helpers: r_ and c_
There are two special objects in the numpy namespace, `r_` and `c_`, that make stacking arrays more concise

In [69]:
arr = np.arange(6)
arr1 = arr.reshape((3, 2))
arr2 = np.random.randn(3, 2)
np.r_[arr1, arr2]

array([[ 0.        ,  1.        ],
       [ 2.        ,  3.        ],
       [ 4.        ,  5.        ],
       [-1.48978701,  0.47289782],
       [ 1.04083256,  0.78229564],
       [-0.5152948 , -2.79236818]])

In [70]:
np.c_[np.r_[arr1, arr2], arr]

array([[ 0.        ,  1.        ,  0.        ],
       [ 2.        ,  3.        ,  1.        ],
       [ 4.        ,  5.        ,  2.        ],
       [-1.48978701,  0.47289782,  3.        ],
       [ 1.04083256,  0.78229564,  4.        ],
       [-0.5152948 , -2.79236818,  5.        ]])

In [71]:
np.c_[1:6, -10:-5] #  can also translate slices to arrays

array([[  1, -10],
       [  2,  -9],
       [  3,  -8],
       [  4,  -7],
       [  5,  -6]])

#### Repeating Elements: tile and repeat
Two useful tools for repeating or replicating arrays to produce larger arrays are repeat and tile functions. repeat replicates each element in an array some number of times

In [72]:
arr = np.arange(3)
arr

array([0, 1, 2])

In [73]:
arr.repeat(3)

array([0, 0, 0, 1, 1, 1, 2, 2, 2])

In [74]:
# if you pass an integer, each element is repeated that number of times (as above);
# if you pass an array of integers, each element is repeated a different number of times
arr.repeat([2, 3, 4])

array([0, 0, 1, 1, 1, 2, 2, 2, 2])

In [75]:
# function also applies to multidimensional arrays 
arr = np.random.randn(2 ,2)
arr

array([[-1.30145416,  0.63862624],
       [ 0.01530479, -1.15800281]])

In [76]:
arr.repeat(2, axis=0)  # if no axis is passed the array is flatten (no bueno)

array([[-1.30145416,  0.63862624],
       [-1.30145416,  0.63862624],
       [ 0.01530479, -1.15800281],
       [ 0.01530479, -1.15800281]])

In [78]:
# also, passing an array of integers with a multidimensional array
# repeats a given slice a different number of times
arr.repeat([2, 3], axis=0)

array([[-1.30145416,  0.63862624],
       [-1.30145416,  0.63862624],
       [ 0.01530479, -1.15800281],
       [ 0.01530479, -1.15800281],
       [ 0.01530479, -1.15800281]])

In [79]:
# tile is a shortcut for stacking copies of an array along an axis
# visually it's akin to "laying down tiles"
arr

array([[-1.30145416,  0.63862624],
       [ 0.01530479, -1.15800281]])

In [80]:
np.tile(arr, 2)  # 2nd argument is number of tiles

array([[-1.30145416,  0.63862624, -1.30145416,  0.63862624],
       [ 0.01530479, -1.15800281,  0.01530479, -1.15800281]])

In [81]:
# 2nd argument can also be a tuple indicating the layout of the "tiling"
np.tile(arr, (2, 1))

array([[-1.30145416,  0.63862624],
       [ 0.01530479, -1.15800281],
       [-1.30145416,  0.63862624],
       [ 0.01530479, -1.15800281]])

In [83]:
np.tile(arr, (3,2))

array([[-1.30145416,  0.63862624, -1.30145416,  0.63862624],
       [ 0.01530479, -1.15800281,  0.01530479, -1.15800281],
       [-1.30145416,  0.63862624, -1.30145416,  0.63862624],
       [ 0.01530479, -1.15800281,  0.01530479, -1.15800281],
       [-1.30145416,  0.63862624, -1.30145416,  0.63862624],
       [ 0.01530479, -1.15800281,  0.01530479, -1.15800281]])

#### Fancy Indexing Equivalents: take and put

In [85]:
arr = np.arange(10) * 100
inds = [7, 1, 2, 6]
arr[inds]

array([700, 100, 200, 600])

In [86]:
# these alternative ndarray methods are useful in the special case of only making
# a selection on a single axis
arr.take(inds)

array([700, 100, 200, 600])

In [87]:
arr.put(inds, 42)
arr

array([  0,  42,  42, 300, 400, 500,  42,  42, 800, 900])

In [89]:
# to take along other axes pass the axis keyword
inds = [2, 0, 2 , 1]
arr = np.random.randn(2, 4)
arr

array([[-0.73025075, -0.19281946, -0.70457923, -0.10312545],
       [-1.13087419,  0.25591202, -0.11847633, -1.40313941]])

In [90]:
arr.take(inds, axis=1)

array([[-0.70457923, -0.73025075, -0.70457923, -0.19281946],
       [-0.11847633, -1.13087419, -0.11847633,  0.25591202]])

### A.3 Broadcasting
Broadcasting describes how arithmetic works between arrays of different shapes.

In [93]:
arr = np.arange(5)
arr

array([0, 1, 2, 3, 4])

In [94]:
arr * 4  # simple example -> combining scalar with array

array([ 0,  4,  8, 12, 16])

In [99]:
# demean each column of an array by subtracting the column means
arr = np.random.randn(4, 3)
arr.mean(0)

array([ 0.41790824, -0.33017259,  0.62400616])

In [100]:
demeaned = arr - arr.mean(0)
demeaned

array([[-0.54667542,  2.32233909, -0.64380894],
       [ 1.67458406, -2.22827782,  0.25540491],
       [-1.38773072, -0.48664595, -0.26397741],
       [ 0.25982208,  0.39258468,  0.65238144]])

In [101]:
demeaned.mean(0)

array([ -1.38777878e-17,   1.38777878e-17,   0.00000000e+00])

In [104]:
# function form of the same check: column means of the demeaned array are ~0
np.mean(demeaned, 0)

array([ -1.38777878e-17,   1.38777878e-17,   0.00000000e+00])

In [110]:
arr

array([[-0.12876718,  1.99216651, -0.01980278],
       [ 2.09249229, -2.55845041,  0.87941107],
       [-0.96982248, -0.81681853,  0.36002874],
       [ 0.67773032,  0.06241209,  1.27638759]])

In [123]:
row_means = arr.mean(1)
row_means.shape

(4,)

In [124]:
row_means.reshape((4, 1))


array([[ 0.61453218],
       [ 0.13781765],
       [-0.47553742],
       [ 0.67217667]])

In [128]:
demeaned = arr - row_means.reshape((4, 1))
demeaned.mean(1)

array([  3.70074342e-17,  -3.70074342e-17,  -3.70074342e-17,
         1.11022302e-16])

In [129]:
arr

array([[-0.12876718,  1.99216651, -0.01980278],
       [ 2.09249229, -2.55845041,  0.87941107],
       [-0.96982248, -0.81681853,  0.36002874],
       [ 0.67773032,  0.06241209,  1.27638759]])

#### Broadcasting over Other Axes
Broadcasting rule - the broadcasting dims must be 1 in the smaller array

In [131]:
arr - arr.mean(1).reshape((4, 1))

array([[-0.74329936,  1.37763432, -0.63433496],
       [ 1.95467464, -2.69626806,  0.74159342],
       [-0.49428506, -0.34128111,  0.83556617],
       [ 0.00555365, -0.60976457,  0.60421092]])

In the 3-d case, broadcasting over any of the dims is only a matter of reshaping the data to be shape-compatible. 
A common problem is needing to add a new axis with length 1 specifically for broadcasting purposes. Using reshape is a tedious option, thus numpy offers _np.newaxis_ 

In [132]:
arr = np.zeros((4, 4 ))
arr_3d = arr[:, np.newaxis, :]
arr_3d.shape

(4, 1, 4)

In [145]:
arr_1d = np.random.normal(size=3)
arr_1d[:, np.newaxis]

array([[-1.6709092 ],
       [-0.58537092],
       [-0.39415056]])

In [146]:
arr_1d[np.newaxis, :]

array([[-1.6709092 , -0.58537092, -0.39415056]])

In [147]:
# if we had a 3-d array and want to demean axis 2 we write
arr = np.random.randn(3, 4, 5)
depth_means = arr.mean(2)
depth_means  # 2-d array w/mean of each depth 

array([[ 0.53054661,  0.38167054,  0.35110379, -0.09823849],
       [-0.04013333, -0.2378565 , -0.00347303, -0.39256419],
       [ 0.57815688,  0.27810656,  0.24968356,  0.00372646]])

In [150]:
depth_means.shape

(3, 4)

In [153]:
depth_means

array([[ 0.53054661,  0.38167054,  0.35110379, -0.09823849],
       [-0.04013333, -0.2378565 , -0.00347303, -0.39256419],
       [ 0.57815688,  0.27810656,  0.24968356,  0.00372646]])

In [159]:
# mean values now in axis where broadcasting can be taken
depth_means[:, :, np.newaxis]

array([[[ 0.53054661],
        [ 0.38167054],
        [ 0.35110379],
        [-0.09823849]],

       [[-0.04013333],
        [-0.2378565 ],
        [-0.00347303],
        [-0.39256419]],

       [[ 0.57815688],
        [ 0.27810656],
        [ 0.24968356],
        [ 0.00372646]]])

In [158]:
demeaned = arr - depth_means[:, :, np.newaxis]
demeaned.mean(2)

array([[  2.22044605e-17,   0.00000000e+00,  -9.99200722e-17,
          0.00000000e+00],
       [ -5.55111512e-18,   0.00000000e+00,  -1.66533454e-17,
          4.44089210e-17],
       [  4.44089210e-17,  -8.88178420e-17,   3.33066907e-17,
         -2.22044605e-17]])

In [177]:
# generalization of demeaning over an axis
def demean_axis(arr, axis=0):
    """Subtract the mean along ``axis`` from ``arr``.

    Parameters
    ----------
    arr : ndarray
        Input array of any dimensionality.
    axis : int, default 0
        Axis along which the mean is computed and removed.

    Returns
    -------
    ndarray
        Array with the same shape as ``arr`` whose mean along ``axis``
        is zero up to floating-point error.
    """
    means = arr.mean(axis)
    # this generalizes things like [:, :, np.newaxis] to N-dim:
    # keep every axis as-is, but insert a length-1 axis where the mean
    # was taken so that ``means`` broadcasts against ``arr``
    indexer = [slice(None)] * arr.ndim
    indexer[axis] = np.newaxis
    # the index must be a tuple: NumPy treats a list as an array index,
    # which raises for a list of slices / None
    return arr - means[tuple(indexer)]

#### Setting Array Values by Broadcasting
The same broadcasting rule governing arithmetic operations also applies to setting values via array indexing. A simple case, e.g.

In [179]:
arr = np.zeros((4, 3))
arr[:] = 5
arr

array([[ 5.,  5.,  5.],
       [ 5.,  5.,  5.],
       [ 5.,  5.,  5.],
       [ 5.,  5.,  5.]])

In [184]:
# set every column of the array to a 1-d array of values
col = np.array([1.28, -0.42, 0.44, 1.6])
arr[:] = col[:, np.newaxis]
arr

array([[ 1.28,  1.28,  1.28],
       [-0.42, -0.42, -0.42],
       [ 0.44,  0.44,  0.44],
       [ 1.6 ,  1.6 ,  1.6 ]])

In [187]:
arr[:2] = [[-1.37], [0.509]]
arr

array([[-1.37 , -1.37 , -1.37 ],
       [ 0.509,  0.509,  0.509],
       [ 0.44 ,  0.44 ,  0.44 ],
       [ 1.6  ,  1.6  ,  1.6  ]])

### A.4 Advanced ufunc Usage