# NumPy basics

In [3]:
# import numpy
import numpy as np

# Build a 1-d array from a plain python list
arr = np.array([10, 20, 30, 40, 50])

# print the array
print(arr)  # [10 20 30 40 50]

# dimensionality and shape
arr.ndim  # 1
arr.shape  # (5,)

# number of elements
len(arr)  # 5

# A list of equal-length lists becomes a 2-dimensional array
arr_2d = np.array([
    [10, 20, 30, 40, 50],
    [100, 200, 300, 400, 500]
])

# print the array
print(arr_2d)
# [[ 10  20  30  40  50]
#  [100 200 300 400 500]]

# dimensionality and shape
arr_2d.ndim  # 2
arr_2d.shape  # (2, 5) -> 2 rows, 5 columns

# len() only counts along the first axis (the rows)
len(arr_2d)  # 2

# total number of elements across all axes
arr_2d.size  # 10

# type of object
type(arr_2d)  # <class 'numpy.ndarray'>

# type of internal data
arr_2d.dtype  # int64 here (NumPy picks its default integer type for python ints)

# There are two basic rules for every numpy array
# 1) every element in the array must be of the same type
# 2) If an array's elements are also arrays, those inner arrays must have the same number of elements as each other
#    In other words, multidimensional arrays must be rectangular and not jagged.

# examples
np.array([1, 2, 3])  # good
np.array([1, 'hello', 3])  # bad: everything is coerced to str -> array(['1', 'hello', '3'], dtype='<U21')
np.array([
    [1, 2, 3, 4],
    [5, 6, 5, 7],
])  # good

# Jagged rows cannot form a 2d array. NumPy >= 1.24 raises ValueError for this input
# unless dtype=object is requested explicitly, so we ask for it to show what you end up with.
np.array([
    [1, 2, 3, 4],
    [5, 6]
], dtype=object)  # bad (results in a 1d array of 2 python lists instead of a 2d array)

[10 20 30 40 50]
[[ 10  20  30  40  50]
 [100 200 300 400 500]]


array([list([1, 2, 3, 4]), list([5, 6])], dtype=object)

In [4]:
# a list of lists of strings gives a 3x2 array of strings
np.array([['a', 'b'], ['c', 'd'], ['e', 'f']])

# np.zeros(): array of the requested shape (here 3x5) filled with 0.0
np.zeros((3, 5))

# np.full(): same idea, but filled with whatever value you pass
np.full((3, 5), 'cat')

# np.arange(): evenly spaced integers; start is inclusive, stop is exclusive -> [1, 2, 3, 4]
np.arange(1, 5)

# np.random.randint(): low is inclusive, high is exclusive, so this draws die rolls in 1..6
np.random.randint(1, 7, size = (2, 3))

array([[2, 4, 1],
       [1, 3, 1]])

In [None]:
# A 1d array with 5 elements to practise indexing on
foo = np.array([10, 20, 30, 40, 50])

# positions are 0-based, exactly like a python list: the ith element lives at foo[i-1]
foo[0]  # 10, 1st element
foo[1]  # 20, 2nd element

# assigning to a position overwrites that element in place
foo[1] = -20  # foo is now [ 10, -20,  30,  40,  50]

# three ways to reach the last element
foo[4]             # 50, hard-coded position
foo[foo.size - 1]  # 50, computed from the element count
foo[-1]            # 50, negative positions count back from the end
foo[-2]            # 40, the 2nd to last element

# a position past the end raises IndexError
#foo[999]  # error

In [6]:
# fancy indexing: pass a list (or integer array) of positions to pull several elements at once
foo[[0, 1, 4]]  # [ 10, -20,  50]
foo[[0, 1] * 4]  # [ 10, -20,  10, -20,  10, -20,  10, -20], positions may repeat
foo[np.zeros(3, dtype = np.int64)]  # [10, 10, 10]

# slicing works like python lists: foo[start index : end index : step]
foo[:2]   # [ 10, -20], everything before index 2
foo[2:]   # [30, 40, 50], everything from index 2 onward
foo[::2]  # [10, 30, 50], every 2nd element starting from the first

# fancy indexing also works on the left-hand side to overwrite several elements
foo[[0, 1, 4]] = [100, 200, 400]  # [100, 200,  30,  40, 400]
foo

In [10]:
# a 3x4 array built from a list of lists
bar = np.array([
    [ 5, 10, 15, 20],
    [25, 30, 35, 40],
    [45, 50, 55, 60],
])

# element at row index 1, column index 2
bar[1, 2]  # 35

# first row as a 1d array, shape (4,)
bar[0]

# first row as a 2d array, shape (1, 4): np.newaxis (an alias of None) adds a length-1 axis
bar[0, np.newaxis]

# last two columns (the 3rd and 4th) of the 2nd and 3rd rows
bar[1:3, [-2, -1]]

# overwrite the top-left element with -1
bar[0, 0] = -1

# overwrite the 2nd row with a copy of the 3rd row
bar[1, :] = bar[2, :]

# pairing row positions with column positions addresses the diagonal; set it to 0
bar[[0, 1, 2], [0, 1, 2]] = 0
bar

array([[ 0, 10, 15, 20],
       [45,  0, 55, 60],
       [45, 50,  0, 60]])

In [11]:
# two 2x2 arrays to do arithmetic with
foo = np.array([
    [4, 3],
    [1, 0],
])
bar = np.array([
    [1, 2],
    [3, 4],
])

# +, -, * and / all work element by element on arrays of the same shape
foo + bar  # addition
foo - bar  # subtraction
foo * bar  # multiplication (elementwise, NOT the matrix product)
foo / bar  # division (always produces floats)

# the @ operator is the one that performs matrix multiplication
foo @ bar

array([[13, 20],
       [ 1,  2]])

In [13]:
# a 4x3 array
bart = np.array([
    [1, 1, 1],
    [2, 2, 2],
    [3, 3, 3],
    [4, 4, 4],
])

# Goal: add 5 to the 1st column, 3 to the 2nd column and 10 to the 3rd column.
# The long way is to spell out a full 4x3 array of offsets.
lisa = np.array([
    [5, 3, 10],
    [5, 3, 10],
    [5, 3, 10],
    [5, 3, 10],
])
bart + lisa

# Broadcasting v1: a single 1x3 row is stretched across all 4 rows
bart + np.array([[5, 3, 10]])

# Broadcasting v2: a plain 3-element vector behaves the same way
bart + np.array([5, 3, 10])

# a scalar is broadcast to every element, which lets us shift and scale arrays
np.array([1, 2, 3]) + 0.5  # [1.5, 2.5, 3.5]
np.array([1, 2, 3]) * -1   # [-1, -2, -3]

# a 4-element vector does not line up with bart's 3 columns
#bart + np.array([0, 0, 0, 0])  # error

## So how does broadcasting work and when can we use it?
## Suppose we want to add or multiply two arrays, A and B
## Moving backwards from the last dimension of each array, we check if their dimensions are compatible
## Dimensions are compatible if they are equal or either of them is 1
## If all of A's dimensions are compatible with B's dimensions, or vice versa, they are compatible arrays

### Examples

# Example 1: 4 vs 1 and 3 vs 3
np.random.seed(1234)
A = np.random.randint(1, 10, size = (3, 4))
B = np.random.randint(1, 10, size = (3, 1))

A.shape  # (3, 4)
B.shape  # (3, 1)
#           ^  ^
#         compatible

# Example 2: 4 vs 1 is fine, but 4 vs 2 is neither equal nor 1
np.random.seed(4321)
A = np.random.randint(1, 10, size = (4, 4))
B = np.random.randint(1, 10, size = (2, 1))

A.shape  # (4, 4)
B.shape  # (2, 1)
#           ^  ^
#         not compatible


# Example 3: 4 vs 1 and 1 vs 2 are both fine; A's leading 3 has nothing to clash with
np.random.seed(1111)
A = np.random.randint(1, 10, size = (3, 1, 4))
B = np.random.randint(1, 10, size = (2, 1))

A.shape  # (3, 1, 4)
B.shape  # (   2, 1)
#           ^  ^  ^
#         compatible

(2, 1)

In [14]:
# two 1d arrays of different lengths
A = np.array([3, 11, 4, 5])
B = np.array([5, 0, 3])

# Subtract every element of B from every element of A.
# A becomes a (1, 4) row and B a (3, 1) column, so broadcasting yields a (3, 4) table.
A[np.newaxis, :] - B[:, np.newaxis]

# np.newaxis is just an alias of None, so this is the same thing
A[None, :] - B[:, None]

array([[-2,  6, -1,  0],
       [ 3, 11,  4,  5],
       [ 0,  8,  1,  2]])

In [17]:
# Suppose we have the following 1d array with 8 elements
foo = np.arange(start = 1, stop = 9)  # [1, 2, 3, 4, 5, 6, 7, 8]

# reshape into a 2x4 array using either the .reshape method of the array object, or the free function np.reshape()
bar = foo.reshape((2,4))
# Pass the array and the target shape positionally: newer NumPy releases renamed the
# `newshape` keyword to `shape` and no longer accept the array as the keyword `a=`,
# so the positional form is the one that works on every version.
bar = np.reshape(foo, (2,4))

# slightly different interfaces: the method also accepts the shape as separate integers
foo.reshape(2,4)  # allowed
#foo.reshape(newshape = (2,4))  # error

## reshape bar from a 2x4 array to a 4x2 array
# C-style order reorders the last axis first
bar.reshape((4,2), order = 'C')  # [[1, 2], [3, 4], [5, 6], [7, 8]]

# Fortran-style order reorders the first axis first
bar.reshape((4,2), order = 'F')  # [[1, 3], [5, 7], [2, 4], [6, 8]]

# matrix transpose of bar
bar.T

# reshape bar into higher dimensions, like a 2x2x2 array
bar.reshape((2,2,2))

# reshape bar into a 2x2x3 array: 8 elements cannot fill 12 slots
#bar.reshape((2,2,3)) # error

# use -1 for exactly one of the new shape's dimensions and numpy will calculate it for you
bar.reshape((-1, 2))

array([[1, 2],
       [3, 4],
       [5, 6],
       [7, 8]])

In [18]:
# 3x3 array of non-negative integers. We want to replace every 3 with 0
foo = np.array([
    [3, 9, 7],
    [2, 0, 3],
    [3, 3, 1],
])

# comparing the array to 3 gives a 3x3 array of booleans (True where the element is 3)
mask = (foo == 3)

# a boolean array of the same shape can index the original: read the matches, then overwrite them
foo[mask]
foo[mask] = 0
foo

# 1d boolean arrays pick out whole rows or whole columns
rows_1_and_3 = np.array([True, False, True])
cols_2_and_3 = np.array([False, True, True])
foo[rows_1_and_3]     # returns rows 1 and 3
foo[:, cols_2_and_3]  # returns cols 2 and 3

## Be careful about using multiple boolean indexes together:
## they are paired up element by element rather than forming a row/column grid
foo[rows_1_and_3, cols_2_and_3]  # equivalent to foo[[0,2], [1,2]]

# --- logical operators ---------------------------------------

# boolean arrays are combined with the bitwise operators
b1 = np.array([False, False, True, True])
b2 = np.array([False, True, False, True])
b1 & b2  # [False, False, False,  True]  # and
b1 | b2  # [False,  True,  True,  True]  # or
b1 ^ b2  # [False,  True,  True, False]  # xor

# negation
~np.array([False, True])

# Examples
names = np.array(["Dennis", "Dee", "Charlie", "Mac", "Frank"])
ages = np.array([43, 44, 43, 42, 74])
genders = np.array(['male', 'female', 'male', 'male', 'male'])

# Who's at least 44?
names[ages >= 44]

# Which males are over 42? (each comparison needs its own parentheses)
names[(genders == "male") & (ages > 42)]

# Who's not a male, or is younger than 43?
names[~(genders == "male") | (ages < 43)]

array(['Dee', 'Mac'], dtype='<U7')

In [20]:
# bot: a 3x4 array of ones with two missing values, at (0, 1) and (2, 2)
bot = np.ones((3, 4))
bot[[0, 2], [1, 2]] = np.nan
bot

# comparing against np.nan is all False, even at the positions holding nan
bot == np.nan

# Be careful: nan is never equal to anything, including itself
np.nan == np.nan  # False
np.nan != np.nan  # True


# np.isnan() is the way to find out which elements of bot are nan
np.isnan(bot)

# nan only exists for floats; an integer array cannot hold it
foo = np.array([1, 2, 3], dtype = np.int64)
#foo[1] = np.nan  # error
foo

array([1, 2, 3], dtype=int64)

In [21]:
# positive and negative infinity
# (the np.NINF alias was removed in NumPy 2.0; -np.inf works on every version)
np.array([np.inf, -np.inf])  # [ inf, -inf]

# more commonly, these values occur when you divide by 0 (NumPy emits a RuntimeWarning)
np.array([-1, 1])/0  # [-inf,  inf]

# behaviors
np.inf * 22      # inf
np.inf + np.inf  # inf
np.inf - np.inf  # nan
np.inf / np.inf  # nan

# positive infinity equals positive infinity and negative infinity equals negative infinity
np.inf == np.inf    # True
-np.inf == -np.inf  # True

# isolate infinite values by checking == positive infinity or == negative infinity
foo = np.array([4.4, np.inf, 1.0, -np.inf, 3.1, np.inf])
foo == np.inf   # [False,  True, False, False, False,  True]
foo == -np.inf  # [False, False, False,  True, False, False]

# Alternatively,
np.isposinf(foo)  # [False,  True, False, False, False,  True]
np.isneginf(foo)  # [False, False, False,  True, False, False]
np.isinf(foo)     # [False,  True, False,  True, False,  True]


array([False,  True, False,  True, False,  True])

In [22]:
# simulate rolling a 6-sided die 3 times (high is exclusive, hence 7)
np.random.randint(1, 7, size=3)  # e.g. [1, 3, 2]; unseeded, so it changes from run to run

# seeding the generator makes the sequence of draws reproducible:
# the same seed followed by the same calls yields the same numbers
np.random.seed(2357)
np.random.randint(1, 7, size=3)  # [2, 4, 1] in the original run
np.random.randint(1, 7, size=3)  # [3, 1, 6] in the original run
np.random.seed(2357)
np.random.randint(1, 7, size=3)  # [2, 4, 1] again
np.random.randint(1, 7, size=3)  # [3, 1, 6] again

# draw three values between 1 and 6 without replacement
np.random.seed(2357)
np.random.choice(np.arange(1, 7), size=3, replace=False)  # [6, 5, 1] in the original run

# draw three values between 1 and 6 without replacement, weighting each value by p
np.random.choice(
    np.arange(1, 7),
    size=3,
    replace=False,
    p=np.array([0.1, 0.1, 0.1, 0.1, 0.3, 0.3]),
)  # [5, 2, 6] in the original run

# choice() is not limited to numbers: draw three values from an array of strings
np.random.choice(
    np.array(['you', 'can', 'use', 'strings', 'too']),
    size=3,
    replace=False,
)  # ['use', 'you', 'can'] in the original run

## Sample rows from a 2d array
foo = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]])

# with replacement: draw random row positions (repeats allowed), then index with them
np.random.seed(1234)
rand_rows = np.random.randint(0, foo.shape[0], size=3)  # [3, 4, 4] in the original run
foo[rand_rows]

# without replacement: choose distinct row positions instead
rand_rows = np.random.choice(
    np.arange(0, foo.shape[0]),
    size=3,
    replace=False,
)  # [4, 2, 3] in the original run
foo[rand_rows]

# row-wise shuffle (returns a shuffled copy, foo itself is untouched)
np.random.permutation(foo)

## uniform() will give you values from a uniform distribution
np.random.uniform(1.0, 2.0, size=(2, 2))

## normal() from a normal distribution
np.random.normal(0.0, 1.0, size=2)

## binomial() from a binomial distribution
np.random.binomial(10, 0.25, size=(3, 2))

## If you want to see more distributions, check out
## https://numpy.org/doc/stable/reference/random/legacy.html


array([[4, 1],
       [0, 2],
       [0, 2]])

In [23]:
# two 1d arrays, each with length 5
foo = np.array([1, 2, 3, 4, 5])
bar = np.array([0, 1, 0, 0, 1])

# Goal: build baz so that where bar is 0 it holds double the matching value of foo,
# and everywhere else it holds half of it. The tempting first attempt is a python loop.
baz = np.zeros(len(foo))
for i in range(len(foo)):
    baz[i] = 2 * foo[i] if bar[i] == 0 else foo[i] / 2

# np.where(condition, value_if_true, value_if_false) does the same thing in one vectorized call
baz = np.where(bar == 0, foo * 2.0, foo / 2.0)

In [24]:
import numpy as np

squee = np.array([
    [5.0, 2.0, 9.0],
    [1.0, 0.0, 2.0],
    [1.0, 7.0, 8.0],
])

# with no axis given, every element is added up
np.sum(squee)  # 35.0

# axis picks the dimension that gets collapsed
np.sum(squee, axis = 0)  # collapse the rows    -> column sums [ 7.,  9., 19.]
np.sum(squee, axis = 1)  # collapse the columns -> row sums    [16.,  3., 16.]

# keepdims=True keeps the collapsed axis around with length 1
np.sum(squee, axis = 0, keepdims=True)  # [[ 7.,  9., 19.]]

# a single nan poisons the whole sum
squee[0, 0] = np.nan
np.sum(squee)  # nan

# skip the nans by masking them out with where=...
np.sum(squee, where = ~np.isnan(squee))  # 30.0

# ...or use the nan-aware variant
np.nansum(squee)  # 30.0

30.0

In [25]:
foo = np.array([
    [np.nan, 4.4],
    [1.0, 3.2],
    [np.nan, np.nan],
    [0.1, np.nan],
])

# rows with at least one nan: any() across the columns of the isnan() table
mask = np.isnan(foo).any(axis = 1)
foo[mask]

# rows made up entirely of nan: same idea with all()
mask = np.isnan(foo).all(axis = 1)
foo[mask]

array([[nan, nan]])

In [27]:
roux = np.zeros((3, 2))
gumbo = np.ones((2, 2))

# three copies of roux glued together along axis 0 (more rows) -> shape (9, 2)
np.concatenate([roux] * 3, axis = 0)

# the same copies glued along axis 1 (more columns) -> shape (3, 6)
np.concatenate([roux] * 3, axis = 1)

# roux on top of gumbo: fine, because both have 2 columns -> shape (5, 2)
np.concatenate((roux, gumbo), axis = 0)

# side by side does not work: roux has 3 rows but gumbo has only 2
#np.concatenate((roux, gumbo), axis = 1)  # error

array([[0., 0.],
       [0., 0.],
       [0., 0.],
       [1., 1.],
       [1., 1.]])

In [29]:
foo = np.array(['a', 'b'])           # shape (2,)
bar = np.array(['c', 'd'])           # shape (2,)
baz = np.array([['e', 'f']])         # shape (1, 2)
bingo = np.array([['g', 'h', 'i']])  # shape (1, 3)
bongo = np.array([
    ['j', 'k'],
    ['l', 'm'],
])                                   # shape (2, 2)

# vstack stacks vertically; 1d inputs are treated as single rows
np.vstack((foo, bar))

# so 1d and 2d inputs can be mixed as long as the column counts agree
np.vstack((foo, bar, baz))

# baz has 2 columns but bingo has 3
#np.vstack((baz, bingo))  # error

# hstack stacks horizontally; two 1d inputs give one longer 1d array
np.hstack((foo, bar))

# 2d inputs need matching row counts (both have 1 row here)
np.hstack((baz, bingo))

# a 1d array cannot be hstacked with a 2d array
#np.hstack((foo, bingo))  # error

# bingo has 1 row but bongo has 2
#np.hstack((bingo, bongo))  # error

# stack joins same-shaped arrays along a brand new axis; axis 0 puts each input in its own row
np.stack((foo, bar), axis = 0)

# axis 1 puts each input in its own column
np.stack((foo, bar), axis = 1)

# two 1d inputs only produce a 2d result, so there is no axis 2 to stack along
#np.stack((foo, bar), axis = 2)  # error

# Use the shortcut axis = -1 to insert the new axis behind the last existing axis
np.stack((foo, bar), axis = -1)

array([['a', 'c'],
       ['b', 'd']], dtype='<U1')

In [30]:
# np.sort() returns a sorted copy in ascending order
foo = np.array([1, 7, 3, 9, 0, 9, 1])
np.sort(foo)  # [0, 1, 1, 3, 7, 9, 9]

# nan values are pushed to the end of the sorted array
bar = np.array([5, np.nan, 3, 11])
np.sort(bar)  # [ 3.,  5., 11., nan]

# two ways to sort in descending order
np.sort(bar)[::-1]  # reverse the sorted array
-np.sort(-bar)      # negate the sorted values of the negated array

# If you need a stable sorting algorithm, set kind = 'stable'
np.sort(np.array([2, 1, 3, 2]), kind='stable')

# sorting a 2d array happens independently along the chosen axis
boo = np.array([
    [10, 55, 12],
    [0, 81, 33],
    [92, 11, 3],
])
np.sort(boo, axis = 0)   # along axis 0: each column is sorted top to bottom
np.sort(boo, axis = 1)   # along axis 1: each row is sorted left to right
np.sort(boo, axis = -1)  # (default) the last axis, which is axis 1 for a 2d array

# reorder the rows of boo by the values in the 1st column, with the row order worked out by hand
boo[[1, 0, 2]]

# argsort() returns the positions that would sort the array
goo = np.array([3, 0, 10, 5])  # [3, 0, 10, 5]
np.argsort(goo)                # [1, 0,  3,  2]
np.sort(goo)                   # [0, 3,  5, 10]

# so it can order a 2d array's rows by any one of its columns
boo[np.argsort(boo[:, 1])]    # sort by column 1 ascending
boo[np.argsort(-boo[:, -1])]  # sort by last column, descending

array([[ 0, 81, 33],
       [10, 55, 12],
       [92, 11,  3]])

In [31]:
foo = np.array(['b', 'b', 'a', 'a', 'c', 'c'])

# the distinct values, returned in sorted order
np.unique(foo)  # ['a', 'b', 'c']

# return_index also reports where each value first appears,
# which lets us put the uniques back into order of first occurrence
uniques, first_positions = np.unique(foo, return_index = True)
uniques[np.argsort(first_positions)]  # ['b', 'a', 'c']

# return_counts reports how many times each value occurs
np.unique(foo, return_counts = True)

(array(['a', 'b', 'c'], dtype='<U1'), array([2, 2, 2], dtype=int64))

In [38]:
## Exercise: given a 10x2 array of floats whose 1st column contains some nan values,
## build a 3rd column that copies column 1 wherever it is not nan and
## falls back on column 2 wherever column 1 is missing.

# Setup: random 10x2 array, with nan written into column 0 at 5 randomly drawn rows
# (the draws may repeat, so up to 5 rows end up with a missing value)
import numpy as np
np.random.seed(123)
foo = np.random.uniform(0.0, 1.0, size = (10, 2))
foo[np.random.randint(0, 10, size = 5), 0] = np.nan
foo = np.round(foo, 2)

# Solution: pick column 1 where column 0 is nan, column 0 otherwise,
# then insert the new values as a column at position 2
col_1 = foo[:, 0]
col_2 = foo[:, 1]
newvals = np.where(np.isnan(col_1), col_2, col_1)
result = np.insert(foo, 2, newvals, axis = 1)

# Or: turn newvals into a 10x1 column and glue it onto the right-hand side
result = np.hstack((foo, newvals[:, np.newaxis]))


a = np.array([
    [1, 1],
    [2, 2],
    [3, 3],
])
a

array([[1, 1],
       [2, 2],
       [3, 3]])

In [36]:
# Exercise: given a 1d array of integers, find the first three values < 10 and replace them with 0.

# Setup
import numpy as np
moo = np.array([0, 15, 32, 11, 5, 5, 24, 99, 81, 3, 45, 9, 41])

# Solution: nonzero() on the boolean mask lists the positions of the True entries in order,
# so the first three of those positions are the elements to overwrite
below_ten = moo < 10
first_three = below_ten.nonzero()[0][:3]
moo[first_three] = 0
moo

array([ 0, 15, 32, 11,  0,  0, 24, 99, 81,  3, 45,  9, 41])

In [34]:
# Exercise: insert 10 random normal values into a 5x5 array of 0s at random locations.

# Setup
import numpy as np
oof = np.zeros((5, 5))

# Solution: draw the values, then draw 10 distinct positions out of the 25 flattened slots
np.random.seed(1234)
vals = np.random.normal(size = 10)
locs = np.random.choice(oof.size, vals.size, replace = False)

# ravel() gives a flattened view of this freshly created (contiguous) array,
# so assigning through it writes straight into oof
oof.ravel()[locs] = vals

np.round(oof, 3)

array([[ 0.   , -0.313,  0.   ,  0.   , -0.721],
       [ 0.   ,  0.   ,  0.   ,  0.471,  0.   ],
       [ 0.   ,  0.   ,  0.   , -0.637,  0.016],
       [ 0.   ,  0.   , -1.191,  0.   ,  0.86 ],
       [ 0.887,  1.433, -2.243,  0.   ,  0.   ]])

In [39]:
# Exercise: given peanut, a 4x5 array of 0s, and butter, a 4-element array of column
# indices (one per row), produce the pattern where each row is 1 from its butter column onward.

# Setup
import numpy as np
peanut = np.zeros((4, 5))
butter = np.array([3, 0, 4, 1])

# Solution: compare every column position (a length-5 row) against each row's start
# column (butter as a 4x1 column); broadcasting yields a 4x5 boolean table, cast to 0/1
col_positions = np.arange(peanut.shape[1])
(col_positions >= butter[:, np.newaxis]).astype('int')

array([[0, 0, 0, 1, 1],
       [1, 1, 1, 1, 1],
       [0, 0, 0, 0, 1],
       [0, 1, 1, 1, 1]])

In [41]:
# Exercise: given an array of integers, one hot encode it into a 2d array.

# Setup

yoyoyo = np.array([3, 1, 0, 1])

# Solution 1: row k of the identity matrix is the one hot vector for class k,
# so indexing the identity with the labels looks up one row per label
num_classes = yoyoyo.max() + 1
result = np.eye(num_classes)[yoyoyo]

# Solution 2: start from all zeros and set one (row, label) cell per row to 1
result = np.zeros((len(yoyoyo), num_classes))
result[np.arange(len(yoyoyo)), yoyoyo] = 1
result


array([[0., 0., 0., 1.],
       [0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.]])

In [42]:
# Exercise: given peanut, a 4x5 array of 0s, and butter, a 4-element array of column
# indices (one per row), produce the pattern where each row is 1 from its butter column onward.

# Setup

peanut = np.zeros((4, 5))
butter = np.array([3, 0, 4, 1])

# Solution: compare every column position (a length-5 row) against each row's start
# column (butter as a 4x1 column); broadcasting yields a 4x5 boolean table, cast to 0/1
col_positions = np.arange(peanut.shape[1])
(col_positions >= butter[:, np.newaxis]).astype('int')

array([[0, 0, 0, 1, 1],
       [1, 1, 1, 1, 1],
       [0, 0, 0, 0, 1],
       [0, 1, 1, 1, 1]])