NumPy is one of the two most important libraries in Python for data science, along with pandas. NumPy is a crucial library for effectively loading, storing, and manipulating in-memory data in Python. All these tasks will be at the heart of what you do with data science in Python.



In [None]:
# Import NumPy:
import numpy as np

In [13]:
# NumPy arrays: building arrays from Python lists and with NumPy's constructors.
# Only the last expression of a cell is displayed, so of everything below only
# the result of np.eye(3) shows up in the cell output.
import numpy as np
# Integer array built from a list of ints:
np.array([1, 4, 2, 5, 3]) 

# The list mixes floats, a string and an int. An ndarray has a single dtype,
# so NumPy converts every element to one common type (a string type here).
np.array([3.14, 'pi', 3])

# dtype= sets the element type explicitly instead of letting NumPy infer it:
np.array([1, 2, 3, 4], dtype='float32')

# Multidimensional array: each inner range(i, i + 3) becomes one row (3 x 3 result).

np.array([range(i, i + 3) for i in [2, 4, 6]])

# Create an integer array of length 10 that's filled with zeros:
np.zeros(10, dtype=int)

# Create a 3 x 5 floating-point array that's filled with ones:
np.ones((3, 5), dtype=float)

# Create a 3 x 5 array that's filled with 3.14s. In the shape tuple the first
# number is the number of rows and the second is the number of columns:

np.full((3, 5), 3.14)

# Create a 3 x 3 array of uniformly distributed random values between 0 and 1.
# No seed is set in this cell, so the random arrays differ from run to run.
np.random.random((3, 3))

# Create a 3 x 3 array of normally distributed random values with mean 0 and standard deviation 1:
np.random.normal(0, 1, (3, 3))

# Create a 3 x 3 array of random integers in the interval [0, 10):
np.random.randint(0, 10, (3, 3))

# Create a 3 x 3 identity matrix:
np.eye(3)


array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

In [16]:
# Array attributes: ndim (number of dimensions), shape (size of each dimension),
# size (total number of elements) and dtype (element type).
import numpy as np
np.random.seed(0)  # Seed the global RNG so the three arrays are the same on every run

a1 = np.random.randint(10, size=6)  # One-dimensional array
a2 = np.random.randint(10, size=(3, 4))  # Two-dimensional array
a3 = np.random.randint(10, size=(3, 4, 5))  # Three-dimensional array

# Report all four attributes for the same array (a1). All three arrays come from
# the same randint call with default dtype, so the printed dtype is unaffected.
print("a1 ndim: ", a1.ndim)
print("a1 shape:", a1.shape)
print("a1 size: ", a1.size)
print("dtype:", a1.dtype)


a1 ndim:  1
a1 shape: (6,)
a1 size:  6
dtype: int32


In [28]:
# Slicing arrays:
# a[start:stop:step] -- an omitted part falls back to its default
# (start of the axis, end of the axis, step 1).
# Only the final expression, a2[::-1, ::-1], is displayed in the cell output.
a = np.arange(10)
a
a[:5]  # first five elements
a[5:]  # elements from index 5 onward
a[4:7]  # middle sub-array: indices 4, 5, 6
a[::2]  # every other element, starting at index 0
a[1::2]  # every other element, starting at index 1
a[5::-2]  # negative step: walk backwards from index 5
a2  # not defined in this cell; must already exist in the kernel as a 2-D array
a2[:2, :3]  # first two rows, first three columns
a2[:3, ::2]  # first three rows, every other column
a2[::-1, ::-1]  # both axes reversed

array([[7, 7, 6, 1],
       [8, 8, 6, 7],
       [4, 2, 5, 3]])

In [32]:
# Accessing array rows and columns: combine an index with an empty slice (:).
# a2 and a3 are not defined in this cell; they must already exist in the kernel.
print(a2[:, 0])  # first column of a2
print(a2[0, :])  # first row of a2
print(a2[0])  # Equivalent to a2[0, :]
a3[:,:,2]  # index 2 along the last axis, for every position of the first two axes

[3 7 1]
[3 5 2 4]
[3 5 2 4]


array([[5, 3, 2, 3],
       [9, 3, 0, 8],
       [8, 9, 0, 4]])

In [35]:
# Copying arrays:
# .copy() gives the 2 x 2 sub-array its own data, so the assignment below
# changes only the copy; the final print shows a2 still holding its old values.
a2_sub_copy = a2[:2, :2].copy()
print(a2_sub_copy)

a2_sub_copy[0, 0] = 42  # modify the copy only
print(a2_sub_copy)

print(a2)  # the source array is unchanged

[[3 5]
 [7 6]]
[[42  5]
 [ 7  6]]
[[3 5 2 4]
 [7 6 8 8]
 [1 6 7 7]]


In [40]:
# Reshaping arrays: the same elements laid out under a new shape.
# The numbers 1..9 arranged as a 3 x 3 grid.
grid = np.reshape(np.arange(1, 10), (3, 3))
print(grid)

a = np.array((1, 2, 3))

# Row vector (shape (1, 3)), first through reshape, then through np.newaxis:
a.reshape(1, 3)

a[np.newaxis, :]

# Column vector (shape (3, 1)), first through reshape, then through np.newaxis:
a.reshape(3, 1)

a[:, np.newaxis]

[[1 2 3]
 [4 5 6]
 [7 8 9]]


array([[1],
       [2],
       [3]])

In [45]:
# Joining arrays (this cell only joins; splitting is shown in the next cell):

a = np.array([1, 2, 3])
b = np.array([3, 2, 1])
np.concatenate([a, b])  # join two 1-D arrays end to end

c = [99, 99, 99]  # a plain Python list can be joined with arrays as well
print(np.concatenate([a, b, c]))  # more than two inputs at once

grid = np.array([[1, 2, 3],
                 [4, 5, 6]])

np.concatenate([grid, grid])  # default axis=0: stack the rows (4 x 3 result)

np.concatenate([grid, grid], axis=1)  # axis=1: join side by side (2 x 6 result)

[ 1  2  3  3  2  1 99 99 99]


array([[1, 2, 3, 1, 2, 3],
       [4, 5, 6, 4, 5, 6]])

In [49]:
# Splitting arrays:
# np.split cuts at the given indices: [3, 5] yields a[:3], a[3:5] and a[5:].
a = [1, 2, 3, 99, 99, 3, 2, 1]
# The pieces get their own names. a1/a2/a3 are the random 1-D/2-D/3-D arrays
# created in the "Array Attributes" cell and read by the slicing, row/column
# and copying cells; rebinding them here would make those cells fail or
# change their output when re-run.
head, middle, tail = np.split(a, [3, 5])
print(head, middle, tail)

grid = np.arange(16).reshape((4, 4))
grid

np.split(grid, [1, 2])  # default axis=0: cut between rows

np.split(grid, [1, 2], axis=1)  # axis=1: cut between columns

[1 2 3] [99 99] [3 2 1]


[array([[ 0],
        [ 4],
        [ 8],
        [12]]),
 array([[ 1],
        [ 5],
        [ 9],
        [13]]),
 array([[ 2,  3],
        [ 6,  7],
        [10, 11],
        [14, 15]])]

In [62]:
# Fancy indexing: index with lists/arrays of indices instead of single scalars.
rand = np.random.RandomState(42)  # generator object with a fixed seed

arr = rand.randint(100, size=10)
print(arr)

[arr[3], arr[7], arr[2]]  # picking three elements one at a time

ind = [3, 7, 4]
arr[ind]  # several elements in a single indexing operation


# The result takes the shape of the index array (2 x 2 here), not the shape of arr:
ind = np.array([[3, 7],
                [4, 5]])
arr[ind]


arr2 = np.arange(12).reshape((3, 4))
arr2

row = np.array([0, 1, 2])
col = np.array([2, 1, 3])
arr2[row, col]  # indices are paired up: arr2[0, 2], arr2[1, 1], arr2[2, 3]

arr2[row[:, np.newaxis], col]  # column vector of rows broadcast against col: 3 x 3 result

row[:, np.newaxis] * col  # the same broadcasting pattern, shown with multiplication

row[:, np.newaxis] * row

col[:, np.newaxis] * row  # last expression: the only value displayed in the cell output

[51 92 14 71 60 20 82 86 74 74]


array([[0, 2, 4],
       [0, 1, 2],
       [0, 3, 6]])

In [65]:
# Combined indexing: fancy indexing mixed with slices and Boolean masks.
# arr2 and row are not defined in this cell; they come from the fancy-indexing cell.
print(arr2)

arr2[1:, [2, 0, 1]]  # slice of rows (1 onward) combined with a list of columns

mask = np.array([1, 0, 1, 0], dtype=bool)  # i.e. True, False, True, False
arr2[row[:, np.newaxis], mask]  # every row in `row`, only the columns where mask is True



[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]


array([[ 0,  2],
       [ 4,  6],
       [ 8, 10]])

In [70]:
# Modifying values by using fancy indexing:
# `values` is the array being modified and `positions` holds the indices.
# Dedicated names are used because the fancy-indexing cell uses `arr` for data
# and `ind` for indices; reusing them here with the roles swapped is confusing
# and overwrites that cell's variables.
values = np.arange(10)
positions = np.array([2, 1, 8, 4])
values[positions] = 99  # assign one value to all selected positions
print(values)

values[positions] -= 10  # any assignment-type operator works the same way
print(values)

values = np.zeros(10)
values[[0, 0]] = [4, 6]  # repeated index: both assignments hit slot 0, the last one (6) remains
print(values)

values = np.zeros(10)
np.add.at(values, positions, 1)  # in-place add of 1 at each listed position
print(values)

np.subtract.at(values, positions, 1)  # undo it: back to all zeros
print(values)

[ 0 99 99  3 99  5  6  7 99  9]
[ 0 89 89  3 89  5  6  7 89  9]
[6. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 1. 1. 0. 1. 0. 0. 0. 1. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [76]:
# Sorting arrays:
a = np.array([2, 1, 4, 3, 5])
np.sort(a)  # returns a sorted copy; a itself is not modified

a.sort()  # sorts a in place
print(a)

a = np.array([2, 1, 4, 3, 5])
b = np.argsort(a)  # indices that would sort a
print(b)

a[b]  # fancy indexing with those indices yields the sorted values

rand = np.random.RandomState(42)  # fixed seed: the same table on every run
table = rand.randint(0, 10, (4, 6))
print(table)

np.sort(table, axis=0)  # sort each column independently

[1 2 3 4 5]
[1 0 3 2 4]
[[6 3 7 4 6 9]
 [2 6 7 4 3 7]
 [7 2 5 4 1 7]
 [5 1 4 0 9 5]]


array([[2, 1, 4, 0, 1, 5],
       [5, 2, 5, 4, 3, 7],
       [6, 3, 7, 4, 6, 7],
       [7, 6, 7, 4, 9, 9]])

In [78]:
# Partial sorting (partitioning):
arr = np.array([7, 2, 3, 1, 6, 5, 4])
np.partition(arr, 3)  # the three smallest values come first; order within each side is arbitrary

# `table` is not defined in this cell; it is the 4 x 6 array built in the sorting cell.
np.partition(table, 2, axis=1)  # within each row, the two smallest values come first

array([[3, 4, 6, 7, 6, 9],
       [2, 3, 4, 7, 6, 7],
       [1, 2, 4, 5, 7, 7],
       [0, 1, 4, 5, 9, 5]])

In [80]:
# Efficient computation on NumPy arrays: universal functions (ufuncs).
# The rest of this cell times a plain Python loop over an array as a baseline.

import numpy as np
np.random.seed(0)  # fixed seed so the random test values are the same on every run

def compute_reciprocals(values):
    """Return the reciprocal (1 / x) of every element of ``values``.

    The work is done one element at a time in a Python loop on purpose:
    this function is the slow baseline that the notebook times against
    the vectorized expression ``1.0 / array``.

    Parameters
    ----------
    values : sequence of numbers
        Must support ``len()`` and iteration in index order.

    Returns
    -------
    numpy.ndarray
        New float array of length ``len(values)`` holding ``1.0 / value``
        for each input element.
    """
    reciprocals = np.empty(len(values))
    for position, value in enumerate(values):
        reciprocals[position] = 1.0 / value
    return reciprocals

# Small check: five random integers drawn from [1, 10), so no element is zero.
values = np.random.randint(1, 10, size=5)
compute_reciprocals(values)

# One million elements make the cost of the per-element Python loop visible.
big_array = np.random.randint(1, 100, size=1000000)
%timeit compute_reciprocals(big_array)

1.44 s ± 43.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [81]:
# Ufuncs: the same reciprocal written as a single vectorized expression on the
# whole array. big_array is the one-million-element array from the previous cell.
%timeit (1.0 / big_array)

3.08 ms ± 246 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [91]:
# Array arithmetic: Python's arithmetic operators work element by element on arrays.
a = np.arange(4)
print("a     =", a)
print("a + 5 =", a + 5)
print("a - 5 =", a - 5)
print("a * 2 =", a * 2)
print("a / 2 =", a / 2)  # True division: float result
print("a // 2 =", a // 2)  # Floor division

print("-a     = ", -a)
print("a ** 2 = ", a ** 2)
print("a % 2  = ", a % 2)

# ** binds tighter than the unary minus, so this is -((0.5*a + 1) ** 2):
-(0.5*a + 1) ** 2

np.add(a, 2)  # function form of a + 2

# Absolute Value:
a = np.array([-2, -1, 0, 1, 2])
abs(a)  # Python's built-in abs works on arrays

np.absolute(a)  # the NumPy ufunc
np.abs(a)  # shorter alias of np.absolute

# Exponents and logarithms:
a = [1, 2, 3]  # a plain list; the NumPy functions below accept it directly
print("a     =", a)
print("e^a   =", np.exp(a))
print("2^a   =", np.exp2(a))
print("3^a   =", np.power(3, a))

a = [1, 2, 4, 10]
print("a        =", a)
print("ln(a)    =", np.log(a))  # natural logarithm
print("log2(a)  =", np.log2(a))  # base-2 logarithm
print("log10(a) =", np.log10(a))  # base-10 logarithm


a     = [0 1 2 3]
a + 5 = [5 6 7 8]
a - 5 = [-5 -4 -3 -2]
a * 2 = [0 2 4 6]
a / 2 = [0.  0.5 1.  1.5]
a // 2 = [0 0 1 1]
-a     =  [ 0 -1 -2 -3]
a ** 2 =  [0 1 4 9]
a % 2  =  [0 1 0 1]
a     = [1, 2, 3]
e^a   = [ 2.71828183  7.3890561  20.08553692]
2^a   = [2. 4. 8.]
3^a   = [ 3  9 27]
a        = [1, 2, 4, 10]
ln(a)    = [0.         0.69314718 1.38629436 2.30258509]
log2(a)  = [0.         1.         2.         3.32192809]
log10(a) = [0.         0.30103    0.60205999 1.        ]


In [92]:
# Specialized ufuncs: scipy.special supplies further mathematical functions
# that operate element by element on array-like input.
from scipy import special

a = [1, 5, 10]
print("gamma(a)     =", special.gamma(a))  # gamma function
print("ln|gamma(a)| =", special.gammaln(a))  # log of the absolute value of gamma
print("beta(a, 2)   =", special.beta(a, 2))  # beta function

gamma(a)     = [1.0000e+00 2.4000e+01 3.6288e+05]
ln|gamma(a)| = [ 0.          3.17805383 12.80182748]
beta(a, 2)   = [0.5        0.03333333 0.00909091]


In [95]:
# Summing the values of an array:
# Note: despite its name, myList is a NumPy array of 100 random floats, not a list.
myList = np.random.random(100)
# Python's built-in sum, then NumPy's own sum, on the same data:
sum(myList)
np.sum(myList)

# Time both versions on one million values.
large_array = np.random.rand(1000000)
%timeit sum(large_array)
%timeit np.sum(large_array)


72 ms ± 3.88 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
1.13 ms ± 121 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [97]:
# Minimum and maximum:
# large_array is not defined in this cell; it comes from the summing cell.
# Function form (not displayed, because it is not the cell's last statement):
np.min(large_array), np.max(large_array)

# Method form of the same aggregates, plus the sum:
print(large_array.min(), large_array.max(), large_array.sum())

7.071203171893359e-07 0.9999997207656334 500213.2396620991


In [100]:
# Multidimensional aggregates:

md = np.random.random((3, 4))  # 3 x 4 array of random floats
print(md)

md.sum()  # no axis given: aggregate over the whole array

md.min(axis=0)  # axis=0 collapses the rows: one minimum per column (4 values)

[[0.02461419 0.88905646 0.51051681 0.93157766]
 [0.72024497 0.16963688 0.48246213 0.34246825]
 [0.59118864 0.10465831 0.01258607 0.74123982]]


array([0.02461419, 0.10465831, 0.01258607, 0.34246825])

In [108]:
# Computation on arrays by using broadcasting:
# Arrays of the same shape are combined element by element.
first_array = np.array([3, 6, 8, 1])
second_array = np.array([4, 5, 7, 2])
first_array + second_array

first_array + 5  # the scalar 5 is applied to every element

# (1) is just the integer 1, not a tuple, so this is np.ones(1): a 1-D array with one element.
one_dim_array = np.ones((1))
one_dim_array

two_dim_array = np.ones((2, 2))
two_dim_array

one_dim_array + two_dim_array  # the single element is stretched across the 2 x 2 array

horizontal_array = np.arange(3)  # shape (3,)
vertical_array = np.arange(3)[:, np.newaxis]  # shape (3, 1)

print(horizontal_array)
print(vertical_array)

horizontal_array + vertical_array  # both operands are stretched: 3 x 3 result

[0 1 2]
[[0]
 [1]
 [2]]


array([[0, 1, 2],
       [1, 2, 3],
       [2, 3, 4]])

In [120]:
# Rules of broadcasting:

# Shapes (2, 3) and (3,): the 1-D array is applied to each of the two rows.
two_dim_array = np.ones((2, 3))
one_dim_array = np.arange(3)

two_dim_array + one_dim_array

# NOTE(review): this (3, 2) array is only displayed; nothing below combines it
# with another array.
two_dim_array = np.ones((3, 2))
two_dim_array

# Shapes (3, 1) and (3,): both operands are stretched, giving a 3 x 3 result.
vertical_array = np.arange(3).reshape((3, 1))
horizontal_array = np.arange(3)

vertical_array + horizontal_array

# Practical use: centering. T has 10 rows and 3 columns.
T = np.random.random((10, 3))
T

Tmean = T.mean(0)  # mean along axis 0: one mean per column
Tmean

T_centered = T - Tmean  # the 3 column means are subtracted from every row
T_centered

array([[ 3.98864045e-01, -2.83863392e-01,  4.34471313e-02],
       [-1.51660845e-01, -3.85529547e-01,  1.75398818e-01],
       [-1.21862907e-01, -2.73706075e-01,  2.22446952e-01],
       [-5.58931896e-01,  4.33692111e-01, -8.91574349e-02],
       [ 1.20837670e-01,  2.73739639e-01, -1.19951342e-01],
       [-1.61932656e-01,  1.10662434e-01, -1.69278155e-01],
       [-4.38008481e-02,  3.13830437e-01, -3.54582935e-04],
       [-3.85545836e-02, -3.22704199e-02,  2.24107463e-02],
       [ 3.34562322e-01, -2.54244881e-01,  2.00460151e-01],
       [ 2.22479699e-01,  9.76896949e-02, -2.85422285e-01]])

In [133]:
# Comparisons, masks, and Boolean logic in NumPy:
# NOTE(review): pd and plt are imported here but not used in the cells visible
# in this part of the notebook.
import numpy as np
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt

# Comparison operators as ufuncs: each comparison is applied element by element
# and produces a Boolean array. Only the last expression of the cell is displayed.
simple_array = np.array([1, 2, 3, 4, 5])
simple_array < 2
simple_array >= 4
simple_array == 2
(2 * simple_array) == (simple_array ** 2)

# The same works on a two-dimensional array (fixed seed, so values repeat on every run).
rand = np.random.RandomState(0)
two_dim_array = rand.randint(10, size=(3, 4))
two_dim_array
two_dim_array < 6

array([[ True,  True,  True,  True],
       [False, False,  True,  True],
       [ True,  True, False, False]])

In [143]:
# Working with Boolean arrays:
# two_dim_array is not defined in this cell; it comes from the comparisons cell.
print(two_dim_array)

# Counting Entries:
np.count_nonzero(two_dim_array < 6)  # how many values are less than 6

np.sum(two_dim_array < 5, axis=1)  # per row: how many values are less than 5

np.any(two_dim_array < 0)  # is any value negative?

np.all(two_dim_array < 10)  # are all values less than 10?

np.all(two_dim_array < 7, axis=1)  # per row: are all values less than 7?



[[5 0 3 3]
 [7 9 3 5]
 [2 4 7 6]]


array([ True, False, False])

In [145]:
# Boolean arrays as masks:
# two_dim_array is not defined in this cell; it comes from the comparisons cell.
two_dim_array
two_dim_array < 5  # Boolean array, True where the value is below 5

array([[False,  True,  True,  True],
       [False, False,  True, False],
       [ True,  True, False, False]])