In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as graph
import seaborn as sns

%load_ext line_profiler
%load_ext memory_profiler

In [2]:
# Instead of nested for loops

for i in range(1000):
    for j in range(1000):
        total = i+j

In [3]:
def sum_of_lists(N):
    total = 0
    for i in range(5):
        L = [j ^ (j >> i) for j in range(N)]
        total += sum(L)
    return total

In [4]:
%memit sum_of_lists(10)

peak memory: 112.75 MiB, increment: 0.18 MiB


# Introduction to Numpy

### Lists

In [6]:
L = list(range(10))
print(L)
print(type(L))
print(f'One item in L is of type {type(L[8])}')

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
<class 'list'>
One item in L is of type <class 'int'>


In [7]:
# List of strings

c = [str(c) for c in L]
print(c)
print(type(c[1]))

['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
<class 'str'>


In [8]:
# List can be heterogenous

L = [1, 1.5, 'one', True]
print([type(c) for c in L])

[<class 'int'>, <class 'float'>, <class 'str'>, <class 'bool'>]


In [9]:
# Python fixed type arrays
import array

L = list(range(10))
a_L = array.array('i', L)
print(L, a_L)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9] array('i', [0, 1, 2, 3, 4, 5, 6, 7, 8, 9])


In [10]:
# Python fixed type arrays

int_array = np.array(range(10))
float_array = np.array([1.5, 2, 3, 4, 5])
cast_float_array = np.array(range(10), dtype=float)

print(int_array)
print(float_array)
print(cast_float_array)

[0 1 2 3 4 5 6 7 8 9]
[1.5 2.  3.  4.  5. ]
[0. 1. 2. 3. 4. 5. 6. 7. 8. 9.]


In [11]:
# nested numpy lists -> multi dimensional

print(np.array([range(i, i+3) for i in [1, 2, 3, 4]]))

[[1 2 3]
 [2 3 4]
 [3 4 5]
 [4 5 6]]


#### Create Arrays from Scratch

In [12]:
# create array of zeros
print(np.zeros(10, dtype=int))

# create array of ones, with dimensions 2 * 5
print(np.ones((2, 5), dtype=float))

# create array of 5, with dimensions 3*5
print(np.full((3, 5), 5))

# Create an array filled with a linear sequence
# Starting at 0, ending at 20, stepping by 2
print(np.arange(0, 21, 2))

# Create an array of five values evenly spaced between 0 and 1
print(np.linspace(0, 1, 5))

# Create a 3x3 array of uniformly distributed
# random values between 0 and 1
print(np.random.random((3, 3)))

# Create a 3x3 array of normally distributed random values
# with mean 0 and standard deviation 1
print(np.random.normal(0, 1, (3, 3)))

# Create a 3x3 array of random integers in the interval [0, 10)
print(np.random.randint(0, 10, (3,3)))

# Create a 3x3 identity matrix
np.eye(3)

# Create an uninitialized array of three integers
# The values will be whatever happens to already exist at that memory location
print(np.empty(4))

[0 0 0 0 0 0 0 0 0 0]
[[1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1.]]
[[5 5 5 5 5]
 [5 5 5 5 5]
 [5 5 5 5 5]]
[ 0  2  4  6  8 10 12 14 16 18 20]
[0.   0.25 0.5  0.75 1.  ]
[[0.33114472 0.60854915 0.84323891]
 [0.1789972  0.10915979 0.24864669]
 [0.43467617 0.70122308 0.60887088]]
[[-0.64566893  0.67805158  0.36495456]
 [ 0.64975784  1.67448538 -0.94365557]
 [-0.06268491  0.76371715 -0.38847591]]
[[3 7 7]
 [1 0 7]
 [0 6 8]]
[0.25 0.5  0.75 1.  ]


## Basics of Numpy Arrays

In [27]:
np.random.seed(0)

x1 = np.random.randint(10, size=6)
x2 = np.random.randint(10, size=(2, 3))
x3 = np.random.randint(10, size=(2, 3, 4, 5))

print(x1)
print(x2)

print(f"x3 ndim: {x3.ndim}")
print(f"x3 shape: {x3.shape}")
print(f"x3 size: {x3.size}")
print(f'x3 dtype: {x3.dtype}')
print(f'x3 itemsize: {x3.itemsize} bytes')
print(f'x3 nbytes: {x3.nbytes} bytes')

[5 0 3 3 7 9]
[[3 5 2]
 [4 7 6]]
x3 ndim: 4
x3 shape: (2, 3, 4, 5)
x3 size: 120
x3 dtype: int64
x3 itemsize: 8 bytes
x3 nbytes: 960 bytes


### Slicing

#### x[start:stop:step]

In [14]:
x2 = np.random.randint(10, size=(2, 3))
print(x2)

# get 2nd row, 3rd column item
print(x2[1, 2])

# change 1st row, 2nd col to 0
x2[0, 1] = 0
print(x2)

[[1 9 8]
 [9 3 8]]
8
[[1 0 8]
 [9 3 8]]


In [15]:
print(x3)
print(x3[:2, :2, :2])

NameError: name 'x3' is not defined

In [16]:
print(x2)
#first row
print(x2[0, :])

#first column
print(x2[:, 0])

[[1 0 8]
 [9 3 8]]
[1 0 8]
[1 9]


#### Copy array and change that, instead of subarray since this will change the original directly

In [17]:
# No copy Views

x2_2 = x2[:2, :2] # make a subarray of 2
print(x2_2)

# x2_2[0, 0] = 4
# print(x2) # The original is changed

# We can simply copy to avoid this change thing

x2_2 = x2[:2, :2].copy()
print(x2_2)

x2_2[0, 0] = 4
print(x2)

[[1 0]
 [9 3]]
[[1 0]
 [9 3]]
[[1 0 8]
 [9 3 8]]


### Reshaping

In [18]:
# Reshaping arrays

x_shaped = np.arange(9).reshape((3,3))
print(x_shaped)

x = np.array([1, 2, 3])
print(x, x.shape)

# row vector via reshape
x.reshape((3, 1))
print(x, x.shape)

[[0 1 2]
 [3 4 5]
 [6 7 8]]
[1 2 3] (3,)
[1 2 3] (3,)


In [19]:
x = np.array([1, 2, 3])
x.shape

(3,)

In [20]:
x.reshape((3, 1))

array([[1],
       [2],
       [3]])

### Concatenate arrays

In [21]:
# Concatenate arrays

x = np.array([1, 2, 3])
y = np.array([3, 2, 1])
z = np.array([1, 5, 6, 7,])
print(x, y ,z)
print(np.concatenate([x, y, z]))

grid = np.array([[1, 2, 3],
                 [4, 5, 6]])
print(grid, grid.shape)
print(np.concatenate([grid, grid]), np.concatenate([grid, grid]).shape)
print(np.concatenate([grid, grid], axis=1), np.concatenate([grid, grid], axis=1).shape)

# vstack and hstack
x = np.array([1, 2, 3])
grid = np.array([[9, 8, 7],
                 [6, 5, 4]])

# vertically stack the arrays
print(np.vstack([x, grid]), np.vstack([x, grid]).shape)

# horizontallystack the arrays
y = np.array([[99],
              [99]])
print(np.hstack([grid, y]), np.hstack([grid, y]).shape)

[1 2 3] [3 2 1] [1 5 6 7]
[1 2 3 3 2 1 1 5 6 7]
[[1 2 3]
 [4 5 6]] (2, 3)
[[1 2 3]
 [4 5 6]
 [1 2 3]
 [4 5 6]] (4, 3)
[[1 2 3 1 2 3]
 [4 5 6 4 5 6]] (2, 6)
[[1 2 3]
 [9 8 7]
 [6 5 4]] (3, 3)
[[ 9  8  7 99]
 [ 6  5  4 99]] (2, 4)


In [22]:
# Aarray ssplittin

x = [1, 2, 3, 99, 99, 3, 2, 1]
x1, x2, x3, x4 = np.split(x, [3, 5, 7])
print(x1, x2, x3, x4)

grid = np.arange(16).reshape((4, 4))
upper, lower = np.vsplit(grid, [2])
print('Vertical split: ', upper, lower)

left, right = np.hsplit(grid, [2])
print('Horizontal split: ', left, right)


[1 2 3] [99 99] [3 2] [1]
Vertical split:  [[0 1 2 3]
 [4 5 6 7]] [[ 8  9 10 11]
 [12 13 14 15]]
Horizontal split:  [[ 0  1]
 [ 4  5]
 [ 8  9]
 [12 13]] [[ 2  3]
 [ 6  7]
 [10 11]
 [14 15]]


### Numpy Ufuncs

In [117]:
### Numpy UFuncs

x = np.arange(6)
print(f'x is {x}')
print("x // 2 =", x // 2)  # floor division
print("x % 2  = ", x % 2) # modulus

print(-(0.8*x + 4) ** 2) # can be in any order
print(f'add 2 is {np.add(2, x)}')
print(f'+ 2 is {2 + x}')

print(f'absolute of x is {np.abs(-x)}')

# trig functions

theta = np.linspace(0, np.pi, 3)

print("theta      = ", theta)
print("sin(theta) = ", np.around(np.sin(theta), decimals=2))
print("cos(theta) = ", np.cos(theta))
print("tan(theta) = ", np.tan(theta))

# Exp and logs
print("---------------------------LOGS------------------------------")
x = np.arange(5)

print(f'log x is {np.log(x)}')
print(f' 3 power x is {np.power(3, x)}')

x = [0, 0.001, 0.01, 0.1]
print("exp(x) - 1 =", np.expm1(x)) # Give more precise output
print("log(1 + x) =", np.log1p(x))

# Scipy special
from scipy import special
print('----------------------------SPECIALS------------------------')
x = [1, 5, 10]
print("gamma(x)     =", special.gamma(x))
print("ln|gamma(x)| =", special.gammaln(x))
print("beta(x, 2)   =", special.beta(x, 2))

x is [0 1 2 3 4 5]
x // 2 = [0 0 1 1 2 2]
x % 2  =  [0 1 0 1 0 1]
[-16.   -23.04 -31.36 -40.96 -51.84 -64.  ]
add 2 is [2 3 4 5 6 7]
+ 2 is [2 3 4 5 6 7]
absolute of x is [0 1 2 3 4 5]
theta      =  [0.         1.57079633 3.14159265]
sin(theta) =  [0. 1. 0.]
cos(theta) =  [ 1.000000e+00  6.123234e-17 -1.000000e+00]
tan(theta) =  [ 0.00000000e+00  1.63312394e+16 -1.22464680e-16]
---------------------------LOGS------------------------------
log x is [      -inf 0.         0.69314718 1.09861229 1.38629436]
 3 power x is [ 1  3  9 27 81]
exp(x) - 1 = [0.         0.0010005  0.01005017 0.10517092]
log(1 + x) = [0.         0.0009995  0.00995033 0.09531018]
----------------------------SPECIALS------------------------
gamma(x)     = [1.0000e+00 2.4000e+01 3.6288e+05]
ln|gamma(x)| = [ 0.          3.17805383 12.80182748]
beta(x, 2)   = [0.5        0.03333333 0.00909091]




### Specify outputs

In [24]:
# Specify outputs
print('----------------------------SPECify OUTPUT------------------------')
x = np.arange(5)
y = np.empty(5)
np.multiply(x, 10, out=y)
print(y)

y = np.zeros(10)
np.power(2, x, out=y[::2])
print(y)

----------------------------SPECify OUTPUT------------------------
[ 0. 10. 20. 30. 40.]
[ 1.  0.  2.  0.  4.  0.  8.  0. 16.  0.]


### Aggregates

Reduce (constantly apply a function until 1 answer) or accumulate (save all cumulative steps)

In [26]:
x = np.arange(1, 6)
print(f'reduce x multiply is {np.multiply.reduce(x)}')

x = np.arange(1, 6)
print(f'accumulate x multiply is {np.multiply.accumulate(x)}')

reduce x multiply is 120
accumulate x multiply is [  1   2   6  24 120]


### Outer (I do not understand this)

In [27]:
x = np.arange(1, 6)
np.multiply.outer(x, x)

array([[ 1,  2,  3,  4,  5],
       [ 2,  4,  6,  8, 10],
       [ 3,  6,  9, 12, 15],
       [ 4,  8, 12, 16, 20],
       [ 5, 10, 15, 20, 25]])

## Aggregations: Min, max, and stuff

In [28]:
L = np.random.random(100)
print(np.sum(L))

53.52104007807241


### Multi dimensional aggregates

In [29]:
M = np.random.random((3, 4))
print(M)

[[0.91213615 0.34415119 0.0925098  0.36471567]
 [0.35877138 0.84124098 0.90499368 0.86209355]
 [0.54925    0.7166621  0.65089007 0.35414668]]


In [32]:
print(f'SUm is {M.sum()}')
print(f'SUm along rows is {M.sum(axis=1)}')
print(f'SUm along columns is {M.sum(axis=0)}')

SUm is 6.95156123639825
SUm along rows is [1.7135128  2.96709959 2.27094885]
SUm along columns is [1.82015753 1.90205426 1.64839355 1.5809559 ]


### find index of lowest value

In [34]:
print(f'Using argmin on {x} gives index {np.argmin(x)}')

Using argmin on [1 2 3 4 5] gives index 0


## Broadcasting
Broadcasting is simply a set of rules for applying binary ufuncs (e.g., addition, subtraction, multiplication, etc.) on arrays of different sizes.

In [37]:
a = np.arange(3)
M = np.ones((3,3))
print(a)
print(M)
print(a+M)

[0 1 2]
[[1. 1. 1.]
 [1. 1. 1.]
 [1. 1. 1.]]
[[1. 2. 3.]
 [1. 2. 3.]
 [1. 2. 3.]]


In [39]:
a = np.arange(3)
b = np.arange(3).reshape(3, 1)

print(a)
print(b)
print(a+b)

[0 1 2]
[[0]
 [1]
 [2]]
[[0 1 2]
 [1 2 3]
 [2 3 4]]


### Centering an Array

In [45]:
x = np.random.random((10, 3))
print(x)

x_mean = x.mean(0)
print(x_mean)
x_centered = x - x_mean
print(x_centered)
print(f'X_centered mean should be nearly 0, and it is {x_centered.mean(axis=0)}')

[[0.13565189 0.04580803 0.60484788]
 [0.10773567 0.03535866 0.32921621]
 [0.73899123 0.80816388 0.60012741]
 [0.66776563 0.34676264 0.1526194 ]
 [0.64621762 0.65807256 0.55944322]
 [0.69638692 0.14528293 0.32463683]
 [0.66800012 0.91775211 0.53821254]
 [0.84053799 0.03495237 0.74993255]
 [0.09077541 0.58247552 0.76964404]
 [0.79261991 0.24150182 0.7117076 ]]
[0.53846824 0.38161305 0.53403877]
[[-0.40281635 -0.33580502  0.07080911]
 [-0.43073257 -0.3462544  -0.20482256]
 [ 0.20052299  0.42655083  0.06608864]
 [ 0.12929739 -0.03485041 -0.38141936]
 [ 0.10774938  0.27645951  0.02540446]
 [ 0.15791868 -0.23633012 -0.20940194]
 [ 0.12953188  0.53613906  0.00417377]
 [ 0.30206975 -0.34666068  0.21589378]
 [-0.44769283  0.20086246  0.23560527]
 [ 0.25415167 -0.14011123  0.17766883]]
X_centered mean should be nearly 0, and it is [-1.11022302e-17  0.00000000e+00  8.88178420e-17]


# Comparisons, Masks and Boolean Logics

In [46]:
rng = np.random.RandomState(0)
x = rng.randint(10, size=(3, 4))
print(x)

[[5 0 3 3]
 [7 9 3 5]
 [2 4 7 6]]


In [50]:
print(x < 6)
print(np.count_nonzero(x<6))
print(np.sum(x < 6, axis=1))

[[ True  True  True  True]
 [False False  True  True]
 [ True  True False False]]
8
[4 2 2]


In [57]:
print(np.any(x>6))
print(np.any(x<0))
print(np.all(x<10))
print(np.all(x<9))

True
False
True
False


In [58]:
print(x[x<5])

[0 3 3 3 2 4]


In [84]:
inches = np.random.exponential(size=365)

# construct a mask of all rainy days
rainy = (inches > 0.5)

# construct a mask of all summer days (June 21st is the 172nd day)
days = np.arange(365)
summer = (days > 172) & (days < 262)

print(f"Median precip on rainy days in 2014 (inches):  {np.median(inches[rainy])} ")
print(f"Median precip on summer days in 2014 (inches): {np.median(inches[summer])} ")
print(f"Maximum precip on summer days in 2014 (inches): {np.max(inches[summer])}")
print(f"Median precip on non-summer rainy days (inches): {np.median(inches[rainy & ~summer])}")

Median precip on rainy days in 2014 (inches):  1.170043150689403 
Median precip on summer days in 2014 (inches): 0.7411600922583893 
Maximum precip on summer days in 2014 (inches): 7.124215700123608
Median precip on non-summer rainy days (inches): 1.1406429665909794


### Booleans
In python, nonzeros are True

In [87]:
print(bool(42 and -1))
print(bool(442 and 0))

True
False


# Fancy Indexing

In [88]:
rand = np.random.RandomState(42)

x = rand.randint(100, size=10)
print(x)

[51 92 14 71 60 20 82 86 74 74]


In [98]:
# indexing
ind = [5, 8, 9]
print(x[ind])

# indexing, with reshaping
ind = np.array([[2, 5], [4, 8]])
print(x[ind])

[20 74 74]
[[14 20]
 [60 74]]


In [101]:
# indexing in multiple dimensions

x = np.arange(12).reshape((3, 4))
print(x)

row = np.array([0, 1, 2])
col = np.array([2, 1, 3])
print(x[row, col]) # so here it chose indexes ([0, 2], [1, 1], [2, 3])

[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]
[ 2  5 11]


In [103]:
# Combinining indexing

print(x)
print(x[2, [2, 0, 1]])
print(x[1:, [2, 0, 1]])

[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]
[10  8  9]
[[ 6  4  5]
 [10  8  9]]


In [114]:
# select subsets

mean = [0, 0]
cov = [[1, 2],
       [2, 5]]
X = rand.multivariate_normal(mean, cov, 100)
X.shape

(100, 2)

In [117]:
X[:, 0] #select oth column
X[:, 1] #select 1st column

array([ 2.77836708, -1.88304335, -1.47921549, -2.14076287, -0.42056828,
        1.63413723,  1.50620513,  1.93231826, -3.47121167, -1.16849624,
       -2.55201172, -2.89824929, -0.34078472, -0.75009124, -0.24820629,
        3.6829472 , -6.05035194,  4.48644143, -1.70399713, -0.91645192,
       -4.22266549,  1.93169889,  0.81999939, -1.10036226,  3.94798283,
        4.27998609, -3.41312212,  2.43588887,  0.37236333,  2.08286296,
       -0.13615714,  3.3432017 ,  0.22277882, -2.66405045,  3.83485127,
       -4.29187991, -0.07922742, -0.08765613,  0.29266425,  2.39199866,
        3.96696076,  0.24469594, -0.39107849,  3.1985439 , -1.50301145,
       -0.59593197, -3.46837274,  0.68342421,  2.23341054,  2.67648279,
       -0.05801722, -2.43363766,  0.20767239,  0.82787555, -0.50840429,
       -4.19450858,  0.79681194, -0.0227535 ,  3.43854673, -0.31261669,
        0.85129451,  1.62273537,  0.0920822 ,  0.09195095, -0.40358473,
        3.09622109, -1.54197317, -0.91991757, -6.10051646, -2.71

In [118]:
# modify values with fancy indexing

x = np.arange(10)
i = np.array([2, 1, 8, 4])
x[i] = -2
print(x)

[ 0 -2 -2  3 -2  5  6  7 -2  9]


## Sorting

In [125]:
x = np.random.uniform(0, 50, 10)
print(x)

print(np.sort(x))
print(np.argsort(x))
print(x[np.argsort(x)])

[47.97771098 32.66109375 20.2782741  49.52200925  9.87026792  5.38041294
 35.68671948 23.39320809 35.71284939 27.64965751]
[ 5.38041294  9.87026792 20.2782741  23.39320809 27.64965751 32.66109375
 35.68671948 35.71284939 47.97771098 49.52200925]
[5 4 2 7 9 1 6 8 0 3]
[ 5.38041294  9.87026792 20.2782741  23.39320809 27.64965751 32.66109375
 35.68671948 35.71284939 47.97771098 49.52200925]


In [137]:
# Sort along rows and columns

rand = np.random.RandomState(42)
X = rand.randint(0, 10, (4, 6))
print(X)

print(np.sort(X, axis=0)) # sort along columns
print(np.sort(X, axis=1)) # sort along rows

[[6 3 7 4 6 9]
 [2 6 7 4 3 7]
 [7 2 5 4 1 7]
 [5 1 4 0 9 5]]
[[2 1 4 0 1 5]
 [5 2 5 4 3 7]
 [6 3 7 4 6 7]
 [7 6 7 4 9 9]]
[[3 4 6 6 7 9]
 [2 3 4 6 7 7]
 [1 2 4 5 7 7]
 [0 1 4 5 5 9]]


In [138]:
# Partition sort

x = np.array([7, 2, 3, 1, 6, 5, 4])
print(x)
print(np.partition(x, 3))

print(np.partition(X, 3, axis=1))

[7 2 3 1 6 5 4]
[2 1 3 4 6 5 7]
[[3 4 6 6 7 9]
 [2 3 4 6 7 7]
 [2 1 4 5 7 7]
 [0 4 1 5 9 5]]


In [142]:
# Example K-Nearest neighbours

X = rand.rand(10, 2)
dist_sq = np.sum((X[:, np.newaxis, :] - X[np.newaxis, :, :]) ** 2, axis=-1)
nearest = np.argsort(dist_sq, axis=1)
print(nearest)

K = 2
nearest_partition = np.argpartition(dist_sq, K + 1, axis=1)
print(nearest_partition)

[[0 2 1 8 9 3 6 4 7 5]
 [1 9 3 2 0 6 8 4 7 5]
 [2 6 9 1 8 0 4 3 7 5]
 [3 9 1 2 0 6 4 8 7 5]
 [4 6 7 5 2 8 9 0 1 3]
 [5 7 4 6 2 8 9 3 1 0]
 [6 4 2 8 7 9 5 1 0 3]
 [7 5 4 6 2 8 9 3 1 0]
 [8 2 6 0 4 9 1 5 7 3]
 [9 1 3 2 6 0 4 8 7 5]]
[[2 0 1 8 9 5 6 7 3 4]
 [1 9 3 2 0 5 6 7 8 4]
 [2 6 9 1 8 0 5 7 3 4]
 [3 9 1 2 0 5 6 7 8 4]
 [7 6 4 5 8 2 9 1 3 0]
 [7 5 4 6 8 2 9 1 3 0]
 [4 6 2 8 7 9 5 1 3 0]
 [7 5 4 6 8 2 9 1 3 0]
 [2 8 6 0 4 5 1 7 3 9]
 [3 1 9 2 6 0 5 7 8 4]]


### Structured Arrays

In [144]:
name = ['Stefan', 'Stephen', 'Steven', 'Steve']
age = [29, 50, 45, 17]
weight = [55.0, 85.5, 68.0, 61.5]

data = np.zeros(4, dtype={'names':('name', 'age', 'weight'),
                          'formats':('U10', 'i4', 'f8')})
print(data.dtype)

data['name'] = name
data['age'] = age
data['weight'] = weight
print(data)

[('name', '<U10'), ('age', '<i4'), ('weight', '<f8')]
[('Stefan', 29, 55. ) ('Stephen', 50, 85.5) ('Steven', 45, 68. )
 ('Steve', 17, 61.5)]


In [150]:
# different ways of initializing data

np.dtype({'names':('name', 'age', 'weight'),
          'formats':((np.str_, 10), int, np.float32)})

np.dtype([('name', 'S10'), ('age', int), ('weight', np.float32)])


dtype([('name', 'S10'), ('age', '<i8'), ('weight', '<f4')])

In [151]:
# View data as record

data_rec = data.view(np.recarray)
print(data_rec.age)
print(data_rec.name)

[29 50 45 17]
['Stefan' 'Stephen' 'Steven' 'Steve']
