In [1]:
import multiprocessing

# Number of CPUs in the system. Per the multiprocessing docs this can be larger
# than the number of CPUs the current process is actually allowed to use.
multiprocessing.cpu_count()

8

In [2]:
from multiprocessing import Pool

# Create pools with the default size and two explicit sizes, show them, and
# then shut them down. A Pool that is never closed/terminated keeps its worker
# processes alive for the lifetime of the kernel.
pools = (Pool(), Pool(5), Pool(10))
try:
  print(pools)
finally:
  for pool in pools:
    pool.terminate()
    pool.join()

(<multiprocessing.pool.Pool state=RUN pool_size=8>,
 <multiprocessing.pool.Pool state=RUN pool_size=5>,
 <multiprocessing.pool.Pool state=RUN pool_size=10>)

In [3]:
from multiprocessing import Pool, current_process

def f(x):
  """Print the process object handling this call, then return ``x`` times itself."""
  worker = current_process()
  print(worker)
  squared = x * x
  return squared

# Map f over a small argument list on a default-sized pool and print the results.
# NOTE(review): pool workers must be able to import `f`; the multiprocessing
# docs warn that Pool examples may not work with functions defined in an
# interactive session -- confirm on the target platform.
arguments = [1, 2, 3, 4, 5, 6]
with Pool() as p:
  results = p.map(f, arguments)
  print(results)

In [3]:
import numpy as np

# Build a 1000 x 10 array of ones and display only its first row.
a = np.ones((1000, 10))
row = a[0]
print(row)

[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [6]:
from numba import njit, prange
import numpy as np

@njit(parallel=True)
def add_one_parallel(A):
  """Add one to ``A`` in place, handling one first-axis slice per ``prange`` iteration."""
  n_rows = A.shape[0]
  for i in prange(n_rows):
    # current is A[i]; the in-place add below is what updates A
    current = A[i]
    current += 1

# Benchmark input: a 50 x 1,000,000 array of zeros.
A = np.zeros((50, 1000000))
# Warm-up call: the function is compiled the first time it is called, so run it
# once here to keep compilation out of the timings below.
add_one_parallel(A)

# NOTE(review): `A + 1` produces a new array on every run, whereas
# add_one_parallel updates A in place, so the two timings do not measure
# identical work.
%timeit A + 1
%timeit add_one_parallel(A)

196 ms ± 18.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
74.4 ms ± 1.88 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [5]:
from numba import njit, prange
import numpy as np

@njit(parallel=True)
def add_one_parallel(A):
  """Add one to every element of ``A`` in place using two nested ``prange`` loops.

  Per the Numba documentation only the outermost ``prange`` runs in parallel;
  a ``prange`` nested inside another one is executed like a plain ``range``.
  """
  n_rows = A.shape[0]
  n_cols = A.shape[1]
  for i in prange(n_rows):
    for j in prange(n_cols):
      A[i, j] += 1

# Benchmark input: a 50 x 1,000,000 array of zeros.
A = np.zeros((50, 1000000))
# Warm-up call: the function is compiled the first time it is called, so run it
# once here to keep compilation out of the timings below.
add_one_parallel(A)

# NOTE(review): `A + 1` produces a new array on every run, whereas
# add_one_parallel updates A in place, so the two timings do not measure
# identical work.
%timeit A + 1
%timeit add_one_parallel(A)

282 ms ± 69.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
83.6 ms ± 13.1 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [15]:
import numpy as np
from numba import njit, prange

@njit(parallel=True)
def mat_mult(A, B):
    """Return the matrix product of ``A`` and ``B``.

    The result is accumulated in an array created by ``np.zeros``; its rows
    are distributed over ``prange`` iterations. The number of columns of
    ``A`` must equal the number of rows of ``B`` (checked with ``assert``).
    """
    assert A.shape[1] == B.shape[0]
    n_rows = A.shape[0]
    n_inner = A.shape[1]
    n_cols = B.shape[1]
    res = np.zeros((n_rows, n_cols))
    for i in prange(n_rows):
        for k in range(n_inner):
            for j in range(n_cols):
                res[i, j] += A[i, k] * B[k, j]
    return res

m, n, c = 1000, 1500, 1200
A = np.random.randint(1, 1000, size = (m, n))
B = np.random.randint(1, 1000, size = (n, c))
A2 = np.random.randn(m, n)
B2 = np.random.randn(n, c)
# A = np.ones((m, n))
# B = np.ones((n, c))
mat_mult(A, B)
%timeit np.dot(A, B)
%timeit mat_mult(A, B)
%timeit np.dot(A2, B2)
%timeit mat_mult(A2, B2)

4.57 s ± 29.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
827 ms ± 35.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
54.9 ms ± 2.44 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
896 ms ± 131 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [3]:
import numpy as np
from numba import njit, prange

def matrix_multiplication(A, B):
  """Multiply two 2-D arrays with explicit Python loops (unoptimized baseline).

  Args:
    A: array of shape (m, n).
    B: array of shape (n, p).

  Returns:
    Array of shape (m, p) created with ``np.zeros`` and filled with the product.

  Raises:
    ValueError: if the number of columns of A differs from the number of rows of B.
  """
  m = A.shape[0]
  n = A.shape[1]
  p = B.shape[1]
  if B.shape[0] != n:
    # Without this check a B with more than n rows would be silently truncated
    # to its first n rows and a wrong product returned.
    raise ValueError(
      "shapes {} and {} not aligned: {} (dim 1) != {} (dim 0)".format(
        A.shape, B.shape, n, B.shape[0]))
  C = np.zeros((m, p))
  for i in range(m):
    for j in range(n):
      for k in range(p):
        C[i, k] += A[i, j] * B[j, k]
  return C

@njit(parallel=True)
def matrix_multiplication2(A, B):
  """Matrix product of ``A`` and ``B`` with ``prange`` written on all three loops.

  Per the Numba documentation only the outermost ``prange`` runs in parallel;
  the nested ``prange`` loops are executed like plain ``range`` loops.
  """
  rows, inner = A.shape
  cols = B.shape[1]
  out = np.zeros((rows, cols))
  for i in prange(rows):
    for j in prange(inner):
      for k in prange(cols):
        out[i, k] += A[i, j] * B[j, k]
  return out

# Problem size: A is (m x n), B is (n x p), both filled with ones.
m = 100000
n = 20
p = 10
A = np.ones((m, n))
B = np.ones((n, p))

# Warm-up call so that JIT compilation is not part of the timing below.
matrix_multiplication2(A, B)

# Pure-Python loops vs. the Numba version vs. NumPy's np.dot.
%timeit matrix_multiplication(A, B)
%timeit matrix_multiplication2(A, B)
%timeit np.dot(A, B)

14.4 s ± 1.14 s per loop (mean ± std. dev. of 7 runs, 1 loop each)
17.2 ms ± 2.22 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
18.4 ms ± 2.52 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [5]:
import numpy as np
from numba import njit, prange

@njit(parallel=True)
def matrix_multiplication2(A, B):
  """Matrix product of ``A`` and ``B`` with ``prange`` written on all three loops.

  Per the Numba documentation only the outermost ``prange`` runs in parallel;
  the nested ``prange`` loops are executed like plain ``range`` loops.
  """
  n_rows = A.shape[0]
  n_inner = A.shape[1]
  n_cols = B.shape[1]
  product = np.zeros((n_rows, n_cols))
  for i in prange(n_rows):
    for j in prange(n_inner):
      for k in prange(n_cols):
        product[i, k] += A[i, j] * B[j, k]
  return product

@njit(parallel=True)
def matrix_multiplication3(A, B):
  """Matrix product of ``A`` and ``B``: ``prange`` over result rows, ``range`` inside."""
  n_rows = A.shape[0]
  n_inner = A.shape[1]
  n_cols = B.shape[1]
  product = np.zeros((n_rows, n_cols))
  for i in prange(n_rows):
    # each prange iteration writes only to row i of the result
    for j in range(n_inner):
      for k in range(n_cols):
        product[i, k] += A[i, j] * B[j, k]
  return product

@njit(parallel=True)
def matrix_multiplication4(A, B):
  """Matrix product of ``A`` and ``B`` with ``prange`` on the two outer loops.

  Per the Numba documentation only the outermost ``prange`` runs in parallel;
  the nested ``prange`` over ``j`` is executed like a plain ``range`` loop.
  """
  n_rows = A.shape[0]
  n_inner = A.shape[1]
  n_cols = B.shape[1]
  product = np.zeros((n_rows, n_cols))
  for i in prange(n_rows):
    for j in prange(n_inner):
      for k in range(n_cols):
        product[i, k] += A[i, j] * B[j, k]
  return product

# Problem size: A is (m x n), B is (n x p), both drawn from np.random.randn.
# NOTE(review): no random seed is set, so the inputs differ between runs.
m = 1000000
n = 50
p = 20
A = np.random.randn(m, n)
B = np.random.randn(n, p)

# Warm-up calls so that JIT compilation of each variant is not part of the
# timings below.
matrix_multiplication2(A, B)
matrix_multiplication3(A, B)
matrix_multiplication4(A, B)

# Compare the three prange placements against NumPy's np.dot.
%timeit matrix_multiplication2(A, B)
%timeit matrix_multiplication3(A, B)
%timeit matrix_multiplication4(A, B)
%timeit np.dot(A, B)

453 ms ± 65.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
475 ms ± 86.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
518 ms ± 70.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
676 ms ± 111 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [23]:
@njit(parallel=True, fastmath=True)
def matrix_multiplication2f(A, B):
  """``fastmath`` variant of the matrix product with ``prange`` on all three loops.

  Per the Numba documentation only the outermost ``prange`` runs in parallel;
  the nested ``prange`` loops are executed like plain ``range`` loops.
  """
  n_rows = A.shape[0]
  n_inner = A.shape[1]
  n_cols = B.shape[1]
  product = np.zeros((n_rows, n_cols))
  for i in prange(n_rows):
    for j in prange(n_inner):
      for k in prange(n_cols):
        product[i, k] += A[i, j] * B[j, k]
  return product

@njit(parallel=True, fastmath=True)
def matrix_multiplication3f(A, B):
  """``fastmath`` variant of the matrix product: ``prange`` over rows, ``range`` inside."""
  n_rows = A.shape[0]
  n_inner = A.shape[1]
  n_cols = B.shape[1]
  product = np.zeros((n_rows, n_cols))
  for i in prange(n_rows):
    # each prange iteration writes only to row i of the result
    for j in range(n_inner):
      for k in range(n_cols):
        product[i, k] += A[i, j] * B[j, k]
  return product

m = 1500
n = 1500
p = 1500
A = np.random.randn(m, n)
B = np.random.randn(n, p)
A2 = np.random.randint(1, 100, size=(m, n))
B2 = np.random.randint(1, 100, size=(n, p))
A3 = np.ones((m, n))
B3 = np.ones((n, p))


# compile function
matrix_multiplication2(A, B)
matrix_multiplication2f(A, B)
matrix_multiplication3(A, B)
matrix_multiplication3f(A, B)

print('parallel')
%timeit matrix_multiplication2(A, B)
%timeit matrix_multiplication2(A2, B2)
%timeit matrix_multiplication2(A3, B3)
print('fastmath')
%timeit matrix_multiplication2f(A, B)
%timeit matrix_multiplication2f(A2, B2)
%timeit matrix_multiplication2f(A3, B3)
print('parallel')
%timeit matrix_multiplication3(A, B)
%timeit matrix_multiplication3(A2, B2)
%timeit matrix_multiplication3(A3, B3)
print('fastmath')
%timeit matrix_multiplication3f(A, B)
%timeit matrix_multiplication3f(A2, B2)
%timeit matrix_multiplication3f(A3, B3)
print('numpy')
%timeit A @ B
%timeit A2 @ B2
%timeit A3 @ B3

parallel
1.7 s ± 103 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
1.84 s ± 61.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
1.75 s ± 125 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
fastmath
1.53 s ± 42.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
1.78 s ± 27 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
1.51 s ± 51.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
parallel
1.71 s ± 49.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
1.77 s ± 28.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
1.64 s ± 53.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
fastmath
1.55 s ± 35.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
1.8 s ± 39 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
1.6 s ± 112 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
numpy
117 ms ± 2.96 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
10.5 s ± 75.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
102 ms ± 5.65 