In [1]:
from numba import njit, prange
import numpy as np

@njit(parallel=True)
def add_one_parallel(A):
  """Add one to every element of ``A`` in place, one row per ``prange`` step.

  The loop over the first axis is a ``prange`` loop, so rows may be handled
  by different threads. Nothing is returned; ``A`` itself is modified.
  """
  n_rows = A.shape[0]
  for row_idx in prange(n_rows):
    # A[row_idx] is a view, so the in-place add writes straight into A
    current_row = A[row_idx]
    current_row += 1

# benchmark input: 50 rows x 1,000,000 columns of zeros
A = np.zeros((50, 1000000))
# compile function first time it's called
# (this warm-up call also increments A in place, like every timed call below)
add_one_parallel(A)

# Numba version: updates A in place
%timeit add_one_parallel(A)
# NumPy baseline: `A + 1` builds a new result array on every run, so it is
# not doing exactly the same work as the in-place Numba function above
%timeit A + 1

75.1 ms ± 6.06 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
257 ms ± 65.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [5]:
from numba import njit, prange
import numpy as np

@njit
def add_one(A):
  """Add one to every element of the 2-D array ``A`` in place (serial loops).

  Nothing is returned; ``A`` itself is modified.
  """
  n_rows = A.shape[0]
  n_cols = A.shape[1]
  for r in range(n_rows):
    for c in range(n_cols):
      A[r, c] += 1

@njit(parallel=True)
def add_one_parallel(A):
  """Add one to every element of the 2-D array ``A`` in place.

  Both the row loop and the column loop are written with ``prange``.
  NOTE(review): per the Numba documentation only the outermost ``prange``
  runs in parallel and nested ``prange`` loops are executed like ``range``,
  so this is expected to behave like ``add_one_parallel2``.
  """
  n_rows = A.shape[0]
  n_cols = A.shape[1]
  for r in prange(n_rows):
    for c in prange(n_cols):
      A[r, c] += 1

@njit(parallel=True)
def add_one_parallel2(A):
  """Add one to every element of the 2-D array ``A`` in place.

  Only the row loop uses ``prange``; the columns of each row are walked
  with an ordinary ``range`` loop. Nothing is returned.
  """
  n_rows = A.shape[0]
  n_cols = A.shape[1]
  for r in prange(n_rows):
    for c in range(n_cols):
      A[r, c] += 1

# benchmark input: 50 rows x 1,000,000 columns of zeros
A = np.zeros((50, 1000000))
# compile function first time it's called
# (tiny 1x1 arrays are enough to trigger compilation before the timed runs)
add_one(np.zeros((1, 1)))
add_one_parallel(np.zeros((1, 1)))
add_one_parallel2(np.zeros((1, 1)))

# the three Numba variants all update A in place
%timeit add_one(A)
%timeit add_one_parallel(A)
%timeit add_one_parallel2(A)
# NumPy baseline: `A + 1` builds a new result array on every run instead of
# updating A in place
%timeit A + 1

76.4 ms ± 11.2 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
67.8 ms ± 2.32 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
68.2 ms ± 1.54 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
211 ms ± 30.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [21]:
import numpy as np
from numba import njit, prange

def matrix_multiplication(A, B):
  """Multiply two 2-D arrays with an explicit triple loop (pure Python).

  Args:
    A: 2-D array of shape (m, n).
    B: 2-D array of shape (n, p).

  Returns:
    A new array of shape (m, p), created with ``np.zeros``, holding the
    matrix product of ``A`` and ``B``.

  Raises:
    ValueError: if the number of columns of ``A`` differs from the number
      of rows of ``B``.
  """
  m, n = A.shape
  n_b, p = B.shape
  # Without this check a B with extra rows would silently yield a wrong
  # product, and a B with too few rows would fail with an IndexError.
  if n != n_b:
    raise ValueError(
      f"shapes {A.shape} and {B.shape} not aligned: "
      f"{n} (dim 1 of A) != {n_b} (dim 0 of B)"
    )
  C = np.zeros((m, p))
  for i in range(m):
    for j in range(n):
      # accumulate the contribution of A[i, j] to the whole output row i
      for k in range(p):
        C[i, k] += A[i, j] * B[j, k]
  return C

@njit
def matrix_multiplication_optimized(A, B):
  """Multiply two 2-D arrays with an explicit triple loop, compiled by Numba.

  Args:
    A: 2-D array of shape (m, n).
    B: 2-D array of shape (n, p).

  Returns:
    A new array of shape (m, p), created with ``np.zeros``, holding the
    matrix product of ``A`` and ``B``.

  Raises:
    ValueError: if the number of columns of ``A`` differs from the number
      of rows of ``B``.
  """
  m, n = A.shape
  n_b, p = B.shape
  # Numba does not bounds-check array indexing by default, so a shape
  # mismatch must be rejected explicitly instead of reading out of bounds.
  if n != n_b:
    raise ValueError("shapes not aligned: A.shape[1] != B.shape[0]")
  C = np.zeros((m, p))
  for i in range(m):
    for j in range(n):
      # accumulate the contribution of A[i, j] to the whole output row i
      for k in range(p):
        C[i, k] += A[i, j] * B[j, k]
  return C

@njit(parallel=True)
def matrix_multiplication_parallel(A, B):
  """Multiply two 2-D arrays with a triple loop, parallel over the rows of A.

  Args:
    A: 2-D array of shape (m, n).
    B: 2-D array of shape (n, p).

  Returns:
    A new array of shape (m, p), created with ``np.zeros``, holding the
    matrix product of ``A`` and ``B``.

  Raises:
    ValueError: if the number of columns of ``A`` differs from the number
      of rows of ``B``.
  """
  m, n = A.shape
  n_b, p = B.shape
  # Numba does not bounds-check array indexing by default, so a shape
  # mismatch must be rejected explicitly instead of reading out of bounds.
  if n != n_b:
    raise ValueError("shapes not aligned: A.shape[1] != B.shape[0]")
  C = np.zeros((m, p))
  # Only the row loop is a prange: each iteration writes to its own row of C.
  # The inner loops are plain range loops; Numba runs nested prange loops
  # serially anyway, and j is a reduction axis (every j adds into the same
  # C[i, k]), so it must not be split across threads.
  for i in prange(m):
    for j in range(n):
      for k in range(p):
        C[i, k] += A[i, j] * B[j, k]
  return C

@njit(parallel=True, fastmath=True)
def matrix_multiplication_parallelf(A, B):
  """Multiply two 2-D arrays, computing one output row per ``prange`` step.

  Each row of the result is the ``np.dot`` product of the matching row of
  ``A`` with the whole of ``B``.

  Returns:
    A new array with one row per row of ``A`` and one column per column
    of ``B``.
  """
  n_rows, _ = A.shape
  _, n_out = B.shape
  product = np.zeros((n_rows, n_out))
  for row in prange(n_rows):
    # row-wise form of: product[row, k] += A[row, j] * B[j, k] for all j, k
    product[row] = np.dot(A[row], B)
  return product

m = 10_000_000
n = 20
p = 10
A = np.ones((m, n))
B = np.ones((n, p))

# compile function
matrix_multiplication_optimized(A, B)
matrix_multiplication_parallel(A, B)
matrix_multiplication_parallelf(A, B)

%timeit matrix_multiplication_optimized(A, B)
%timeit matrix_multiplication_parallel(A, B)
%timeit matrix_multiplication_parallelf(A, B)
%timeit A @ B

3.23 s ± 63.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
1.3 s ± 22.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
2.2 s ± 31.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
1.66 s ± 37.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [23]:
# Tuples do not support "/", so this expression raises TypeError. Catch and
# print it so the cell still shows the error without aborting a full re-run.
try:
  (1, 2)/(3, 3)
except TypeError as err:
  print(f"TypeError: {err}")

TypeError: unsupported operand type(s) for /: 'tuple' and 'tuple'