In [3]:
import numpy as np
from numba import njit, prange

@njit()
def matrix_multiplication(A, B):
  """Serial Numba-compiled matrix product of ``A`` and ``B`` via three nested loops.

  Args:
    A: 2-D array of shape (m, n).
    B: 2-D array; its second dimension gives the number of output columns p.
      The shared dimension n is taken from ``A``.

  Returns:
    A new float64 array of shape (m, p).
  """
  m, n = A.shape
  _, p = B.shape
  C = np.zeros((m, p))
  # Plain ``range`` on purpose: this function is compiled without
  # ``parallel=True``, where ``prange`` is only an alias for ``range`` and
  # would wrongly suggest that the loops run on several threads.
  for i in range(m):
    for j in range(n):
      for k in range(p):
        C[i, k] += A[i, j] * B[j, k]
  return C

@njit(parallel=True, fastmath=True)
def matrix_multiplication_parallel(A, B):
  """Multi-threaded Numba matrix product of ``A`` and ``B``.

  Args:
    A: 2-D array of shape (m, n).
    B: 2-D array; its second dimension gives the number of output columns p.
      The shared dimension n is taken from ``A``.

  Returns:
    A new float64 array of shape (m, p).
  """
  m, n = A.shape
  _, p = B.shape
  C = np.zeros((m, p))
  # Parallelise over rows only: every iteration of ``i`` writes exclusively to
  # its own row C[i, :], so the iterations are independent of each other.
  # Numba does not nest parallel regions (inner prange loops are serialised),
  # so the inner loops are written as the plain ``range`` they behave like.
  for i in prange(m):
    for j in range(n):
      for k in range(p):
        C[i, k] += A[i, j] * B[j, k]
  return C

# Problem size: left operands are (m, n), right operands are (n, p).
m, n, p = 1500, 1500, 1500

# Standard-normal float operands.
A, B = np.random.randn(m, n), np.random.randn(n, p)
# Integer operands drawn uniformly from [1, 100).
A2 = np.random.randint(low=1, high=100, size=(m, n))
B2 = np.random.randint(low=1, high=100, size=(n, p))
# All-ones float operands.
A3, B3 = np.ones(shape=(m, n)), np.ones(shape=(n, p))


# compile function
matrix_multiplication(A, B)
matrix_multiplication_parallel(A, B)

print('normal')
%timeit matrix_multiplication_parallel(A, B)
%timeit matrix_multiplication_parallel(A2, B2)
%timeit matrix_multiplication_parallel(A3, B3)
print('parallel')
%timeit matrix_multiplication_parallel(A, B)
%timeit matrix_multiplication_parallel(A2, B2)
%timeit matrix_multiplication_parallel(A3, B3)
print('numpy')
%timeit A @ B
%timeit A2 @ B2
%timeit A3 @ B3

normal
1.53 s ± 163 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
1.58 s ± 40.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
1.39 s ± 101 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
parallel
1.36 s ± 59.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
1.62 s ± 40.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
1.44 s ± 65.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
numpy
102 ms ± 4.52 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
12.2 s ± 508 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
111 ms ± 21.5 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [1]:
import numpy as np
from numba import njit, prange

@njit()
def matrix_multiplication(A, B):
  """Serial jitted triple-loop product of two 2-D arrays.

  Rows of ``A`` are the outermost loop, the shared dimension is the middle
  loop and columns of ``B`` are the innermost loop. Products are accumulated
  into a zero-initialised float64 result, which is returned.
  """
  n_rows, n_shared = A.shape
  _, n_cols = B.shape
  result = np.zeros((n_rows, n_cols))
  for row in range(n_rows):
    for mid in range(n_shared):
      for col in range(n_cols):
        result[row, col] = result[row, col] + A[row, mid] * B[mid, col]
  return result

@njit(parallel=True, fastmath=True)
def matrix_multiplication_parallel(A, B):
  """Multi-threaded Numba matrix product of ``A`` and ``B``.

  Args:
    A: 2-D array of shape (m, n).
    B: 2-D array; its second dimension gives the number of output columns p.
      The shared dimension n is taken from ``A``.

  Returns:
    A new float64 array of shape (m, p).
  """
  m, n = A.shape
  _, p = B.shape
  C = np.zeros((m, p))
  # Only the row loop is a ``prange``: each value of ``i`` owns row C[i, :],
  # so rows can be computed independently. Inner ``prange`` loops would be
  # serialised by Numba anyway (parallel regions are not nested), so they are
  # written as ``range`` to show what actually runs.
  for i in prange(m):
    for j in range(n):
      for k in range(p):
        C[i, k] += A[i, j] * B[j, k]
  return C

# Problem size: a tall left operand (m, n) times a square right operand (n, p).
m, n, p = 10000, 1000, 1000

# Standard-normal float operands.
A, B = np.random.randn(m, n), np.random.randn(n, p)
# Integer operands drawn uniformly from [1, 100).
A2 = np.random.randint(low=1, high=100, size=(m, n))
B2 = np.random.randint(low=1, high=100, size=(n, p))
# All-ones float operands.
A3, B3 = np.ones(shape=(m, n)), np.ones(shape=(n, p))

# compile function
matrix_multiplication(A, B)
matrix_multiplication_parallel(A, B)

print('normal')
%timeit matrix_multiplication(A, B)
%timeit matrix_multiplication(A2, B2)
%timeit matrix_multiplication(A3, B3)
print('parallel')
%timeit matrix_multiplication_parallel(A, B)
%timeit matrix_multiplication_parallel(A2, B2)
%timeit matrix_multiplication_parallel(A3, B3)
print('numpy')
%timeit A @ B
%timeit A2 @ B2
%timeit A3 @ B3

normal
14.4 s ± 146 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
14.3 s ± 129 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
14.7 s ± 538 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
parallel
3.34 s ± 104 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
4.42 s ± 58.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
3.46 s ± 78.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
numpy
334 ms ± 16.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
19.4 s ± 655 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
248 ms ± 10.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [1]:
import numpy as np
from numba import njit

def matrix_multiplication(A, B):
  """Pure-Python reference matrix product of two 2-D arrays.

  Walks rows of ``A``, then the shared dimension, then columns of ``B``, and
  accumulates each partial product into a zero-initialised float64 result of
  shape (rows of A, columns of B).
  """
  rows, shared = A.shape
  _, cols = B.shape
  product = np.zeros((rows, cols))
  for r in range(rows):
    for s in range(shared):
      # A[r, s] is constant for the whole innermost loop; read it once.
      a_rs = A[r, s]
      for c in range(cols):
        product[r, c] += a_rs * B[s, c]
  return product

@njit()
def matrix_multiplication_optimized(A, B):
  """Numba-jitted triple-loop matrix product of two 2-D arrays.

  Loop nest is rows of ``A`` / shared dimension / columns of ``B``; the result
  is a new float64 array with one row per row of ``A`` and one column per
  column of ``B``.
  """
  rows, shared = A.shape
  _, cols = B.shape
  out = np.zeros((rows, cols))
  for r in range(rows):
    for s in range(shared):
      # A[r, s] does not change inside the innermost loop.
      a_rs = A[r, s]
      for c in range(cols):
        out[r, c] += a_rs * B[s, c]
  return out

# Square 1000 x 1000 problem: A is (m, n), B is (n, p).
m, n, p = 1000, 1000, 1000
# Standard-normal float operands.
A, B = np.random.randn(m, n), np.random.randn(n, p)

# Call the jitted function once before timing so that Numba's compilation for
# these argument types is not included in the %timeit measurement below.
matrix_multiplication_optimized(A, B)

# Compare the hand-written jitted triple loop with NumPy's built-in product.
%timeit matrix_multiplication_optimized(A, B)
%timeit A @ B

1.53 s ± 123 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
31.4 ms ± 2.44 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [3]:
import numpy as np
from numba import njit

def matrix_multiplication(A, B):
  """Pure-Python reference matrix product of two 2-D arrays.

  Returns a new float64 array whose entry (i, k) is the sum over j of
  ``A[i, j] * B[j, k]``, accumulated with j increasing.
  """
  n_rows, n_shared = A.shape
  _, n_cols = B.shape
  total = np.zeros((n_rows, n_cols))
  # np.ndindex enumerates (i, j, k) with the last index varying fastest,
  # i.e. in the same order as three nested loops over i, j and k.
  for i, j, k in np.ndindex(n_rows, n_shared, n_cols):
    total[i, k] += A[i, j] * B[j, k]
  return total

@njit()
def matrix_multiplication_optimized(A, B):
  """Jitted matrix product with loop order i (rows) / j (shared) / k (columns).

  Accumulates into a zero-initialised float64 array of shape
  (rows of A, columns of B) and returns it.
  """
  n_rows, n_shared = A.shape
  _, n_cols = B.shape
  acc = np.zeros((n_rows, n_cols))
  for row in range(n_rows):
    for mid in range(n_shared):
      for col in range(n_cols):
        acc[row, col] = acc[row, col] + A[row, mid] * B[mid, col]
  return acc

@njit()
def matrix_multiplication_optimized2(A, B):
  """Jitted matrix product with loop order j (shared) / k (columns) / i (rows).

  Same result as the i/j/k variant; only the nesting of the loops differs, so
  the innermost loop here steps down a column of ``A`` and of the result.
  """
  n_rows, n_shared = A.shape
  _, n_cols = B.shape
  out = np.zeros((n_rows, n_cols))
  for mid in range(n_shared):
    for col in range(n_cols):
      # B[mid, col] is fixed while the innermost loop runs over the rows.
      b_jk = B[mid, col]
      for row in range(n_rows):
        out[row, col] += A[row, mid] * b_jk
  return out

@njit()
def matrix_multiplication_optimized3(A, B):
  """Jitted matrix product with loop order k (columns) / i (rows) / j (shared).

  Each output entry is completed by the innermost loop before moving on to the
  next one; the result is a new float64 array of shape (rows of A, columns of B).
  """
  n_rows, n_shared = A.shape
  _, n_cols = B.shape
  out = np.zeros((n_rows, n_cols))
  for col in range(n_cols):
    for row in range(n_rows):
      for mid in range(n_shared):
        out[row, col] = out[row, col] + A[row, mid] * B[mid, col]
  return out

# Square 1000 x 1000 problem: A is (m, n), B is (n, p).
m, n, p = 1000, 1000, 1000
# Standard-normal float operands.
A, B = np.random.randn(m, n), np.random.randn(n, p)

# Call each jitted variant once before timing so that Numba's compilation for
# these argument types is not included in the %timeit measurements below.
matrix_multiplication_optimized(A, B)
matrix_multiplication_optimized2(A, B)
matrix_multiplication_optimized3(A, B)


# Time the three loop orders (i-j-k, j-k-i, k-i-j) against NumPy's built-in product.
%timeit matrix_multiplication_optimized(A, B)
%timeit matrix_multiplication_optimized2(A, B)
%timeit matrix_multiplication_optimized3(A, B)
%timeit A @ B

1.45 s ± 30.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
12.6 s ± 92 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
2.93 s ± 35.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
30 ms ± 1.97 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [4]:
import inspect
# Ask `inspect` which module matrix_multiplication is attributed to; the bare
# expression is left as the last line so the notebook displays the result.
inspect.getmodule(matrix_multiplication)

<module '__main__'>

In [3]:
# Print NumPy's build configuration (the BLAS/LAPACK libraries it was built against).
np.show_config()

openblas64__info:
    library_dirs = ['D:\\a\\numpy\\numpy\\build\\openblas64__info']
    libraries = ['openblas64__info']
    language = f77
    define_macros = [('HAVE_CBLAS', None), ('BLAS_SYMBOL_SUFFIX', '64_'), ('HAVE_BLAS_ILP64', None)]
blas_ilp64_opt_info:
    library_dirs = ['D:\\a\\numpy\\numpy\\build\\openblas64__info']
    libraries = ['openblas64__info']
    language = f77
    define_macros = [('HAVE_CBLAS', None), ('BLAS_SYMBOL_SUFFIX', '64_'), ('HAVE_BLAS_ILP64', None)]
openblas64__lapack_info:
    library_dirs = ['D:\\a\\numpy\\numpy\\build\\openblas64__lapack_info']
    libraries = ['openblas64__lapack_info']
    language = f77
    define_macros = [('HAVE_CBLAS', None), ('BLAS_SYMBOL_SUFFIX', '64_'), ('HAVE_BLAS_ILP64', None), ('HAVE_LAPACKE', None)]
lapack_ilp64_opt_info:
    library_dirs = ['D:\\a\\numpy\\numpy\\build\\openblas64__lapack_info']
    libraries = ['openblas64__lapack_info']
    language = f77
    define_macros = [('HAVE_CBLAS', None), ('BLAS_SYMBOL_SUFFI

In [33]:
# inspect_cfg() maps each compiled signature to its control-flow graph;
# print every signature followed by its graph.
for signature, cfg in matrix_multiplication.inspect_cfg().items():
  print(signature, cfg)

(array(float64, 2d, C), array(float64, 2d, C)) digraph "CFG for '_ZN8__main__21matrix_multiplicationB3v43B38c8tJTIcFHzwl2ILiXkcBV0KBSgP9CGZpAgA_3dE5ArrayIdLi2E1C7mutable7alignedE5ArrayIdLi2E1C7mutable7alignedE' function" {
	label="CFG for '_ZN8__main__21matrix_multiplicationB3v43B38c8tJTIcFHzwl2ILiXkcBV0KBSgP9CGZpAgA_3dE5ArrayIdLi2E1C7mutable7alignedE5ArrayIdLi2E1C7mutable7alignedE' function";

	Node0x1f3df3829f0 [shape=record,label="{entry:\l  %0 = or i64 %arg.B.5.1, %arg.A.5.0\l  %.not.i = icmp sgt i64 %0, -1\l  br i1 %.not.i, label %B0.endif.endif.endif.endif.i.i, label %B0.if, !prof !0\l|{<s0>T|<s1>F}}"];
	Node0x1f3df3829f0:s0 -> Node0x1f3df387490;
	Node0x1f3df3829f0:s1 -> Node0x1f3df382390;
	Node0x1f3df387490 [shape=record,label="{B0.endif.endif.endif.endif.i.i:                   \l  %.50.i.i = tail call \{ i64, i1 \} @llvm.smul.with.overflow.i64(i64\l... %arg.A.5.0, i64 %arg.B.5.1)\l  %.51.i.i = extractvalue \{ i64, i1 \} %.50.i.i, 0\l  %.52.i.i = extractvalue \{ i64, i1 \} %.50.

In [14]:
import numpy as np
from numba import njit, prange

@njit()
def matrix_multiplication(A, B):
  """Jitted matrix product, loop order i (rows) / j (shared) / k (columns).

  Returns a new float64 array with one row per row of ``A`` and one column per
  column of ``B``.
  """
  rows, shared = A.shape
  _, cols = B.shape
  out = np.zeros((rows, cols))
  for r in range(rows):
    for s in range(shared):
      # A[r, s] is the same for every step of the innermost loop.
      a_rs = A[r, s]
      for c in range(cols):
        out[r, c] += a_rs * B[s, c]
  return out

@njit()
def matrix_multiplication2(A, B):
  """Jitted matrix product, loop order j (shared) / i (rows) / k (columns).

  Same arithmetic as the i/j/k variant with the two outer loops swapped; the
  innermost loop still runs along a row of ``B`` and a row of the result.
  """
  rows, shared = A.shape
  _, cols = B.shape
  out = np.zeros((rows, cols))
  for s in range(shared):
    for r in range(rows):
      # A[r, s] is the same for every step of the innermost loop.
      a_rs = A[r, s]
      for c in range(cols):
        out[r, c] += a_rs * B[s, c]
  return out

@njit()
def matrix_multiplication3(A, B):
  """Jitted matrix product, loop order i (rows) / k (columns) / j (shared).

  The innermost loop finishes one output entry at a time by running over the
  shared dimension; the result is a new float64 array.
  """
  rows, shared = A.shape
  _, cols = B.shape
  out = np.zeros((rows, cols))
  for r in range(rows):
    for c in range(cols):
      for s in range(shared):
        out[r, c] = out[r, c] + A[r, s] * B[s, c]
  return out

# Small 100 x 100 problem: left operands are (m, n), right operands are (n, p).
m, n, p = 100, 100, 100

# Standard-normal float operands.
A, B = np.random.randn(m, n), np.random.randn(n, p)
# Integer operands drawn uniformly from [1, 100).
A2 = np.random.randint(low=1, high=100, size=(m, n))
B2 = np.random.randint(low=1, high=100, size=(n, p))
# All-ones float operands.
A3, B3 = np.ones(shape=(m, n)), np.ones(shape=(n, p))

# compile function
matrix_multiplication(A, B)
matrix_multiplication2(A, B)
matrix_multiplication3(A, B)

print('normal')
%timeit matrix_multiplication(A, B)
%timeit matrix_multiplication(A2, B2)
%timeit matrix_multiplication(A3, B3)
print('normal')
%timeit matrix_multiplication2(A, B)
%timeit matrix_multiplication2(A2, B2)
%timeit matrix_multiplication2(A3, B3)
print('normal')
%timeit matrix_multiplication3(A, B)
%timeit matrix_multiplication3(A2, B2)
%timeit matrix_multiplication3(A3, B3)
print('numpy')
%timeit A @ B
%timeit A2 @ B2
%timeit A3 @ B3

normal
1.18 ms ± 61.7 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
1.36 ms ± 76.6 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
1.16 ms ± 6.03 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
normal
1.18 ms ± 8.98 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
1.48 ms ± 8.46 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
1.17 ms ± 7.54 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
normal
1.3 ms ± 23.4 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
1.38 ms ± 37.1 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
1.3 ms ± 15.7 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
numpy
231 µs ± 4.42 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
1.56 ms ± 121 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
293 µs ± 51 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
