# Benchmark of Element Wise Matrix Multiplications
On this benchmark we compare several operations using numpy, numexpr, numba (CPU&GPU) and PyTorch GPU.

In [1]:
import sys
import os
import numpy as np
import numexpr as ne
from numba import vectorize
from numba.cuda.cudadrv.error import CudaDriverError
import math
from functools import reduce
import pandas as pd
import torch
from utils import (get_number_processors, get_ram_memory, get_total_gpu_memory, 
                   get_gpu_name, get_cuda_version, get_cudnn_version, AttributeDict,
                   get_object_size, clear_memory_all_gpus)

print("System version: {}".format(sys.version))
print("Numpy version: {}".format(np.__version__))
print("Pandas version: {}".format(pd.__version__))
print("Numexpr version: {}".format(ne.__version__))
print("PyTorch version: {}".format(torch.__version__))
print("BLAS info:") 
print(np.show_config())

%load_ext autoreload
%autoreload 2

System version: 3.6.7 |Anaconda, Inc.| (default, Oct 23 2018, 19:16:44) 
[GCC 7.3.0]
Numpy version: 1.16.0
Pandas version: 0.23.4
Numexpr version: 2.6.9
PyTorch version: 1.0.0
BLAS info:
blas_mkl_info:
  NOT AVAILABLE
blis_info:
  NOT AVAILABLE
openblas_info:
    libraries = ['openblas', 'openblas']
    library_dirs = ['/usr/local/lib']
    language = c
    define_macros = [('HAVE_CBLAS', None)]
blas_opt_info:
    libraries = ['openblas', 'openblas']
    library_dirs = ['/usr/local/lib']
    language = c
    define_macros = [('HAVE_CBLAS', None)]
lapack_mkl_info:
  NOT AVAILABLE
openblas_lapack_info:
    libraries = ['openblas', 'openblas']
    library_dirs = ['/usr/local/lib']
    language = c
    define_macros = [('HAVE_CBLAS', None)]
lapack_opt_info:
    libraries = ['openblas', 'openblas']
    library_dirs = ['/usr/local/lib']
    language = c
    define_macros = [('HAVE_CBLAS', None)]
None


## Helper functions for numpy

In [2]:
def multiply(a,b):
    return a*b

def exponential(a, b):
    return a*np.exp(b)

def sine(a, b):
    return a*np.sin(b)

# A general function that multiplies an arbitrary number of matrices
# is 28% slower than directly multiplying the factors.
# The function multiply_list is not used, just leaving it here for reference
def multiply_list(l):
    return reduce(lambda x, y: x*y, l) 

def multiply3(a, b, c):
    return a*b*c

def multiply5(a, b, c, d, e):
    return a*b*c*d*e

def exponential_sine(a, b, c):
    return a*np.exp(b)*np.sin(c)


## Helper functions for numexpr

In [3]:
def ne_multiply(a,b):
    return ne.evaluate("a*b")

def ne_exponential(a, b):
    return ne.evaluate("a*exp(b)")

def ne_sine(a, b):
    return ne.evaluate("a*sin(b)")

def ne_multiply3(a, b, c):
    return ne.evaluate("a*b*c")

def ne_multiply5(a, b, c, d, e):
    return ne.evaluate("a*b*c*d*e")

def ne_exponential_sine(a, b, c):
    return ne.evaluate("a*exp(b)*sin(c)")



## Helper functions for numba
NOTE: For numba solutions, having a solution empty vector speeds up around 10%
```
r0 = np.empty((S1, S2), dtype=np.int16)
r0 = multicpu(a, b)
```
source: https://devblogs.nvidia.com/numba-python-cuda-acceleration/

In [4]:
@vectorize(["int16(int16, int16)","float32(float32, float32)"], target="cpu")
def multcpu(a, b):
    return a * b

@vectorize(["int16(int16, int16)","float32(float32, float32)"], target="parallel")
def multparal(a, b):
    return a * b

@vectorize(["int16(int16, int16)","float32(float32, float32)"], target="cuda")
def multcuda(a, b):
    return a * b

@vectorize(["float32(float32, float32)"], target="cpu")
def expcpu(a, b):
    return a*math.exp(b)

@vectorize(["float32(float32, float32)"], target="parallel")
def expparal(a, b):
    return a*math.exp(b)

@vectorize(["float32(float32, float32)"], target="cuda")
def expcuda(a, b):
    return a*math.exp(b)

@vectorize(["float32(float32, float32)"], target="cpu")
def sincpu(a, b):
    return a*math.sin(b)

@vectorize(["float32(float32, float32)"], target="parallel")
def sinparal(a, b):
    return a*math.sin(b)

@vectorize(["float32(float32, float32)"], target="cuda")
def sincuda(a, b):
    return a*math.sin(b)

@vectorize(["float32(float32, float32, float32)"], target="cpu")
def multfcpu3(a, b, c):
    return a * b * c

@vectorize(["float32(float32, float32, float32)"], target="parallel")
def multfparal3(a, b, c):
    return a * b * c

@vectorize(["float32(float32, float32, float32)"], target="cuda")
def multfcuda3(a, b, c):
    return a * b * c

@vectorize(["float32(float32, float32, float32, float32, float32)"], target="cpu")
def multfcpu5(a, b, c, d, e):
    return a * b * c * d * e

@vectorize(["float32(float32, float32, float32, float32, float32)"], target="parallel")
def multfparal5(a, b, c, d, e):
    return a * b * c * d * e

@vectorize(["float32(float32, float32, float32, float32, float32)"], target="cuda")
def multfcuda5(a, b, c, d, e):
    return a * b * c * d * e

@vectorize(["float32(float32, float32, float32)"], target="cpu")
def expsincpu(a, b, c):
    return a*math.exp(b)*math.sin(c)

@vectorize(["float32(float32, float32, float32)"], target="parallel")
def expsinparal(a, b, c):
    return a*math.exp(b)*math.sin(c)

@vectorize(["float32(float32, float32, float32)"], target="cuda")
def expsincuda(a, b, c):
    return a*math.exp(b)*math.sin(c)



## Helper functions for PyTorch

*Note on performance*: 

`torch.as_tensor(a)` does not make a copy of a on CPU. Adding `.cuda()` copies the array to GPU memory.

More info: https://pytorch.org/docs/stable/tensors.html

In [5]:
def pt_multiply(a,b):
    at = torch.as_tensor(a).cuda() 
    bt = torch.as_tensor(b).cuda()
    return at*bt

def pt_exponential(a, b):
    at = torch.as_tensor(a).cuda() 
    bt = torch.as_tensor(b).cuda()
    return at*torch.exp(bt)

def pt_sine(a, b):
    at = torch.as_tensor(a).cuda() 
    bt = torch.as_tensor(b).cuda()
    return at*torch.sin(bt)

def pt_multiply3(a, b, c):
    at = torch.as_tensor(a).cuda() 
    bt = torch.as_tensor(b).cuda()
    ct = torch.as_tensor(c).cuda()
    return at*bt*ct

def pt_multiply5(a, b, c, d, e):
    at = torch.as_tensor(a).cuda() 
    bt = torch.as_tensor(b).cuda()
    ct = torch.as_tensor(c).cuda()
    dt = torch.as_tensor(d).cuda()
    et = torch.as_tensor(e).cuda()
    return at*bt*ct*dt*et

def pt_exponential_sine(a, b, c):
    at = torch.as_tensor(a).cuda() 
    bt = torch.as_tensor(b).cuda()
    ct = torch.as_tensor(c).cuda()
    return at*torch.exp(bt)*torch.sin(ct)


## Parameters

In [6]:
size_combinations=[
    (100, 100),
    (1000, 1000),
    (10000, 10000),
    (100000, 10000),
    (100000, 100000)
]

In [7]:
columns = ["n_processors",
           "cpu_memory",
           "gpu_name",
           "gpu_memory",
           "data_type",
           "size1",
           "size2",
           "operation",
           "numpy",
           "numexpr",
           "numba_cpu",
           "numba_paral",
           "numba_gpu",
           "pytorch"]

In [8]:
n_processors = get_number_processors()
cpu_memory = get_ram_memory(units="Gb")
gpu_name = get_gpu_name()[0]
gpu_memory = get_total_gpu_memory(units="Gb")[0]
header = [n_processors, cpu_memory, gpu_name, gpu_memory]

In [9]:
filebase = gpu_name.replace(" ", "-")
filebase

'Tesla-V100-PCIE-16GB'

In [10]:
folder = "data"
os.makedirs(folder, exist_ok=True)

## Data

In [11]:
def factors_int(s1=100, s2=100):
    a = np.random.randint(1, 5, (s1, s2), dtype=np.int16)
    b = np.random.randint(1, 10, (s1, s2), dtype=np.int16)
    return a, b

def factors_float(s1=100, s2=100):
    a = np.random.randn(s1, s2).astype(np.float32)
    b = np.random.randn(s1, s2).astype(np.float32)
    return a, b

def factors_float3(s1=100, s2=100):
    a = np.random.randn(s1, s2).astype(np.float32)
    b = np.random.randn(s1, s2).astype(np.float32)
    c = np.random.uniform(low=0, high=10, size=(s1,s2)).astype(np.float32)
    return a, b, c

def factors_float5(s1=100, s2=100):
    a = np.random.randn(s1, s2).astype(np.float32)
    b = np.random.randn(s1, s2).astype(np.float32)
    c = np.random.uniform(low=0, high=10, size=(s1,s2)).astype(np.float32)
    d = np.random.uniform(low=5, high=15, size=(s1,s2)).astype(np.float32)
    e = np.random.uniform(low=0, high=30, size=(s1,s2)).astype(np.float32)
    return a, b, c, d, e

Checking data sizes in Gb

In [23]:
a, _ = factors_int(size_combinations[-1][0], size_combinations[-1][1])
print(get_object_size(a, units="Gb"))
a, _ = factors_float(size_combinations[-1][0], size_combinations[-1][1])
print(get_object_size(a, units="Gb"))

18.6264515966177
37.25290308892727


## Benchmark

#### Integer matrix multiplication

In [24]:
df = pd.DataFrame(columns=columns)
operation = "a*b"
for s1, s2 in size_combinations:
    a, b = factors_int(s1, s2)
    r1 = %timeit -o multiply(a,b)
    r2 = %timeit -o ne_multiply(a,b)
    r3 = %timeit -o multcpu(a,b)
    r4 = %timeit -o multparal(a,b)
    try:
        r5 = %timeit -o multcuda(a,b)
        clear_memory_all_gpus()
    except CudaDriverError: # in case of Out Of Memory (OOM)
        r5 = AttributeDict()
        r5["average"] = "OOM"
        print("OOM for size ({},{})".format(s1, s2))
    try:
        r6 = %timeit -o pt_multiply(a,b)
        clear_memory_all_gpus()
    except RuntimeError: # in case of Out Of Memory (OOM)
        r6 = AttributeDict()
        r6["average"] = "OOM"
        print("OOM for size ({},{})".format(s1, s2))
    print("")
    row = header + [type(a[0,0]), s1, s2, operation, r1.average, r2.average, r3.average, r4.average, r5.average, r6.average]
    df.loc[len(df)] = row


2.04 µs ± 27.3 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
464 µs ± 17.4 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
1.9 µs ± 2.04 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)
20.7 µs ± 111 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
1.37 ms ± 66 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)
87.8 µs ± 18.2 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)

215 µs ± 2.78 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
583 µs ± 13.5 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
215 µs ± 2.97 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
65.5 µs ± 532 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
4.63 ms ± 17.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
590 µs ± 1.2 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)

91.4 ms ± 3.37 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
21 ms ± 396 µs per loop (mean ± std. dev. of 7 runs, 10 loops ea

In [25]:
filename = filebase + "_" + operation + "_int" + ".csv"
df.to_csv(os.path.join(folder, filename), index=False)
df

Unnamed: 0,n_processors,cpu_memory,gpu_name,gpu_memory,data_type,size1,size2,operation,numpy,numexpr,numba_cpu,numba_paral,numba_gpu,pytorch
0,24,440.909752,Tesla V100-PCIE-16GB,15.781738,<class 'numpy.int16'>,100,100,a*b,2e-06,0.000464,2e-06,2.1e-05,0.00137052,8.77816e-05
1,24,440.909752,Tesla V100-PCIE-16GB,15.781738,<class 'numpy.int16'>,1000,1000,a*b,0.000215,0.000583,0.000215,6.5e-05,0.00463451,0.000590458
2,24,440.909752,Tesla V100-PCIE-16GB,15.781738,<class 'numpy.int16'>,10000,10000,a*b,0.091427,0.021019,0.088133,0.014701,0.150002,0.0537218
3,24,440.909752,Tesla V100-PCIE-16GB,15.781738,<class 'numpy.int16'>,100000,10000,a*b,0.894855,0.163243,0.932832,0.115164,1.44249,0.520506
4,24,440.909752,Tesla V100-PCIE-16GB,15.781738,<class 'numpy.int16'>,100000,100000,a*b,8.982922,1.75122,8.841003,1.366295,OOM,OOM


#### Float matrix multiplication

In [26]:
df = pd.DataFrame(columns=columns)
operation = "a*b"
for s1, s2 in size_combinations:
    a, b = factors_float(s1, s2)
    r1 = %timeit -o multiply(a,b)
    r2 = %timeit -o ne_multiply(a,b)
    r3 = %timeit -o multcpu(a,b)
    r4 = %timeit -o multparal(a,b)
    try:
        r5 = %timeit -o multcuda(a,b)
        clear_memory_all_gpus()
    except: # in case of Out Of Memory (OOM)
        r5 = AttributeDict()
        r5["average"] = "OOM"
        print("OOM for size ({},{})".format(s1, s2))
    try:
        r6 = %timeit -o pt_multiply(a,b)
        clear_memory_all_gpus()
    except RuntimeError: # in case of Out Of Memory (OOM)
        r6 = AttributeDict()
        r6["average"] = "OOM"
        print("OOM for size ({},{})".format(s1, s2))
    print("")
    row = header + [type(a[0,0]), s1, s2, operation, r1.average, r2.average, r3.average, r4.average, r5.average, r6.average]
    df.loc[len(df)] = row

2.99 µs ± 22.9 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
455 µs ± 13.5 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
3.08 µs ± 5.05 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
21.7 µs ± 469 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
1.47 ms ± 3.31 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
84.9 µs ± 262 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)

399 µs ± 3.8 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
551 µs ± 8.11 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
428 µs ± 2.23 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
92.8 µs ± 2.24 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
5.23 ms ± 117 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
952 µs ± 1.67 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)

175 ms ± 1.29 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
31.5 ms ± 385 µs per loop (mean ± std. dev. of 7 runs, 

In [27]:
filename = filebase + "_" + operation + "_float" + ".csv"
df.to_csv(os.path.join(folder, filename), index=False)
df

Unnamed: 0,n_processors,cpu_memory,gpu_name,gpu_memory,data_type,size1,size2,operation,numpy,numexpr,numba_cpu,numba_paral,numba_gpu,pytorch
0,24,440.909752,Tesla V100-PCIE-16GB,15.781738,<class 'numpy.float32'>,100,100,a*b,3e-06,0.000455,3e-06,2.2e-05,0.00146891,8.48509e-05
1,24,440.909752,Tesla V100-PCIE-16GB,15.781738,<class 'numpy.float32'>,1000,1000,a*b,0.000399,0.000551,0.000428,9.3e-05,0.00522768,0.00095236
2,24,440.909752,Tesla V100-PCIE-16GB,15.781738,<class 'numpy.float32'>,10000,10000,a*b,0.175345,0.031546,0.175785,0.030825,0.283723,0.100704
3,24,440.909752,Tesla V100-PCIE-16GB,15.781738,<class 'numpy.float32'>,100000,10000,a*b,1.735445,0.294421,1.740948,0.289153,2.82843,0.9301
4,24,440.909752,Tesla V100-PCIE-16GB,15.781738,<class 'numpy.float32'>,100000,100000,a*b,17.455325,2.847748,17.439472,2.878136,OOM,OOM


#### Exponential matrix multiplication

In [28]:
df = pd.DataFrame(columns=columns)
operation = "a*exp(b)"
for s1, s2 in size_combinations:
    a, b = factors_float(s1, s2)
    r1 = %timeit -o exponential(a,b)
    r2 = %timeit -o ne_exponential(a,b)
    r3 = %timeit -o expcpu(a,b)
    r4 = %timeit -o expparal(a,b)
    try:
        r5 = %timeit -o expcuda(a,b)
        clear_memory_all_gpus()
    except: # in case of Out Of Memory (OOM)
        r5 = AttributeDict()
        r5["average"] = "OOM"
        print("OOM for size ({},{})".format(s1, s2)) 
    try:
        r6 = %timeit -o pt_exponential(a,b)
        clear_memory_all_gpus()
    except RuntimeError: # in case of Out Of Memory (OOM)
        r6 = AttributeDict()
        r6["average"] = "OOM"
        print("OOM for size ({},{})".format(s1, s2))
    print("")
    row = header + [type(a[0,0]), s1, s2, operation, r1.average, r2.average, r3.average, r4.average, r5.average, r6.average]
    df.loc[len(df)] = row

89.2 µs ± 214 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
463 µs ± 12.2 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
106 µs ± 98 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
31.2 µs ± 600 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
1.46 ms ± 3.63 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
103 µs ± 298 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)

8.71 ms ± 5.59 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
844 µs ± 43.8 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
10.5 ms ± 43.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
523 µs ± 13.1 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
5.16 ms ± 32.6 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
969 µs ± 1.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)

1 s ± 2.54 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
47.6 ms ± 514 µs per loop (mean ± std. dev. of 7 runs, 10 loops each

In [29]:
filename = filebase + "_" + operation + ".csv"
df.to_csv(os.path.join(folder, filename), index=False)
df

Unnamed: 0,n_processors,cpu_memory,gpu_name,gpu_memory,data_type,size1,size2,operation,numpy,numexpr,numba_cpu,numba_paral,numba_gpu,pytorch
0,24,440.909752,Tesla V100-PCIE-16GB,15.781738,<class 'numpy.float32'>,100,100,a*exp(b),8.9e-05,0.000463,0.000106,3.1e-05,0.00146372,0.000102519
1,24,440.909752,Tesla V100-PCIE-16GB,15.781738,<class 'numpy.float32'>,1000,1000,a*exp(b),0.008707,0.000844,0.010516,0.000523,0.00516015,0.000969088
2,24,440.909752,Tesla V100-PCIE-16GB,15.781738,<class 'numpy.float32'>,10000,10000,a*exp(b),1.001797,0.047643,1.145451,0.051389,0.286889,0.102657
3,24,440.909752,Tesla V100-PCIE-16GB,15.781738,<class 'numpy.float32'>,100000,10000,a*exp(b),10.033744,0.452116,11.474779,0.505501,2.83925,OOM
4,24,440.909752,Tesla V100-PCIE-16GB,15.781738,<class 'numpy.float32'>,100000,100000,a*exp(b),100.236637,4.384262,114.725856,5.042384,OOM,OOM


#### Sine matrix multiplication

In [30]:
df = pd.DataFrame(columns=columns)
operation = "a*sin(b)"
for s1, s2 in size_combinations:
    a, b = factors_float(s1, s2)
    r1 = %timeit -o sine(a,b)
    r2 = %timeit -o ne_sine(a,b)
    r3 = %timeit -o sincpu(a,b)
    r4 = %timeit -o sinparal(a,b)
    try:
        r5 = %timeit -o sincuda(a,b)
        clear_memory_all_gpus()
    except: # in case of Out Of Memory (OOM)
        r5 = AttributeDict()
        r5["average"] = "OOM"
        print("OOM for size ({},{})".format(s1, s2))        
    try:
        r6 = %timeit -o pt_sine(a,b)
        clear_memory_all_gpus()
    except RuntimeError: # in case of Out Of Memory (OOM)
        r6 = AttributeDict()
        r6["average"] = "OOM"
        print("OOM for size ({},{})".format(s1, s2))
    print("")
    row = header + [type(a[0,0]), s1, s2, operation, r1.average, r2.average, r3.average, r4.average, r5.average, r6.average]
    df.loc[len(df)] = row

83.2 µs ± 165 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
471 µs ± 9.66 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
99 µs ± 768 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
30.7 µs ± 651 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
1.45 ms ± 4.18 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
101 µs ± 901 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)

8.51 ms ± 1.83 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
800 µs ± 10.9 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
10.8 ms ± 14.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
533 µs ± 9.21 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
5.18 ms ± 60 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
966 µs ± 3.29 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)

981 ms ± 1.03 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
45.7 ms ± 457 µs per loop (mean ± std. dev. of 7 runs, 10 loops ea

In [31]:
filename = filebase + "_" + operation + ".csv"
df.to_csv(os.path.join(folder, filename), index=False)
df

Unnamed: 0,n_processors,cpu_memory,gpu_name,gpu_memory,data_type,size1,size2,operation,numpy,numexpr,numba_cpu,numba_paral,numba_gpu,pytorch
0,24,440.909752,Tesla V100-PCIE-16GB,15.781738,<class 'numpy.float32'>,100,100,a*sin(b),8.3e-05,0.000471,9.9e-05,3.1e-05,0.00145325,0.000100636
1,24,440.909752,Tesla V100-PCIE-16GB,15.781738,<class 'numpy.float32'>,1000,1000,a*sin(b),0.00851,0.0008,0.01076,0.000533,0.00518061,0.000966255
2,24,440.909752,Tesla V100-PCIE-16GB,15.781738,<class 'numpy.float32'>,10000,10000,a*sin(b),0.981292,0.045746,1.174466,0.051767,0.286078,0.100841
3,24,440.909752,Tesla V100-PCIE-16GB,15.781738,<class 'numpy.float32'>,100000,10000,a*sin(b),9.929282,0.427063,11.722498,0.509446,2.82914,OOM
4,24,440.909752,Tesla V100-PCIE-16GB,15.781738,<class 'numpy.float32'>,100000,100000,a*sin(b),98.50451,4.186946,117.061011,5.078236,OOM,OOM


#### Multiple matrix multiplication (3 factors)

In [None]:
df = pd.DataFrame(columns=columns)
operation = "a*b*c"
for s1, s2 in size_combinations:
    a, b, c = factors_float3(s1, s2)
    r1 = %timeit -o multiply3(a,b,c)
    r2 = %timeit -o ne_multiply3(a,b,c)
    r3 = %timeit -o multfcpu3(a,b,c)
    r4 = %timeit -o multfparal3(a,b,c)
    try:
        r5 = %timeit -o multfcuda3(a,b,c)
        clear_memory_all_gpus()
    except: # in case of Out Of Memory (OOM)
        r5 = AttributeDict()
        r5["average"] = "OOM"
        print("OOM for size ({},{})".format(s1, s2))
    try:
        r6 = %timeit -o pt_multiply3(a,b,c)
        clear_memory_all_gpus()
    except RuntimeError: # in case of Out Of Memory (OOM)
        r6 = AttributeDict()
        r6["average"] = "OOM"
        print("OOM for size ({},{})".format(s1, s2))
    print("")
    row = header + [type(a[0,0]), s1, s2, operation, r1.average, r2.average, r3.average, r4.average, r5.average, r6.average]
    df.loc[len(df)] = row

In [None]:
filename = filebase + "_" + operation + ".csv"
df.to_csv(os.path.join(folder, filename), index=False)
df

#### Multiple matrix multiplication (5 factors)

In [None]:
df = pd.DataFrame(columns=columns)
operation = "a*b*c*d*e"
for s1, s2 in size_combinations:
    a, b, c, d, e = factors_float5(s1, s2)
    r1 = %timeit -o multiply5(a,b,c,d,e)
    r2 = %timeit -o ne_multiply5(a,b,c,d,e)
    r3 = %timeit -o multfcpu5(a,b,c,d,e)
    r4 = %timeit -o multfparal5(a,b,c,d,e)
    try:
        r5 = %timeit -o multfcuda5(a,b,c,d,e)
        clear_memory_all_gpus()
    except: # in case of Out Of Memory (OOM)
        r5 = AttributeDict()
        r5["average"] = "OOM"
        print("OOM for size ({},{})".format(s1, s2))
    try:
        r6 = %timeit -o pt_multiply5(a,b,c,d,e)
        clear_memory_all_gpus()
    except RuntimeError: # in case of Out Of Memory (OOM)
        r6 = AttributeDict()
        r6["average"] = "OOM"
        print("OOM for size ({},{})".format(s1, s2))
    print("")
    row = header + [type(a[0,0]), s1, s2, operation, r1.average, r2.average, r3.average, r4.average, r5.average, r6.average]
    df.loc[len(df)] = row

In [None]:
filename = filebase + "_" + operation + ".csv"
df.to_csv(os.path.join(folder, filename), index=False)
df

#### Exponential sine matrix multiplication

In [12]:
df = pd.DataFrame(columns=columns)
operation = "a*exp(b)*sin(c)"
for s1, s2 in size_combinations:
    a, b, c = factors_float3(s1, s2)
    r1 = %timeit -o exponential_sine(a,b,c)
    r2 = %timeit -o ne_exponential_sine(a,b,c)
    r3 = %timeit -o expsincpu(a,b,c)
    r4 = %timeit -o expsinparal(a,b,c)
    try:
        r5 = %timeit -o expsincuda(a,b,c)
        clear_memory_all_gpus()
    except: # in case of Out Of Memory (OOM)
        r5 = AttributeDict()
        r5["average"] = "OOM"
        print("OOM for size ({},{})".format(s1, s2))        
    try:
        r6 = %timeit -o pt_exponential_sine(a,b,c)
        clear_memory_all_gpus()
    except RuntimeError: # in case of Out Of Memory (OOM)
        r6 = AttributeDict()
        r6["average"] = "OOM"
        print("OOM for size ({},{})".format(s1, s2))
    print("")
    row = header + [type(a[0,0]), s1, s2, operation, r1.average, r2.average, r3.average, r4.average, r5.average, r6.average]
    df.loc[len(df)] = row

215 µs ± 740 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
477 µs ± 15 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
247 µs ± 2.07 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
44.9 µs ± 917 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
1.99 ms ± 12 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
174 µs ± 13.3 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)

21.4 ms ± 34.9 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
1.37 ms ± 14.1 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
24.5 ms ± 41 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
1.16 ms ± 22.2 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
7.35 ms ± 286 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
1.46 ms ± 5.4 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)

2.38 s ± 2.54 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
100 ms ± 1.05 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
2.58 s

In [13]:
filename = filebase + "_" + operation + ".csv"
df.to_csv(os.path.join(folder, filename), index=False)
df

Unnamed: 0,n_processors,cpu_memory,gpu_name,gpu_memory,data_type,size1,size2,operation,numpy,numexpr,numba_cpu,numba_paral,numba_gpu,pytorch
0,24,440.909752,Tesla V100-PCIE-16GB,15.781738,<class 'numpy.float32'>,100,100,a*exp(b)*sin(c),0.000215,0.000477,0.000247,4.5e-05,0.00198992,0.00017382
1,24,440.909752,Tesla V100-PCIE-16GB,15.781738,<class 'numpy.float32'>,1000,1000,a*exp(b)*sin(c),0.021384,0.001372,0.024456,0.001157,0.00735039,0.00146446
2,24,440.909752,Tesla V100-PCIE-16GB,15.781738,<class 'numpy.float32'>,10000,10000,a*exp(b)*sin(c),2.383109,0.100398,2.577221,0.112853,0.355643,0.149033
3,24,440.909752,Tesla V100-PCIE-16GB,15.781738,<class 'numpy.float32'>,100000,10000,a*exp(b)*sin(c),23.818136,1.019712,25.803186,1.120956,OOM,OOM
4,24,440.909752,Tesla V100-PCIE-16GB,15.781738,<class 'numpy.float32'>,100000,100000,a*exp(b)*sin(c),238.690387,9.864431,258.552602,11.205123,OOM,OOM
