# Benchmarking NumPy and CuPy

* CuPy is an open-source library for GPU-accelerated computing with Python. It implements a large, highly compatible subset of the NumPy and SciPy APIs, so it can often serve as a drop-in replacement for running NumPy/SciPy code on a GPU.
* The "Cu" in CuPy stands for NVIDIA's CUDA framework, which allows programs to perform general-purpose calculations on a GPU.

In [2]:
import numpy as np
import time

def generate_matrices(m: int, k: int, n: int, seed=None):
    """Create two random float32 matrices whose product is defined.

    Args:
        m: Number of rows of the first matrix.
        k: Number of columns of the first matrix and rows of the second.
        n: Number of columns of the second matrix.
        seed: Optional seed passed to ``np.random.default_rng``. Supply an
            integer to obtain identical matrices on every run; the default
            ``None`` draws fresh entropy each call.

    Returns:
        A tuple ``(A, B)`` where ``A`` has shape ``(m, k)`` and ``B`` has
        shape ``(k, n)``. Both are float32 with values in ``[-1, 1)``.
    """
    rng = np.random.default_rng(seed)

    def _uniform(shape):
        # Sample float32 directly and rescale in place. Drawing float64 and
        # casting afterwards would briefly need twice the memory, which
        # matters for the multi-gigabyte matrices used in this benchmark.
        values = rng.random(size=shape, dtype=np.float32)  # [0, 1)
        values *= 2.0
        values -= 1.0  # now [-1, 1)
        return values

    return (_uniform((m, k)), _uniform((k, n)))

# Matrix dimensions: the first operand is (m, k), the second is (k, n)
m, k, n = 37000, 23000, 18000

# Arbitrary constants mixed into the element-wise steps purely to make the
# calculation heavier
sqrt2, pi, e = 1.414, 3.141, 2.718

In [3]:
# Build the two operands once and show their shapes, one per line
A, B = generate_matrices(m=m, k=k, n=n)
print(A.shape, B.shape, sep='\n')

(37000, 23000)
(23000, 18000)


## NumPy version

In [4]:
import cpuinfo
# Report the CPU model (this cell queries the CPU, not the GPU).
print('Checking CPU info...')
cpu_info = cpuinfo.get_cpu_info()
# py-cpuinfo 5.0 renamed the 'brand' key to 'brand_raw'; accept either one so
# the cell works with both old and new releases instead of raising KeyError.
print(cpu_info.get('brand_raw') or cpu_info.get('brand', 'Unknown CPU'))

Checking GPU info...


Intel(R) Core(TM) i5-7500 CPU @ 3.40GHz


In [5]:
# Work on copies so that A and B stay unmodified for later cells.
# The copies are made before the timer starts and are not measured.
np_A = A.copy()
np_B = B.copy()

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() is a wall clock that can jump if the system time is
# adjusted. Only the difference t1 - t0 is meaningful.
t0 = time.perf_counter()
np_A = np.log(np_A + pi)
np_B = (np_B + sqrt2) ** e
np_C = np.dot(np_A, np_B)
# Column-wise mean of the product, then scale the vector to unit L2 norm
np_C_mean = np.mean(np_C, axis=0)
np_C_norm = np_C_mean / np.linalg.norm(np_C_mean)

t1 = time.perf_counter()


In [6]:
# Result length and a preview of the values, one per line
print(len(np_C_norm), np_C_norm, sep='\n')
# Elapsed time of the NumPy calculation
print(f'{(t1 - t0):,.03f}sec')

18000
[0.00754171 0.00748009 0.00756145 ... 0.0073882  0.00749193 0.0074358 ]
78.821sec


## CuPy version

In [7]:
import cupy as cp

# Report the current CUDA device; stop immediately when no GPU is usable.
print('Checking GPU info...')
if not cp.cuda.is_available():
    raise RuntimeError('GPU is not available')

# Look up the properties of the currently selected device
device_id = cp.cuda.runtime.getDevice()
device_properties = cp.cuda.runtime.getDeviceProperties(device_id)
# 'name' is decoded from bytes; total memory is converted from bytes to GiB
print(f"Model:  {device_properties['name'].decode()}")
print(f"Memory: {device_properties['totalGlobalMem']/(2**30):.1f}GB")

Checking GPU info...
Model:  NVIDIA GeForce RTX 3060
Memory: 11.8GB


In [8]:
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; only differences between the timestamps are meaningful.
t0 = time.perf_counter()
# Move data from main memory to GPU memory
cp_A = cp.asarray(A)
cp_B = cp.asarray(B)
# GPU work is queued asynchronously. Wait for the device before reading the
# clock so that each phase is charged only for its own work.
cp.cuda.Stream.null.synchronize()

t1 = time.perf_counter()
# Calculation
cp_A = cp.log(cp_A + pi)
cp_B = (cp_B + sqrt2) ** e
cp_C = cp.dot(cp_A, cp_B)
# Use cp.mean explicitly rather than relying on np.mean dispatching to CuPy
cp_C_mean = cp.mean(cp_C, axis=0)
cp_C_norm = cp_C_mean / cp.linalg.norm(cp_C_mean)
# Make sure the kernels have finished before stopping the calculation timer
cp.cuda.Stream.null.synchronize()

t2 = time.perf_counter()
# Move data from GPU memory back to CPU memory
# Note that the result from GPU has to be stored back to NumPy array,
# so the variable is named np_C_norm2 instead of cp_C_norm
np_C_norm2 = cp.asnumpy(cp_C_norm)

t3 = time.perf_counter()

In [9]:
# Result length and a preview of the values, one per line
print(len(np_C_norm2), np_C_norm2, sep='\n')
# Per-phase timings followed by the overall total
print(
    f'Send data to GPU:       {(t1 - t0):,.3f}sec',
    f'Calculation:            {(t2 - t1):,.3f}sec',
    f'Retrieve data from GPU: {(t3 - t2):,.3f}sec',
    '=====',
    f'Total:                  {(t3 - t0):,.3f}sec',
    sep='\n',
)


18000
[0.00754169 0.00748009 0.00756147 ... 0.00738818 0.00749193 0.00743583]
Send data to GPU:       2.458sec
Calculation:            3.517sec
Retrieve data from GPU: 0.000sec
=====
Total:                  5.975sec


## Results consistency verification

In [10]:
# Absolute tolerance used when comparing the CPU and GPU results
tolerance = 1e-7
# Left as the trailing expression so the notebook displays the boolean.
# np.allclose applies its default relative tolerance in addition to atol.
np.allclose(a=np_C_norm, b=np_C_norm2, atol=tolerance)

True