In [1]:
import numpy as np
import time
from numba import cuda

In [2]:
@cuda.jit
def add_array(a,b,c):
  """CUDA kernel: element-wise sum of two 1-D arrays, c[i] = a[i] + b[i].

  Each thread handles at most one element, selected by its absolute
  1-D index in the launch grid (cuda.grid(1)).

  Parameters
  ----------
  a, b : input arrays, indexed along their first axis.
  c    : output array, written in place.

  Only indices valid for all three arrays are processed, so a launch
  with more threads than elements, or with arrays of unequal length,
  never reads or writes past the end of any of them.
  """
  i = cuda.grid(1)
  # Guard against every array, not just `a`: device code performs no
  # bounds checking, so an index past the end of `b` or `c` would touch
  # memory outside the array.
  if i < len(a) and i < len(b) and i < len(c):
    c[i] = a[i] + b[i]

In [3]:
# Problem size: how many elements each input/output vector holds (one billion).
N = 1_000_000_000

In [4]:
# Host-side inputs: two separate float32 NumPy vectors, each holding arange(N).
a = np.arange(N, dtype=np.float32)
b = a.copy()  # same contents as `a`, but its own independent buffer

In [5]:
# Copy both inputs to the GPU, then allocate a device array (dev_c) for the
# result using cuda.device_array_like, with `a` as the template.
dev_a, dev_b = map(cuda.to_device, (a, b))
dev_c = cuda.device_array_like(a)

In [6]:
# Start the wall-clock timer. The device arrays (dev_a, dev_b, dev_c) have
# already been created at this point, so the host-to-device copies are not
# part of the timed interval.
start_time = time.perf_counter()

In [7]:
# Launch configuration for add_array: a fixed block size, and enough blocks
# that blockspergrid * threadsperblock >= N (one thread per element).
threadsperblock = 256
blockspergrid = -(-N // threadsperblock)  # ceiling division of N by the block size
add_array[blockspergrid, threadsperblock](dev_a, dev_b, dev_c)

In [8]:
# Wait for the GPU to finish the work queued so far. Kernel launches return to
# the host before the kernel completes, so the timer must not be stopped until
# this call returns.
cuda.synchronize()

In [9]:
# Stop the wall-clock timer. This happens before the result is copied back to
# the host, so the device-to-host transfer is not part of the timed interval.
end_time = time.perf_counter()

In [10]:
# Copy the result from the device array dev_c back into host memory as `c`.
c = dev_c.copy_to_host()

In [11]:
# Report the elapsed time between start_time and end_time.
# That interval begins after the inputs were copied to the GPU and ends right
# after cuda.synchronize(), before the result is copied back. It therefore
# covers the kernel launch and execution (including Numba's JIT compilation
# when this is the first launch of add_array) and NOT the host<->device
# transfers, so the label says exactly that.
execution_time = end_time - start_time
print(f'Execution Time(Kernel only, excludes memory transfers) : {execution_time:.4f} seconds')

Execution Time(Kernel + Memory + Transfer) : 0.6552 seconds
