In [None]:
import numpy as np
from numba import cuda


In [None]:

# CUDA kernel: element-wise addition of two 1-D arrays
@cuda.jit
def add_arrays_kernel(a, b, result):
    """Write ``a[i] + b[i]`` into ``result[i]`` for every valid index ``i``.

    Args:
        a: First 1-D input array.
        b: Second 1-D input array.
        result: 1-D output array, written in place.

    Only indices below the length of the shortest of the three arrays are
    touched, so a mismatched-length argument cannot cause an out-of-bounds
    access. Each thread starts at its own absolute index and then advances by
    the total number of threads in the grid, so the whole range is covered
    even when the launch has fewer threads than elements. With at least one
    thread per element, each thread handles at most one element.
    """
    start = cuda.grid(1)       # absolute index of this thread in the 1-D grid
    stride = cuda.gridsize(1)  # total number of threads in the 1-D grid
    # Bound by the shortest array, not just ``a``.
    limit = min(a.size, b.size, result.size)
    for idx in range(start, limit, stride):
        result[idx] = a[idx] + b[idx]


In [None]:

# Host (CPU) arrays
SEED = 42        # fixed seed so the inputs are identical after Restart & Run All
N_ELEMENTS = 10  # length of each input vector

rng = np.random.default_rng(SEED)
# Samples are drawn as float64 and then narrowed to float32.
a = rng.standard_normal(N_ELEMENTS).astype(np.float32)
b = rng.standard_normal(N_ELEMENTS).astype(np.float32)
result = np.zeros_like(a)  # zero-filled output buffer, same shape/dtype as ``a``


In [None]:

# Transfer the input arrays to the GPU
a_device = cuda.to_device(a)
b_device = cuda.to_device(b)
# The kernel overwrites every element of the output, so allocate the buffer
# directly on the device (same shape/dtype as ``result``) instead of copying
# the host zeros across. Its contents are uninitialized until the kernel runs.
result_device = cuda.device_array_like(result)


In [None]:

# Launch configuration: block size and the number of blocks needed
threads_per_block = 32
# Ceiling division written as negated floor division, so a final partial
# block is still scheduled for any trailing elements.
blocks_per_grid = -(-a.size // threads_per_block)


In [None]:

# Run the kernel on the GPU: bind the grid/block configuration first,
# then invoke the configured kernel on the device arrays.
configured_kernel = add_arrays_kernel[blocks_per_grid, threads_per_block]
configured_kernel(a_device, b_device, result_device)


In [None]:

# Copy the result back to the host
result = result_device.copy_to_host()

# Check the GPU output against the same sum computed on the CPU, so a wrong
# kernel or launch configuration fails loudly instead of printing
# plausible-looking numbers.
np.testing.assert_allclose(result, a + b, rtol=1e-6)

print("Array A:", a)
print("Array B:", b)
print("Result (A + B):", result)