In [1]:
!pip install pycuda

Collecting pycuda
  Downloading pycuda-2025.1.1.tar.gz (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m29.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pytools>=2011.2 (from pycuda)
  Downloading pytools-2025.2.2-py3-none-any.whl.metadata (2.9 kB)
Collecting siphash24>=1.6 (from pytools>=2011.2->pycuda)
  Downloading siphash24-1.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.3 kB)
Downloading pytools-2025.2.2-py3-none-any.whl (98 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.1/98.1 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading siphash24-1.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (105 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.6/105.6 kB[0m [31m12.1 MB/s[0

In [3]:
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule

# CUDA C source for the kernel. Each thread derives one global index `idx`
# from its block and thread coordinates and, when `idx < N`, writes
# C[idx] = A[idx] + B[idx]. Threads whose index falls at or beyond N skip
# the store because of the bounds check.
kernel_code = """
__global__ void vectorAdd(const float* A, const float* B, float* C, int N) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < N) {
        C[idx] = A[idx] + B[idx];
    }
}
"""

# Compile the kernel source and look up the `vectorAdd` entry point by name
mod = SourceModule(kernel_code)
vector_add = mod.get_function("vectorAdd")

# Problem size: 2**20 (1,048,576) elements per vector.
# (A stray, unused `define = 1 << 20` duplicate of this value was removed.)
N = 1 << 20

# Host-side data: A = [0, 1, ..., N-1], B = 2 * A (both float32);
# h_C is an uninitialised buffer of the same shape/dtype for the result.
h_A = np.arange(N, dtype=np.float32)
h_B = h_A * 2
h_C = np.empty_like(h_A)

# One device buffer per host array, each sized to its host counterpart
d_A, d_B, d_C = (cuda.mem_alloc(host.nbytes) for host in (h_A, h_B, h_C))

# Upload the two input vectors; d_C is not written until the kernel runs
cuda.memcpy_htod(d_A, h_A)
cuda.memcpy_htod(d_B, h_B)

# Launch configuration: 1-D blocks in a 1-D grid, with enough blocks that
# blocks_per_grid * threads_per_block >= N.
threads_per_block = 256
blocks_per_grid = -(-N // threads_per_block)  # ceiling division

# Run the kernel. N is wrapped in np.int32 to match the kernel's `int N`
# parameter.
launch_config = dict(
    block=(threads_per_block, 1, 1),
    grid=(blocks_per_grid, 1, 1),
)
vector_add(d_A, d_B, d_C, np.int32(N), **launch_config)

# Download the result vector from the device into the host buffer
cuda.memcpy_dtoh(h_C, d_C)

# Verify results: compare EVERY element of the GPU output against the sum
# computed on the host, so errors past the first few elements (for example
# in the last, partially filled block) are not missed.
expected = h_A + h_B

# Written as `~(diff <= tol)` rather than `diff > tol` so that NaN values in
# the result are reported as mismatches instead of silently passing.
mismatch = ~(np.abs(h_C - expected) <= 1e-5)

if mismatch.any():
    # argmax on a boolean array returns the index of the first True entry
    print("FAIL at index", int(np.argmax(mismatch)))
else:
    print("PASS")


PASS
