In [1]:
!pip install pycuda

Collecting pycuda
  Downloading pycuda-2025.1.tar.gz (1.7 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.7 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.3/1.7 MB[0m [31m8.8 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.7/1.7 MB[0m [31m26.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m18.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pytools>=2011.2 (from pycuda)
  Downloading pytools-2025.1.1-py3-none-any.whl.metadata (3.0 kB)
Downloading pytools-2025.1.1-py3-none-any.whl (92 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.8/92.8 kB[0m [31m8.4 MB/s[0m eta [3

In [2]:
import pycuda.autoinit
import pycuda.driver as drv
import numpy as np
from pycuda import gpuarray, compiler

# Define the CUDA kernel (element-wise vector addition: c[i] = a[i] + b[i]).
# Each thread handles one index; the `idx < n` guard keeps threads whose index
# falls past the end of the arrays from touching memory.
kernel_code = """
__global__ void vec_add(float *a, float *b, float *c, int n) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < n) {
        c[idx] = a[idx] + b[idx];
    }
}
"""

# Compile the kernel source and fetch a callable handle to `vec_add`
mod = compiler.SourceModule(kernel_code)
vec_add = mod.get_function("vec_add")

# Input data: cast to float32 to match the kernel's `float *` parameters
n = 10
a = np.random.randn(n).astype(np.float32)
b = np.random.randn(n).astype(np.float32)

# Copy the inputs to the GPU and allocate an output buffer of the same shape/dtype
a_gpu = gpuarray.to_gpu(a)
b_gpu = gpuarray.to_gpu(b)
c_gpu = gpuarray.empty_like(a_gpu)

# Launch kernel: 256 threads per block and as many blocks as needed to cover
# all n elements (ceiling division) -- a single block for n = 10.
block_size = 256
grid_size = (n + block_size - 1) // block_size
vec_add(a_gpu, b_gpu, c_gpu, np.int32(n), block=(block_size, 1, 1), grid=(grid_size, 1))

# Copy result back to CPU, show it, and verify it against the NumPy reference
c = c_gpu.get()
print("Input A:", a)
print("Input B:", b)
print("Output C:", c)

# Fail loudly if the GPU result disagrees with the CPU computation; checked
# after the prints so the offending values are visible when it trips.
np.testing.assert_allclose(c, a + b, rtol=1e-6)

Input A: [-1.7712677   0.8426537   0.4673768   0.06064554 -2.0816793  -0.28908092
 -0.3796753  -0.67691046 -1.03661     1.3708297 ]
Input B: [-1.792802   -1.3222631  -0.1057713  -0.67127013  0.2729242  -0.7654272
 -1.8327522  -0.18640576  1.3549472  -0.06982907]
Output C: [-3.5640697  -0.47960943  0.3616055  -0.6106246  -1.8087552  -1.0545081
 -2.2124276  -0.86331624  0.3183372   1.3010006 ]
