In [1]:
!pip install pycuda

Collecting pycuda
  Downloading pycuda-2024.1.tar.gz (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pytools>=2011.2 (from pycuda)
  Downloading pytools-2024.1.3-py2.py3-none-any.whl (87 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.4/87.4 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting appdirs>=1.4.0 (from pycuda)
  Downloading appdirs-1.4.4-py2.py3-none-any.whl (9.6 kB)
Collecting mako (from pycuda)
  Downloading Mako-1.3.5-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: pycuda
  Building wheel for pycuda (pyproject.toml) ... [?25l[?25hdone
  

In [2]:
%%writefile hello.py
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
import numpy as np

# CUDA kernel: thread `idx` copies byte `idx` of the message into `output`.
# sizeof(msg) includes the trailing NUL, so the guard lets exactly
# len("hello-world") + 1 threads write.
cuda_code="""
__global__ void return_data(char *output){
  const char msg[]="hello-world";
  int idx=threadIdx.x + blockIdx.x * blockDim.x;
  if(idx < sizeof(msg)){
    output[idx]=msg[idx];
  }
}
"""

cuda_module=SourceModule(cuda_code)
return_data_kernel=cuda_module.get_function("return_data")

# The buffer holds the message plus its null terminator and nothing more, so
# every byte that is copied back has been written by the kernel.
output_size=len("hello-world") + 1

# Zero-initialised host buffer; the device buffer is sized from it.
output_host=np.zeros(output_size,dtype=np.uint8)
output_gpu=cuda.mem_alloc(output_host.nbytes)

# A single block of 32 threads covers all output_size bytes; surplus threads
# are masked off by the bounds check inside the kernel.
block_dim=(32,1,1)
grid_dim=(1,1)

return_data_kernel(output_gpu, block=block_dim, grid=grid_dim)

# Copy the device buffer back into the host array.
cuda.memcpy_dtoh(output_host,output_gpu)

# Decode only the bytes before the first NUL so the terminator is not printed.
output_str=output_host.tobytes().split(b"\0",1)[0].decode("ascii")
print(output_str)

Writing hello.py


In [3]:
!python hello.py

hello-world    


In [16]:
%%writefile pycuda_add.py
import numpy as np
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule

# CUDA kernel: element-wise c = a + b over `size` ints, one element per thread.
# The bounds check masks off threads whose global index is past the end.
cuda_code = """
__global__ void add_arrays(int *a,int *b,int *c,int size) {
  int tid = threadIdx.x + blockIdx.x * blockDim.x;

  if(tid < size) {
    c[tid] = a[tid] + b[tid];
  }
}
"""

# Host data
a_host = np.array([1,2,3], dtype=np.int32)
b_host = np.array([4,5,6], dtype=np.int32)
# The kernel indexes both inputs with the same `tid`, so they must agree in shape.
if a_host.shape != b_host.shape:
  raise ValueError("a_host and b_host must have the same shape")
size = a_host.size

# Device data (the result buffer has the same byte size as the inputs)
a_device = cuda.mem_alloc(a_host.nbytes)
b_device = cuda.mem_alloc(b_host.nbytes)
c_device = cuda.mem_alloc(a_host.nbytes)

# Copy data to device
cuda.memcpy_htod(a_device, a_host)
cuda.memcpy_htod(b_device, b_host)

# Compile the CUDA source and look up the kernel
cuda_module = SourceModule(cuda_code)
add_arrays_kernel = cuda_module.get_function("add_arrays")

# Set up block and grid dimensions: a fixed block size with enough blocks to
# cover every element, instead of one block of `size` threads, so the launch
# keeps working when the arrays outgrow the per-block thread limit.
block_size = 256
block_dim = (block_size, 1, 1)
grid_dim = ((size + block_size - 1) // block_size, 1)

# Launch the CUDA kernel (size is passed as a 32-bit int to match `int size`)
add_arrays_kernel(a_device, b_device, c_device, np.int32(size), block=block_dim, grid=grid_dim)

# Copy the result back to the host
c_host = np.empty_like(a_host)
cuda.memcpy_dtoh(c_host, c_device)

# Display the result
print("Array A: ",a_host)
print("Array B: ",b_host)
print("Result Array C: ",c_host)

Overwriting pycuda_add.py


In [17]:
!python pycuda_add.py

Array A:  [1 2 3]
Array B:  [4 5 6]
Result Array C:  [5 7 9]


In [33]:
%%writefile pycuda_addition.py
import numpy as np
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
import time

# CUDA kernel: element-wise c = a + b over `size` floats, one element per thread.
# The bounds check masks off the surplus threads of the last block.
cuda_code = """
__global__ void add_arrays(float *a, float *b, float *c, int size) {
  int tid = threadIdx.x + blockIdx.x * blockDim.x;

  if(tid < size) {
    c[tid] = a[tid] + b[tid];
  }
}
"""

# Compile the CUDA code up front so a compilation error surfaces before any
# of the large host/device allocations below are made.
mod = SourceModule(cuda_code)
add_arrays_cuda = mod.get_function("add_arrays")

# CPU array addition
def cpu_add_arrays(a, b):
  """Return the element-wise sum of `a` and `b`, computed on the host."""
  return a + b

# GPU array addition
def gpu_add_arrays(a_gpu, b_gpu, result_gpu, size, block_size=256):
  """Launch the add_arrays kernel over `size` elements.

  a_gpu, b_gpu and result_gpu are device allocations; the sum is written to
  result_gpu. The grid is sized by ceiling division so every element is
  covered. The caller synchronizes afterwards before reading the clock or
  the result.
  """
  grid_size = (size + block_size - 1) // block_size

  add_arrays_cuda(a_gpu, b_gpu, result_gpu, np.int32(size), block=(block_size, 1, 1), grid=(grid_size, 1))

# Result verification
def results_match(expected, actual, chunk=1 << 24):
  """Return True if `expected` and `actual` are element-wise close.

  The comparison walks the arrays in slices of `chunk` elements so that the
  temporaries created by np.allclose stay small next to the full arrays.
  """
  return all(
    np.allclose(expected[i:i + chunk], actual[i:i + chunk])
    for i in range(0, expected.size, chunk)
  )

# Generate random arrays directly as float32 (no float64 intermediate), from a
# seeded generator so reruns use the same data.
seed = 0
size = 27000 * 27000
rng = np.random.default_rng(seed)
a_cpu = rng.random(size, dtype=np.float32)
b_cpu = rng.random(size, dtype=np.float32)

# Allocate GPU memory (inputs and result all have the same byte size)
a_gpu = cuda.mem_alloc(a_cpu.nbytes)
b_gpu = cuda.mem_alloc(b_cpu.nbytes)
result_gpu = cuda.mem_alloc(a_cpu.nbytes)

# Copy data to GPU
cuda.memcpy_htod(a_gpu, a_cpu)
cuda.memcpy_htod(b_gpu, b_cpu)

# Perform CPU array addition and measure time (perf_counter is monotonic)
start_time_cpu = time.perf_counter()
result_cpu = cpu_add_arrays(a_cpu, b_cpu)
time_cpu = time.perf_counter() - start_time_cpu

# Perform GPU array addition and measure time. Only the kernel launch and the
# synchronize are timed; the host<->device copies are outside this window.
start_time_gpu = time.perf_counter()
gpu_add_arrays(a_gpu, b_gpu, result_gpu, size)
cuda.Context.synchronize()
time_gpu = time.perf_counter() - start_time_gpu

# The host inputs are no longer needed (the GPU holds its own copies); drop
# them so the extra result buffer below does not raise peak host memory.
del a_cpu, b_cpu

# Copy result from GPU into its own host buffer so the CPU result is preserved
result_from_gpu = np.empty_like(result_cpu)
cuda.memcpy_dtoh(result_from_gpu, result_gpu)

# Check that both implementations agree before reporting timings
if not results_match(result_cpu, result_from_gpu):
  raise RuntimeError("GPU result does not match CPU result")

# Display result and time taken
print(f"Time taken on CPU: {time_cpu} seconds")
print(f"Time taken on GPU: {time_gpu} seconds")

Overwriting pycuda_addition.py


In [34]:
!python pycuda_addition.py

Time taken on CPU: 1.1193950176239014 seconds
Time taken on GPU: 0.03608560562133789 seconds


In [37]:
%%writefile pycuda_one_dimention.py
import numpy as np
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
import time

# CUDA kernel: element-wise c = a + b over `size` floats, one element per thread.
# The bounds check masks off the surplus threads of the last block.
cuda_code = """
__global__ void add_arrays(float *a, float *b, float *c, int size) {
  int tid = threadIdx.x + blockIdx.x * blockDim.x;

  if(tid < size) {
    c[tid] = a[tid] + b[tid];
  }
}
"""

# Compile the CUDA code up front so a compilation error surfaces before any
# of the large host/device allocations below are made.
mod = SourceModule(cuda_code)
add_arrays_cuda = mod.get_function("add_arrays")

# CPU array addition
def cpu_add_arrays(a, b):
  """Return the element-wise sum of `a` and `b`, computed on the host."""
  return a + b

# GPU array addition
def gpu_add_arrays(a_gpu, b_gpu, result_gpu, size, block_size=256):
  """Launch the add_arrays kernel over `size` elements.

  a_gpu, b_gpu and result_gpu are device allocations; the sum is written to
  result_gpu. The grid is sized by ceiling division so every element is
  covered. The caller synchronizes afterwards before reading the clock or
  the result.
  """
  grid_size = (size + block_size - 1) // block_size

  add_arrays_cuda(a_gpu, b_gpu, result_gpu, np.int32(size), block=(block_size, 1, 1), grid=(grid_size, 1))

# Result verification
def results_match(expected, actual, chunk=1 << 24):
  """Return True if `expected` and `actual` are element-wise close.

  The comparison walks the arrays in slices of `chunk` elements so that the
  temporaries created by np.allclose stay small next to the full arrays.
  """
  return all(
    np.allclose(expected[i:i + chunk], actual[i:i + chunk])
    for i in range(0, expected.size, chunk)
  )

# Generate random arrays directly as float32 (no float64 intermediate), from a
# seeded generator so reruns use the same data.
seed = 0
size = 27000 * 27000
rng = np.random.default_rng(seed)
a_cpu = rng.random(size, dtype=np.float32)
b_cpu = rng.random(size, dtype=np.float32)

# Allocate GPU memory (inputs and result all have the same byte size)
a_gpu = cuda.mem_alloc(a_cpu.nbytes)
b_gpu = cuda.mem_alloc(b_cpu.nbytes)
result_gpu = cuda.mem_alloc(a_cpu.nbytes)

# Copy data to GPU
cuda.memcpy_htod(a_gpu, a_cpu)
cuda.memcpy_htod(b_gpu, b_cpu)

# Perform CPU array addition and measure time (perf_counter is monotonic)
start_time_cpu = time.perf_counter()
result_cpu = cpu_add_arrays(a_cpu, b_cpu)
time_cpu = time.perf_counter() - start_time_cpu

# Perform GPU array addition and measure time. Only the kernel launch and the
# synchronize are timed; the host<->device copies are outside this window.
start_time_gpu = time.perf_counter()
gpu_add_arrays(a_gpu, b_gpu, result_gpu, size)
cuda.Context.synchronize()
time_gpu = time.perf_counter() - start_time_gpu

# The host inputs are no longer needed (the GPU holds its own copies); drop
# them so the extra result buffer below does not raise peak host memory.
del a_cpu, b_cpu

# Copy result from GPU into its own host buffer so the CPU result is preserved
result_from_gpu = np.empty_like(result_cpu)
cuda.memcpy_dtoh(result_from_gpu, result_gpu)

# Check that both implementations agree before reporting timings
if not results_match(result_cpu, result_from_gpu):
  raise RuntimeError("GPU result does not match CPU result")

# Display result and time taken
print(f"Time taken on CPU: {time_cpu} seconds")
print(f"Time taken on GPU: {time_gpu} seconds")

Overwriting pycuda_one_dimention.py


In [38]:
!python pycuda_one_dimention.py

Time taken on CPU: 1.136056900024414 seconds
Time taken on GPU: 0.03370523452758789 seconds
