<a href="https://colab.research.google.com/github/alefram/blog-notebooks/blob/main/cuda_setup.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CUDA setup for experiments in C++ and Python



## C++

In [14]:
!pip install nvcc4jupyter



## Load the nvcc4jupyter plugin

In [15]:
# Load the nvcc4jupyter IPython extension installed above. Presumably this is
# what provides the %%cuda cell magic used by the kernel cell below -- confirm
# against the nvcc4jupyter documentation.
%load_ext nvcc4jupyter

The nvcc4jupyter extension is already loaded. To reload it, use:
  %reload_ext nvcc4jupyter


In [18]:
%%cuda
#include <stdio.h>
#include <stdlib.h>

// Report a failed CUDA runtime call and stop the program.
// Without this, a kernel that fails to launch (or faults while running)
// produces no output at all and the cell silently appears to do nothing.
static void check(cudaError_t status, const char *what){
    if (status != cudaSuccess) {
        fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Each launched thread prints its block index and its thread index within the block.
__global__ void hello(){
    printf("Hello from block: %u, thread: %u\n", blockIdx.x, threadIdx.x);
}

int main(){
    // Launch configuration <<<2, 2>>>: 2 blocks of 2 threads each.
    hello<<<2, 2>>>();

    // A kernel launch returns no status itself; launch errors must be
    // fetched explicitly right after the launch.
    check(cudaGetLastError(), "hello kernel launch");

    // Wait for the kernel to finish; errors raised during execution are
    // reported through the return value of the synchronize call.
    check(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

    // Push any buffered output out before the process exits.
    fflush(stdout);
    return 0;
}




## Python

In [17]:
!pip install pycuda

Collecting pycuda
  Downloading pycuda-2025.1.1.tar.gz (1.7 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.7/1.7 MB 57.7 MB/s eta 0:00:00
  Installing build dependencies ... done
  Getting requirements to build wheel ... done
  Preparing metadata (pyproject.toml) ... done
Collecting pytools>=2011.2 (from pycuda)
  Downloading pytools-2025.2.2-py3-none-any.whl.metadata (2.9 kB)
Collecting siphash24>=1.6 (from pytools>=2011.2->pycuda)
  Downloading siphash24-1.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.3 kB)
Downloading pytools-2025.2.2-py3-none-any.whl (98 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 98.1/98.1 kB 11.9 MB/s eta 0:00:00
Downloading siphash24-1.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_6

In [19]:
# Imported only for its side effects (no name from it is used below);
# presumably this is what sets up the CUDA context -- see the PyCUDA docs.
import pycuda.autoinit
import pycuda.driver as drv
import numpy as np
from pycuda.compiler import SourceModule

# Problem size and launch configuration, kept in one place so they cannot
# drift apart.
N_ELEMENTS = 4096
THREADS_PER_BLOCK = 256
# Ceiling division: enough blocks to cover every element even when
# N_ELEMENTS is not a multiple of THREADS_PER_BLOCK.
NUM_BLOCKS = (N_ELEMENTS + THREADS_PER_BLOCK - 1) // THREADS_PER_BLOCK

# CUDA kernel written as a text string
mod = SourceModule("""
    __global__ void add_vectors(float *dest, const float *a, const float *b, int n)
    {
        const int i = threadIdx.x + blockIdx.x * blockDim.x;
        // The last block may contain threads past the end of the arrays;
        // they must not read or write anything.
        if (i < n) {
            dest[i] = a[i] + b[i];
        }
    }
""")

# Get the kernel function
add_vectors = mod.get_function("add_vectors")

# Create the arrays on the CPU
a = np.random.randn(N_ELEMENTS).astype(np.float32)
b = np.random.randn(N_ELEMENTS).astype(np.float32)
dest = np.zeros_like(a)

# Launch the kernel on the GPU. The length is passed as an explicit
# np.int32 so that it matches the kernel's `int n` parameter.
add_vectors(
    drv.Out(dest), drv.In(a), drv.In(b), np.int32(N_ELEMENTS),
    block=(THREADS_PER_BLOCK, 1, 1),
    grid=(NUM_BLOCKS, 1)
)

# Check the result: validate every element, not just the first one
np.testing.assert_allclose(dest, a + b, rtol=1e-6)

print(f"Resultado en CPU (primer elemento): {a[0] + b[0]:.6f}")
print(f"Resultado en GPU (primer elemento): {dest[0]:.6f}")

Resultado en CPU (primer elemento): 0.418397
Resultado en GPU (primer elemento): 0.418397
