__[nvidia cuda python learning](https://github.com/apowers313/roc/blob/master/experiments/2024.09.02-17.58.36-nvidia-cuda-python-learning/2024.09.02-17.58.36-nvidia-cuda-python-learning.ipynb)__

In [None]:
# Record when this notebook run started by shelling out to the `date` command.
!date

In [None]:
# Capture the directory the notebook was launched from before any later cell
# has a chance to change the working directory.
import os

# abspath() resolves a relative path against the process's current working
# directory, so this is the kernel's cwd at the time this cell runs
# (equivalent to os.getcwd(), normalised).
notebook_path = os.path.abspath(os.curdir)

# Device Info

In [None]:
from baracuda import CudaDevice

# Report how many CUDA devices baracuda can see.
print(f"Device Count: {CudaDevice.count()!s}")

# Wrap device index 0 and print its identifying properties, one per line.
dev = CudaDevice(0)
print(f"Device Name: {dev.name!s}")
print(f"Compute Capability: {dev.compute_capability!s}")
print(f"Driver Version: {dev.driver_version!s}")

# Simple Dumb Kernel

In [1]:
from baracuda import CudaSourceFile

# Load the CUDA source in test_kernel.cu and invoke its `test_kernel` entry
# point with no kernel arguments. Per the recorded output below, baracuda
# echoes the source, reports compilation, then calls the function.
# NOTE(review): the recorded output contains a single printf line for
# block (0, 0, 0) / thread (0, 0, 0), so the default launch configuration is
# presumably one block of one thread -- confirm against baracuda.
mod = CudaSourceFile("test_kernel.cu")
mod.call("test_kernel")

getting default context
creating context
CODE:
-------
extern "C" {
#include "test.h"

__global__ void test_kernel() {
  printf("(%d, %d, %d): Block (%d, %d, %d), Thread (%d, %d, %d) -- %d\n", MY_X,
         MY_Y, MY_Z, blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x,
         threadIdx.y, threadIdx.z, MY_THING);
}
}

-------

Compilation results 
getting default context
Calling function: test_kernel
getting default context


(0, 0, 0): Block (0, 0, 0), Thread (0, 0, 0) -- 42


# Arguments

For reference, this is how NVIDIA's cuda-python example code packs kernel arguments for a launch:

---

https://nvidia.github.io/cuda-python/overview.html#

``` python
NUM_THREADS = 512  # Threads per block
NUM_BLOCKS = 32768  # Blocks per grid

a = np.array([2.0], dtype=np.float32)
n = np.array(NUM_THREADS * NUM_BLOCKS, dtype=np.uint32)
bufferSize = n * a.itemsize

hX = np.random.rand(n).astype(dtype=np.float32)
hY = np.random.rand(n).astype(dtype=np.float32)
hOut = np.zeros(n).astype(dtype=np.float32)

dXclass = checkCudaErrors(cuda.cuMemAlloc(bufferSize))
dYclass = checkCudaErrors(cuda.cuMemAlloc(bufferSize))
dOutclass = checkCudaErrors(cuda.cuMemAlloc(bufferSize))

stream = checkCudaErrors(cuda.cuStreamCreate(0))

checkCudaErrors(cuda.cuMemcpyHtoDAsync(dXclass, hX.ctypes.data, bufferSize, stream))
checkCudaErrors(cuda.cuMemcpyHtoDAsync(dYclass, hY.ctypes.data, bufferSize, stream))

# The following code example is not intuitive
# Subject to change in a future release
dX = np.array([int(dXclass)], dtype=np.uint64)
dY = np.array([int(dYclass)], dtype=np.uint64)
dOut = np.array([int(dOutclass)], dtype=np.uint64)

args = [a, dX, dY, dOut, n]
args = np.array([arg.ctypes.data for arg in args], dtype=np.uint64)

checkCudaErrors(
    cuda.cuLaunchKernel(
        kernel,
        NUM_BLOCKS,  # grid x dim
        1,  # grid y dim
        1,  # grid z dim
        NUM_THREADS,  # block x dim
        1,  # block y dim
        1,  # block z dim
        0,  # dynamic shared memory
        stream,  # stream
        args.ctypes.data,  # kernel arguments
        0,  # extra (ignore)
    )
)
```

---

Alternatively, see the CUDA graphs example:

https://github.com/NVIDIA/cuda-python/blob/main/examples/3_CUDA_Features/simpleCudaGraphs_test.py


In [1]:
from baracuda import CudaSourceFile, CudaData
import ctypes  # NOTE(review): not referenced in this cell -- drop if no other cell needs it

# Load simple_args.cu and call its `simple` kernel with one argument: the
# Python int 1234 wrapped in CudaData. The kernel source echoed in the
# recorded output declares `simple(int n)` and prints "arg: 1234".
# NOTE(review): how CudaData chooses the C type/width for the value is not
# visible here -- confirm it matches `int` before passing other types.
mod = CudaSourceFile("simple_args.cu")
mod.call("simple", CudaData(1234))

getting default context
creating context
CODE:
-------
extern "C" {
__global__ void simple(int n) {
  printf("simple args:\n");
  printf("arg: %d\n", n);
  printf("simple args done.\n");
}
}

-------

Compilation results 
getting default context
Calling function: simple
getting default context


simple args:
arg: 1234
simple args done.


# Graph

In [1]:
from baracuda import CudaSourceFile, CudaGraph

# Build a CUDA graph by hand from the two kernels in daisy_chain.cu, report
# how many nodes it holds, then run it.
mod = CudaSourceFile("daisy_chain.cu")
g = CudaGraph()
# Look up both kernels first, then add each one to the graph as a kernel node.
k1 = mod.get_function("k1")
k2 = mod.get_function("k2")
# NOTE(review): no dependency between the two nodes is expressed here. The
# recorded output interleaves k1 and k2 ("kernel 2 starting..." appears
# first), so they are presumably unordered -- if a k1 -> k2 chain is
# intended, confirm how baracuda lets a node declare its predecessor.
g.add_kernel_node(k1)
g.add_kernel_node(k2)
# NOTE(review): the recorded output prints 0 nodes even though two were just
# added -- `g.nodes` looks stale or unimplemented; verify in baracuda.
nodes = g.nodes
print(f"Num of nodes in the graph created manually = {len(nodes)}")
g.run()

getting default context
creating context
CODE:
-------
extern "C" {
__global__ void k1() {
  printf("kernel 1 starting...\n");
  // cudaEvent_t e;
  // cudaEventCreateWithFlags(&e, cudaEventDisableTiming);
  printf("kernel 1 done.\n");
}

__global__ void k2() {
  printf("kernel 2 starting...\n");
  printf("kernel 2 done.\n");
}

}

-------

Compilation results 
getting default context
getting default context
Num of nodes in the graph created manually = 0


kernel 2 starting...
kernel 1 starting...
kernel 2 done.
kernel 1 done.
