__[nvidia cuda python learning](https://github.com/apowers313/roc/blob/master/experiments/2024.09.02-17.58.36-nvidia-cuda-python-learning/2024.09.02-17.58.36-nvidia-cuda-python-learning.ipynb)__

In [1]:
# Record the wall-clock time at which this notebook run started (shell `date`).
!date

Wed Sep  4 17:33:20 PDT 2024


In [2]:
# Capture the notebook's directory up front, before any later cell can change it.
import os

# Resolved against the current working directory; it is still an open question
# whether this or os.getcwd() is the more reliable way to get it.
notebook_path = os.path.abspath(os.curdir)

In [1]:
# Show the GPU, driver version and CUDA version that nvidia-smi reports for this run.
!nvidia-smi

Wed Sep  4 23:54:40 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.107.02             Driver Version: 550.107.02     CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 4070 ...    Off |   00000000:01:00.0 Off |                  N/A |
| 30%   39C    P8              2W /  220W |       2MiB /  12282MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

# Device Info

In [3]:
from baracuda import CudaDevice

# Report the device count before opening a specific device.
print("Device Count:", CudaDevice.count())

# Open device 0 and print its identifying properties, one per line.
dev = CudaDevice(0)
for label, attr in (
    ("Device Name:", "name"),
    ("Compute Capability:", "compute_capability"),
    ("Driver Version:", "driver_version"),
):
    # getattr keeps each property read immediately before its own print.
    print(label, getattr(dev, attr))

Device Count: 1
Device Name: NVIDIA GeForce RTX 4070 SUPER                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   
Compute Capability: (8, 9)
Driver Version: (12, 4)


# Simple Dumb Kernel

In [4]:
from baracuda import CudaSourceFile

# Load the CUDA source file and launch its `test_kernel` entry point with no arguments.
# NOTE(review): in the recorded run this kernel's printf line only shows up in the
# output of a later cell -- presumably nothing waits for the device here; confirm
# whether an explicit synchronize is wanted after the launch.
kernel_source = "test_kernel.cu"
mod = CudaSourceFile(kernel_source)
mod.call("test_kernel")

getting default context
creating context
CODE:
-------
extern "C" {
#include "test.h"

__global__ void test_kernel() {
  printf("(%d, %d, %d): Block (%d, %d, %d), Thread (%d, %d, %d) -- %d\n", MY_X,
         MY_Y, MY_Z, blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x,
         threadIdx.y, threadIdx.z, MY_THING);
}
}

-------

Compilation results 
Calling function: test_kernel
getting default context
nv_args 0


# Memory

In [None]:
from baracuda import CudaMemory

# Construct a CudaMemory with argument 1 and let the notebook display its repr.
# NOTE(review): presumably the argument is a size in bytes -- confirm against baracuda.
# The object is deliberately not bound to a name in this cell.
CudaMemory(1)

getting default context


<baracuda.CudaMemory at 0x7fbcad0c3fd0>

# Arguments

From the cuda-python overview example code:

---

https://nvidia.github.io/cuda-python/overview.html#

``` python
NUM_THREADS = 512  # Threads per block
NUM_BLOCKS = 32768  # Blocks per grid

a = np.array([2.0], dtype=np.float32)
n = np.array(NUM_THREADS * NUM_BLOCKS, dtype=np.uint32)
bufferSize = n * a.itemsize

hX = np.random.rand(n).astype(dtype=np.float32)
hY = np.random.rand(n).astype(dtype=np.float32)
hOut = np.zeros(n).astype(dtype=np.float32)

dXclass = checkCudaErrors(cuda.cuMemAlloc(bufferSize))
dYclass = checkCudaErrors(cuda.cuMemAlloc(bufferSize))
dOutclass = checkCudaErrors(cuda.cuMemAlloc(bufferSize))

stream = checkCudaErrors(cuda.cuStreamCreate(0))

checkCudaErrors(cuda.cuMemcpyHtoDAsync(dXclass, hX.ctypes.data, bufferSize, stream))
checkCudaErrors(cuda.cuMemcpyHtoDAsync(dYclass, hY.ctypes.data, bufferSize, stream))

# The following code example is not intuitive
# Subject to change in a future release
dX = np.array([int(dXclass)], dtype=np.uint64)
dY = np.array([int(dYclass)], dtype=np.uint64)
dOut = np.array([int(dOutclass)], dtype=np.uint64)

args = [a, dX, dY, dOut, n]
args = np.array([arg.ctypes.data for arg in args], dtype=np.uint64)

checkCudaErrors(
    cuda.cuLaunchKernel(
        kernel,
        NUM_BLOCKS,  # grid x dim
        1,  # grid y dim
        1,  # grid z dim
        NUM_THREADS,  # block x dim
        1,  # block y dim
        1,  # block z dim
        0,  # dynamic shared memory
        stream,  # stream
        args.ctypes.data,  # kernel arguments
        0,  # extra (ignore)
    )
)
```

---

or see the cuda-python CUDA Graphs example:

https://github.com/NVIDIA/cuda-python/blob/main/examples/3_CUDA_Features/simpleCudaGraphs_test.py


In [5]:
from baracuda import CudaSourceFile, CudaData
import ctypes

# Launch the `simple` kernel with one integer argument wrapped in CudaData.
# NOTE(review): the recorded run marshals 1234 as ctypes.c_uint while the kernel
# parameter is declared `int n` -- confirm the signed/unsigned mapping is intended.
mod = CudaSourceFile("simple_args.cu")
kernel_arg = CudaData(1234)
mod.call("simple", kernel_arg)

getting default context
CODE:
-------
extern "C" {
__global__ void simple(int n) {
  printf("simple args:\n");
  printf("arg: %d\n", n);
  printf("simple args done.\n");
}
}

-------

(0, 0, 0): Block (0, 0, 0), Thread (0, 0, 0) -- 42
Compilation results 
Calling function: simple
getting default context
nv_args ((1234,), (<class 'ctypes.c_uint'>,))


In [28]:
from baracuda import CudaSourceFile, CudaData, CudaMemory
import ctypes
import numpy as np

# Hand a NUL-terminated byte string to the `print_buf` kernel and have it dump the bytes.
mod = CudaSourceFile("print_buf.cu")

# Named `msg` rather than `str`: binding `str` at notebook scope would shadow the
# builtin for every cell run afterwards in the same kernel session.
msg = bytearray(b"hi there.")
msg.append(0)  # trailing NUL so the kernel's printf("%s") knows where the text ends
arr = np.array(msg, dtype=np.uint8)
mem = CudaMemory.from_np(arr)
print("mem hex", hex(mem.nv_memory))

# Arguments follow the kernel signature print_buf(char *buf, int len).
mod.call("print_buf", [CudaData(mem), CudaData(mem.size)])

getting default context
CODE:
-------
extern "C" {
__global__ void print_buf(char *buf, int len) {
  int i;

  printf("buf is %p\n", buf);
  printf("len is %d\n", len);
  printf("buf[0] is %d\n", buf[0]);

  for (i = 0; i < len; i++) {
    printf("buf: %d\n", buf[i]);
  }
  printf("buf as string: %s\n", buf);
  printf("done.\n");
}
}

-------

Compilation results 
getting default context
mem hex 0x7fbc71e00e00
Calling function: print_buf
getting default context
nv_args ((140447341088256, 10), (<class 'ctypes.c_void_p'>, <class 'ctypes.c_uint'>))


buf is 0x7fbc71e00e00
len is 10
buf[0] is 104
buf: 104
buf: 105
buf: 32
buf: 116
buf: 104
buf: 101
buf: 114
buf: 101
buf: 46
buf: 0
buf as string: hi there.
done.


# Graph

In [2]:
from baracuda import CudaSourceFile, CudaGraph, KernelNode

# Build a graph holding the k1 and k2 kernels from daisy_chain.cu, then run it.
mod = CudaSourceFile("daisy_chain.cu")
g = CudaGraph()
k1 = mod.get_function("k1")
k2 = mod.get_function("k2")
# NOTE(review): no dependency is declared between the two nodes, and the recorded
# run shows kernel 2 starting before kernel 1. Add a k1 -> k2 dependency if the
# kernels are meant to run in sequence.
g.add_node(KernelNode(k1))
g.add_node(KernelNode(k2))

# The two counts come from different attributes, so each label names the one it reports.
# (In the recorded run, nv_nodes is still 0 at this point, before g.run().)
print(f"Num of nodes in the graph created manually = {len(g.nodes)}")
print(f"Num of nv_nodes in the graph created manually = {len(g.nv_nodes)}")
g.run()

Exception ignored in: <function CudaSource.__del__ at 0x7f7c218cb380>
Traceback (most recent call last):
  File "/home/apowers/Projects/roc/experiments/2024.09.02-17.58.36-nvidia-cuda-python-learning/baracuda.py", line 514, in __del__
    checkCudaErrors(cuda.cuModuleUnload(self.nv_module))
                                        ^^^^^^^^^^^^^^
AttributeError: 'CudaSourceFile' object has no attribute 'nv_module'


getting default context
CODE:
-------
extern "C" {
__global__ void k1() {
  printf("kernel 1 starting...\n");
  printf("kernel 1 done.\n");
}

__global__ void k2() {
  printf("kernel 2 starting...\n");
  printf("kernel 2 done.\n");
}

__global__ void k3(int n) {
  printf("kernel 3 starting...\n");
  printf("kernel 3 arg: %d", n);
  printf("kernel 3 done.\n");
}

}

-------

Compilation results 
getting default context
getting default context
Num of nodes in the graph created manually = 2
Num of nodes in the graph created manually = 0


kernel 2 starting...
kernel 1 starting...
kernel 2 done.
kernel 1 done.


In [2]:
from baracuda import CudaSourceFile, CudaData, CudaMemory, MemsetNode, KernelNode, CudaDevice, CudaGraph
import ctypes
import numpy as np


# Graph experiment: a memset node fills a small buffer and a kernel node prints it.
FILL_VALUE = 42
BUF_SIZE = 4

mod = CudaSourceFile("print_buf.cu")
fn = mod.get_function("print_buf")

# The buffer is filled by the memset node below rather than copied from a host array.
mem = CudaMemory(BUF_SIZE)

g = CudaGraph()
fill_node = MemsetNode(mem, FILL_VALUE, mem.size)
g.add_node(fill_node)
# TODO: add dependency -- nothing here ties the kernel node to the memset node.
# NOTE(review): every byte of the buffer is set to FILL_VALUE, so there is no NUL
# terminator for the kernel's printf("%s"); confirm that reading past the buffer
# is acceptable for this experiment.
print_node = KernelNode(fn, [CudaData(mem), CudaData(mem.size)])
g.add_node(print_node)
g.run()

getting default context
CODE:
-------
extern "C" {
__global__ void print_buf(char *buf, int len) {
  int i;

  printf("buf is %p\n", buf);
  printf("len is %d\n", len);
  printf("buf[0] is %d\n", buf[0]);

  for (i = 0; i < len; i++) {
    printf("buf: %d\n", buf[i]);
  }
  printf("buf as string: %s\n", buf);
  printf("done.\n");
}
}

-------

Compilation results 
getting default context
getting default context


buf is 0x7f216f800200
len is 4
buf[0] is 42
buf: 42
buf: 42
buf: 42
buf: 42
buf as string: ****
done.
