# CuPy Tutorial

## CuPy-Specific Function: Scatter Add

https://docs.cupy.dev/en/stable/reference/ext.html

In [1]:
# Import necessary libraries
import cupy as cp
import numpy as np
import cupyx as cpx

# Problem size: many input values are accumulated into only a few output slots,
# so each slot receives on the order of array_size / result_array_size additions.
array_size = 10000
result_array_size = 10

# Seed the host-side generator so the *inputs* are identical on every
# Restart & Run All. Any difference between the two results below can then not
# be blamed on the input data.
SEED = 0
rng = np.random.default_rng(SEED)

# Generate random values in [0, 1) and random target slots in [0, result_array_size)
values = rng.random(array_size, dtype=np.float32)
indices = rng.integers(0, result_array_size, size=array_size, dtype=np.int32)

# Copy the inputs to the GPU and allocate two independent, zeroed accumulators
gpu_values = cp.array(values)
gpu_indices = cp.array(indices)
gpu_result_scatter_add_0 = cp.zeros(result_array_size, dtype=np.float32)
gpu_result_scatter_add_1 = cp.zeros(result_array_size, dtype=np.float32)

# Run the exact same scatter add twice: result[indices[i]] += values[i] for all i.
# Both calls see identical inputs, so mathematically the results are equal.
cpx.scatter_add(gpu_result_scatter_add_0, gpu_indices, gpu_values)
cpx.scatter_add(gpu_result_scatter_add_1, gpu_indices, gpu_values)
cp.cuda.stream.get_current_stream().synchronize()

# Transfer GPU result arrays back to NumPy
result_array_scatter_add_0 = gpu_result_scatter_add_0.get()
result_array_scatter_add_1 = gpu_result_scatter_add_1.get()

# Print the results
print("\nArray after scatter add operation 0:")
print(result_array_scatter_add_0)

print("\nArray after scatter add operation 1:")
print(result_array_scatter_add_1)

# Compare the two runs element-wise. A non-zero entry means the two float32
# sums were rounded differently -- presumably because the order in which the
# GPU accumulates the contributions is not fixed (NOTE: confirm against the
# cupyx.scatter_add documentation).
diff_scatter_add = result_array_scatter_add_0 - result_array_scatter_add_1
print("\nDifference between scatter add 0 and 1:")
print(diff_scatter_add)
# Only raise the alarm when the runs actually disagree.
if np.any(diff_scatter_add != 0):
    print("Oh no, there is a difference!!!")
else:
    print("No difference this time.")


Array after scatter add operation 0:
[484.18228 535.9695  489.6193  500.85852 505.03873 500.55252 508.4838
 481.9326  495.263   472.4387 ]

Array after scatter add operation 1:
[484.1823  535.9696  489.61935 500.8586  505.0389  500.55258 508.4836
 481.9327  495.263   472.439  ]

Difference between scatter add 0 and 1:
[-3.0517578e-05 -1.2207031e-04 -6.1035156e-05 -9.1552734e-05
 -1.8310547e-04 -6.1035156e-05  1.8310547e-04 -1.2207031e-04
  0.0000000e+00 -3.0517578e-04]
Oh no, there is a difference!!!


## Custom Kernels: Atomic Add

https://docs.cupy.dev/en/stable/reference/kernel.html

In [2]:
# Import necessary libraries
import cupy as cp
import numpy as np

# CUDA source for a hand-written scatter add: thread `idx` adds values[idx] into
# result[indices[idx]] using atomicAdd, so concurrent updates to the same slot
# are not lost. The `idx < n` guard skips the surplus threads of the last block,
# because the grid size below is rounded up to a whole number of blocks.
kernel_code = """
extern "C"
__global__ void scatter_add_example_kernel(float* result, const float* values, const int* indices, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        atomicAdd(&result[indices[idx]], values[idx]);
    }
}
"""

# Wrap the source in a RawKernel so it can be launched from Python
kernel = cp.RawKernel(kernel_code, "scatter_add_example_kernel")

# Two independent, zeroed accumulators -- one per launch.
# (array_size, result_array_size, gpu_values and gpu_indices come from the
# scatter add cell above, so both sections operate on the same inputs.)
gpu_result_atomic_add_0 = cp.zeros(result_array_size, dtype=np.float32)
gpu_result_atomic_add_1 = cp.zeros(result_array_size, dtype=np.float32)

# Set up grid and block dimensions for the kernel launch:
# one thread per input element, rounded up to whole blocks (ceiling division).
block_size = 256
grid_size = (array_size + block_size - 1) // block_size

# The kernel declares `int n` (32-bit). CuPy does not validate kernel argument
# types and passes a plain Python int as a 64-bit `long long`, so hand it an
# explicitly typed 32-bit scalar instead.
n_elements = np.int32(array_size)

# Launch the same kernel twice on identical inputs, once per accumulator
for gpu_result in (gpu_result_atomic_add_0, gpu_result_atomic_add_1):
    kernel(
        (grid_size,),
        (block_size,),
        (gpu_result, gpu_values, gpu_indices, n_elements),
    )
cp.cuda.stream.get_current_stream().synchronize()

# Transfer GPU result arrays back to NumPy
result_array_atomic_add_0 = gpu_result_atomic_add_0.get()
result_array_atomic_add_1 = gpu_result_atomic_add_1.get()

# Label the two runs so their printouts can be told apart
print("\nArray after atomic add operation 0:")
print(result_array_atomic_add_0)

print("\nArray after atomic add operation 1:")
print(result_array_atomic_add_1)

# Run-to-run comparison of the custom kernel; only complain on a real mismatch
diff_atomic_add = result_array_atomic_add_0 - result_array_atomic_add_1
print("\nDifference between atomic add 0 and 1:")
print(diff_atomic_add)
if np.any(diff_atomic_add != 0):
    print("Oh no, there is a difference!!!")
else:
    print("No difference this time.")

# Cross-check against the cupyx.scatter_add result from the previous cell
# (computed as scatter add minus atomic add)
diff_scatter_vs_atomic = result_array_scatter_add_0 - result_array_atomic_add_0
print("\nDifference atomic add 0 and scatter add 0:")
print(diff_scatter_vs_atomic)
if np.any(diff_scatter_vs_atomic != 0):
    print("Oh no, there is a difference!!!")
else:
    print("No difference this time.")


Array after atomic add operation:
[484.1824  535.96936 489.61902 500.85855 505.03894 500.55267 508.48392
 481.93298 495.2631  472.43887]

Array after atomic add operation:
[484.18277 535.9696  489.6193  500.8587  505.03888 500.55255 508.48395
 481.93256 495.26318 472.4389 ]

Difference between atomic add 0 and 1:
[-3.6621094e-04 -2.4414062e-04 -2.7465820e-04 -1.5258789e-04
  6.1035156e-05  1.2207031e-04 -3.0517578e-05  4.2724609e-04
 -9.1552734e-05 -3.0517578e-05]
Oh no, there is a difference!!!

Difference atomic add 0 and scatter add 0:
[-1.2207031e-04  1.2207031e-04  2.7465820e-04 -3.0517578e-05
 -2.1362305e-04 -1.5258789e-04 -1.2207031e-04 -3.9672852e-04
 -9.1552734e-05 -1.8310547e-04]
Oh no, there is a difference!!!


## Grid and Block Size

- https://www.microway.com/hpc-tech-tips/cuda-parallel-thread-management/
- https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#abstract
- https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#execution-configuration
