# Demonstration: SAXPY on XPU

(Here, "XPU" is shorthand for "CPU or GPU".)

This isn't really an exercise but more of a demonstration of how to use [KernelAbstractions.jl](https://github.com/JuliaGPU/KernelAbstractions.jl) to write hardware-agnostic code. Specifically, the SAXPY implementation in this folder (`saxpy_measurement_xpu.jl` or `saxpy_xpu.ipynb`) is generic and runs on CPUs as well as on GPUs by NVIDIA, AMD, and Intel.

Feel free to study the very short code!

In [1]:
using BenchmarkTools
using Statistics
using Random
using ThreadPinning
using KernelAbstractions

# Pin the Julia threads with ThreadPinning's `:numa` strategy before any measurement
# is taken. NOTE(review): presumably this distributes the threads across the NUMA
# domains so the CPU benchmark can draw on every memory controller -- confirm the
# exact placement against the ThreadPinning documentation.
ThreadPinning.pinthreads(:numa)

# SAXPY kernel: every work-item handles the single element selected by its global
# index and updates `Y` in place, Y[i] = a * X[i] + Y[i].
# `X` is only ever read inside the kernel, hence the `@Const` annotation.
@kernel function saxpy_kernel(a, @Const(X), Y)
    idx = @index(Global)
    @inbounds begin
        scaled = a * X[idx]
        Y[idx] = scaled + Y[idx]
    end
end

"""
    measure_membw(backend = CPU(; static = true); kwargs...) -> mem_rate, flop_rate

Estimate the memory bandwidth by timing a SAXPY kernel (`Y = a * X + Y`) on `backend`.
Return the tuple `(mem_rate, flop_rate)`: the memory bandwidth in GB/s and the compute
rate in GFLOP/s, both derived from the time returned by `@belapsed`.

# Keywords
- `verbose = true`: print both rates to standard output.
- `N = 1024 * 500_000`: number of elements in each of `X` and `Y`.
- `dtype = Float32`: element type of `a`, `X`, and `Y`.
- `init = :parallel`: `:serial` creates and fills plain host `Array`s with
  `Base.zeros` and `rand!`; any other value allocates the arrays through
  `KernelAbstractions.zeros(backend, dtype, N)`. NOTE(review): because `:serial`
  ignores `backend` when allocating, it is presumably only meant for the CPU
  backend -- confirm before using it with a GPU backend.
- `workgroup_size = 1024`: workgroup size used to instantiate the kernel.
"""
function measure_membw(backend = CPU(; static = true); verbose = true, N = 1024 * 500_000,
                       dtype = Float32, init = :parallel, workgroup_size = 1024)
    bytes = 3 * sizeof(dtype) * N # SAXPY traffic per element: read X, read Y, write Y
    flops = 2 * N # one multiplication and one addition per element

    a = dtype(3.1415)
    if init == :serial
        X = rand!(zeros(dtype, N))
        Y = rand!(zeros(dtype, N))
    else
        X = rand!(KernelAbstractions.zeros(backend, dtype, N))
        Y = rand!(KernelAbstractions.zeros(backend, dtype, N))
    end

    # Instantiate the kernel once, outside the timed region, so that only the kernel
    # launch and the synchronization contribute to the measurement.
    ndrange = size(Y)
    kernel = saxpy_kernel(backend, workgroup_size, ndrange)

    t = @belapsed begin
        $(kernel)($a, $X, $Y, ndrange = $ndrange)
        KernelAbstractions.synchronize($backend)
    end evals=2 samples=10

    mem_rate = bytes * 1e-9 / t # GB/s
    flop_rate = flops * 1e-9 / t # GFLOP/s

    if verbose
        println("\tMemory Bandwidth (GB/s): ", round(mem_rate; digits = 2))
        println("\tCompute (GFLOP/s): ", round(flop_rate; digits = 2))
    end
    return mem_rate, flop_rate
end


measure_membw

In [2]:
# Report the number of Julia threads, then run the SAXPY measurement on the CPU backend.
println(stdout, "Threads: ", Threads.nthreads())
measure_membw(CPU(static = true));

Threads: 128


	Memory Bandwidth (GB/s): 334.16


	Compute (GFLOP/s): 55.69


In [3]:
using CUDA
# Report the active CUDA device, then run the same SAXPY measurement on the CUDA backend.
println(stdout, "GPU: ", CUDA.name(CUDA.device()))
measure_membw(CUDABackend());

GPU: NVIDIA A100-SXM4-40GB


	Memory Bandwidth (GB/s): 1189.01
	Compute (GFLOP/s): 198.17
