<a href="https://colab.research.google.com/github/amontoison/Workshop-GERAD/blob/main/gpu_kernels.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Parallel computing and GPU programming with Julia
## Part IV: GPU Kernels with KernelAbstractions.jl
Alexis Montoison

In [99]:
# Activate the notebook-specific project "colab6" and add the packages that the
# rest of this notebook loads with `using`.
import Pkg
Pkg.activate("colab6")
Pkg.add(["CUDA", "KernelAbstractions", "Adapt", "NVTX"])

[32m[1m  Activating[22m[39m project at `/content/colab6`
[32m[1m   Resolving[22m[39m package versions...
[32m[1m  No Changes[22m[39m to `/content/colab6/Project.toml`
[32m[1m  No Changes[22m[39m to `/content/colab6/Manifest.toml`


In [100]:
versioninfo()

Julia Version 1.11.5
Commit 760b2e5b739 (2025-04-14 06:53 UTC)
Build Info:
  Official https://julialang.org/ release
Platform Info:
  OS: Linux (x86_64-linux-gnu)
  CPU: 2 × Intel(R) Xeon(R) CPU @ 2.20GHz
  WORD_SIZE: 64
  LLVM: libLLVM-16.0.6 (ORCJIT, broadwell)
Threads: 2 default, 0 interactive, 1 GC (on 2 virtual cores)
Environment:
  LD_LIBRARY_PATH = /usr/lib64-nvidia
  JULIA_NUM_THREADS = auto


In [101]:
using CUDA, KernelAbstractions, Adapt

### Different layers of abstraction

In [102]:
# Problem size and scaling factor shared by the three SAXPY variants below.
N = 100_000
a = 0.5

# Host-side inputs: X is random, Y starts at zero (both Float64).
X_cpu = rand(N)
Y_cpu = zeros(N)

# Device copies that the GPU examples operate on.
X, Y = CuVector(X_cpu), CuVector(Y_cpu)
Y

100000-element CuArray{Float64, 1, CUDA.DeviceMemory}:
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 ⋮
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0

#### Vendor-specific

In [103]:
# Hand-written CUDA kernel: every thread updates at most one entry of Y with
# Y[i] = a * X[i] + Y[i].
function saxpy!(a,X,Y)
    # 1-based global index assembled from the CUDA block/thread intrinsics
    idx = threadIdx().x + (blockIdx().x - 1) * blockDim().x
    # the launch rounds the grid up to whole blocks, so surplus threads do nothing
    idx > length(Y) && return nothing
    @inbounds Y[idx] = a * X[idx] + Y[idx]
    return nothing
end

# 32 threads per block and enough blocks to cover every element of Y
let nthreads = 32
    @cuda threads=nthreads blocks=cld(length(Y), nthreads) saxpy!(a, X, Y)
end
Y

100000-element CuArray{Float64, 1, CUDA.DeviceMemory}:
 0.33045790611431836
 0.43026941175472044
 0.2655251196768881
 0.30431028823100975
 0.4984157599454933
 0.06844620328876744
 0.04356821751944773
 0.3039996204893953
 0.3470413837393167
 0.21084902980004128
 0.49739871546275766
 0.26946405756976854
 0.39399730811097383
 ⋮
 0.34720675347596064
 0.4102826154053107
 0.06637767845128689
 0.06131294496972045
 0.17867465569849922
 0.07179257783763593
 0.25818125925969654
 0.28494254924418655
 0.48525902498096957
 0.3075968283124345
 0.45602960422026084
 0.2026006329768757

#### KernelAbstractions

In [104]:
using KernelAbstractions
using CUDA

# KernelAbstractions version of SAXPY: each work-item updates one entry of Y.
# X is only read inside the kernel, hence the @Const annotation.
@kernel function kernel_saxpy!(a, @Const(X), Y)
    idx = @index(Global)
    @inbounds Y[idx] = a * X[idx] + Y[idx]
end

# Instantiate the kernel for the CUDA backend, then launch it over all of Y.
let saxpy_cuda! = kernel_saxpy!(CUDABackend())
    saxpy_cuda!(a, X, Y; ndrange = length(Y))
end
Y

100000-element CuArray{Float64, 1, CUDA.DeviceMemory}:
 0.6609158122286367
 0.8605388235094409
 0.5310502393537762
 0.6086205764620195
 0.9968315198909866
 0.13689240657753488
 0.08713643503889545
 0.6079992409787905
 0.6940827674786334
 0.42169805960008255
 0.9947974309255153
 0.5389281151395371
 0.7879946162219477
 ⋮
 0.6944135069519213
 0.8205652308106214
 0.13275535690257378
 0.1226258899394409
 0.35734931139699844
 0.14358515567527186
 0.5163625185193931
 0.5698850984883731
 0.9705180499619391
 0.615193656624869
 0.9120592084405217
 0.4052012659537514

#### Array abstractions

In [105]:
# Same update written as a single fused, in-place broadcast over the device arrays.
@. Y = a * X + Y

100000-element CuArray{Float64, 1, CUDA.DeviceMemory}:
 0.9913737183429551
 1.2908082352641612
 0.7965753590306642
 0.9129308646930292
 1.49524727983648
 0.2053386098663023
 0.13070465255834318
 0.9119988614681858
 1.04112415121795
 0.6325470894001238
 1.492196146388273
 0.8083921727093056
 1.1819919243329216
 ⋮
 1.041620260427882
 1.230847846215932
 0.19913303535386068
 0.18393883490916135
 0.5360239670954976
 0.2153777335129078
 0.7745437777790896
 0.8548276477325596
 1.4557770749429086
 0.9227904849373035
 1.3680888126607824
 0.6078018989306271

### KernelAbstractions.jl

<img src='https://github.com/amontoison/Workshop-GERAD/blob/main/Graphics/KA1.png?raw=1' width='1000px'>
<img src='https://github.com/amontoison/Workshop-GERAD/blob/main/Graphics/KA2.png?raw=1' width='1000px'>
<img src='https://github.com/amontoison/Workshop-GERAD/blob/main/Graphics/KA3.png?raw=1' width='1000px'>
<img src='https://github.com/amontoison/Workshop-GERAD/blob/main/Graphics/KA4.png?raw=1' width='1000px'>
<img src='https://github.com/amontoison/Workshop-GERAD/blob/main/Graphics/KA5.png?raw=1' width='1000px'>
<img src='https://github.com/amontoison/Workshop-GERAD/blob/main/Graphics/KA6.png?raw=1' width='1000px'>
<img src='https://github.com/amontoison/Workshop-GERAD/blob/main/Graphics/KA7.png?raw=1' width='1000px'>
<img src='https://github.com/amontoison/Workshop-GERAD/blob/main/Graphics/KA8.png?raw=1' width='1000px'>
<img src='https://github.com/amontoison/Workshop-GERAD/blob/main/Graphics/KA9.png?raw=1' width='1000px'>
<img src='https://github.com/amontoison/Workshop-GERAD/blob/main/Graphics/KA10.png?raw=1' width='1000px'>
<img src='https://github.com/amontoison/Workshop-GERAD/blob/main/Graphics/KA11.png?raw=1' width='1000px'>
<img src='https://github.com/amontoison/Workshop-GERAD/blob/main/Graphics/KA12.png?raw=1' width='1000px'>
<img src='https://github.com/amontoison/Workshop-GERAD/blob/main/Graphics/KA13.png?raw=1' width='1000px'>
<img src='https://github.com/amontoison/Workshop-GERAD/blob/main/Graphics/KA14.png?raw=1' width='1000px'>
<img src='https://github.com/amontoison/Workshop-GERAD/blob/main/Graphics/KA15.png?raw=1' width='1000px'>
<img src='https://github.com/amontoison/Workshop-GERAD/blob/main/Graphics/KA16.png?raw=1' width='1000px'>
<img src='https://github.com/amontoison/Workshop-GERAD/blob/main/Graphics/KA17.png?raw=1' width='1000px'>
<img src='https://github.com/amontoison/Workshop-GERAD/blob/main/Graphics/KA18.png?raw=1' width='1000px'>

#### Summary -- How to use KernelAbstractions?

- Use `@kernel function mykernel(args...) end` to write a GPU-style program
- Instantiate kernel for a backend `kernel = mykernel(backend)`
- Backends come from vendor-specific libraries
- `KA.allocate(backend, ...)` to obtain memory
- Launch kernel `kernel(args..., ndrange=...)` while specifying the grid to execute over.

In [106]:
# Element-wise sum on the host: stores a[i] + b[i] into c[i] for every index of c.
# Returns `nothing`; the result is delivered by mutating `c`.
function vadd(a, b, c)
    foreach(eachindex(c)) do idx
        c[idx] = a[idx] + b[idx]
    end
    return nothing
end

# Two random host vectors of length N and an output buffer of matching type/shape.
a, b = rand(N), rand(N)
c = similar(a)

# Fill c with the element-wise sum of a and b.
vadd(a, b, c)

In [107]:
import KernelAbstractions as KA

# KernelAbstractions counterpart of `vadd`: one work-item per index.
@kernel function vadd_kernel(a, b, c)
    idx = @index(Global)
    c[idx] = a[idx] + b[idx]
end

In [108]:
# Run `vadd_kernel` on the GPU.
backend = CUDABackend()

# `KA.allocate` hands back uninitialized memory, so give the inputs defined
# values before the kernel reads them.
a = fill!(KA.allocate(backend, Float32, N), 1)
b = fill!(KA.allocate(backend, Float32, N), 2)
c = similar(a)

# Instantiate the kernel for the backend and launch one work-item per element of c.
vadd_kernel(backend)(a, b, c; ndrange=size(c))

# The launch is asynchronous with respect to the host; wait until the kernel has finished.
KA.synchronize(backend)

#### Asynchronous operations

GPU operations are asynchronous with respect to the host! They are **ordered** with respect to each other, but special care must be taken when using Julia's task-based programming together with GPU programming.

The JuliaGPU ecosystem **synchronizes** the GPU on access, so when you move data from and to the GPU we wait for all the kernels to finish!

When benchmarking you need to synchronize the device!

```julia
@benchmark begin
    vadd_kernel(backend)(a, b, c; ndrange=size(c))
    KA.synchronize(backend)
end
```

Otherwise you are only measuring the **launch** of the kernel.

### What makes an application portable?

1. Can I **run** it on a different compute architecture
    1. Different CPU architectures
    2. We live in a multi-vendor GPU world
2. Does it **compute** the same thing?
    1. Can I develop on one platform and move to another later?
3. Does it achieve the same **performance**?
4. Can I take advantage of platform **specific** capabilities?

#### Adapt.jl

[Adapt.jl](https://github.com/JuliaGPU/Adapt.jl) is a lightweight dependency that you can use to convert complex structures from CPU to GPU.

```julia
using Adapt
adapt(CuArray, ::Adjoint{Array})::Adjoint{CuArray}
```

```julia
struct Model{T<:Number, AT<:AbstractArray{T}}
   data::AT
end

Adapt.adapt_structure(to, x::Model) = Model(adapt(to, x.data))

cpu = Model(rand(64, 64));
using CUDA

gpu = adapt(CuArray, cpu)
# Model{Float64, CuArray{Float64, 2, CUDA.DeviceMemory}}(...)
```

## GPU kernel -- transpose

In [109]:
# Configuration shared by the transpose kernels and benchmark cells below.
# number of repeated launches per kernel in the benchmark loops
nreps = 3
# benchmark matrices are N × N
N = 2048
# element type of the benchmark matrices
T = Float32

# tile edge length; used to build the workgroup sizes passed to the kernels
TILE_DIM = 32
# second workgroup dimension for the "multiple elements per lane" kernels
BLOCK_ROWS = 8

8

### Naive kernels

In [110]:
# Baseline copy: every work-item moves exactly one matrix entry from input to output.
@kernel function simple_copy_kernel!(output, @Const(input))
    row, col = @index(Global, NTuple)
    @inbounds output[row, col] = input[row, col]
end

In [111]:
# Baseline transpose: every work-item reads input[row, col] and writes it to the
# mirrored position output[col, row].
@kernel function simple_transpose_kernel!(output, @Const(input))
    row, col = @index(Global, NTuple)
    @inbounds output[col, row] = input[row, col]
end

### Using local memory

In [112]:
# Copy `input` to `output`, staging each workgroup's tile in local memory first.
# The optional `Val{BANK}` argument (default `Val(1)`) is the number of padding
# rows added to the first tile dimension; `Val(0)` disables the padding.
# NOTE(review): with `unsafe_indices = true` and `@inbounds` there is no visible
# bounds check, so this presumably requires an ndrange that is a multiple of the
# workgroup size -- confirm against the KernelAbstractions documentation.
@kernel unsafe_indices = true function lmem_copy_kernel!(
        output, @Const(input),
        ::Val{BANK} = Val(1),
    ) where {BANK}
    # global (I, J) and workgroup-local (i, j) coordinates of this work-item
    I, J = @index(Global, NTuple)
    i, j = @index(Local, NTuple)

    # workgroup extents
    N = @uniform @groupsize()[1]
    M = @uniform @groupsize()[2]

    # Pad the first tile dimension by BANK (+1 by default) to avoid bank
    # conflicts on shared memory
    tile = @localmem eltype(output) (N + BANK, M)

    # stage this work-item's element in the tile
    @inbounds tile[i, j] = input[I, J]

    # barrier between filling the tile and reading it back
    @synchronize

    @inbounds output[I, J] = tile[i, j]
end

In [113]:
# Transpose `input` into `output` through a local-memory tile: each workgroup
# loads one tile, then writes it to the mirrored tile position of `output`,
# reading the tile with swapped local indices.
# The optional `Val{BANK}` argument (default `Val(1)`) is the number of padding
# rows added to the first tile dimension; `Val(0)` disables the padding.
# NOTE(review): the read `tile[j, i]` on a tile of size (N + BANK, M) appears to
# assume a square workgroup (N == M) -- confirm before using other shapes.
@kernel unsafe_indices = true function lmem_transpose_kernel!(
        output, @Const(input),
        ::Val{BANK} = Val(1),
    ) where {BANK}
    # workgroup coordinates (gi, gj) and coordinates inside the workgroup (i, j)
    gi, gj = @index(Group, NTuple)
    i, j = @index(Local, NTuple)

    # workgroup extents
    N = @uniform @groupsize()[1]
    M = @uniform @groupsize()[2]

    # Pad the first tile dimension by BANK (+1 by default) to avoid bank
    # conflicts on shared memory
    tile = @localmem eltype(output) (N + BANK, M)

    # Manually calculate global indexes
    # Later on we need to pivot the group index
    I = (gi - 1) * N + i
    J = (gj - 1) * M + j

    @inbounds tile[i, j] = input[I, J]

    # barrier between filling the tile and reading it back
    @synchronize

    # Pivot the group index: the destination tile uses the swapped group
    # coordinates (gj, gi)
    I = (gj - 1) * M + i
    J = (gi - 1) * N + j

    # swapped local indices perform the transpose within the tile
    @inbounds output[I, J] = tile[j, i]
end

### Local Memory + process multiple elements per lane

In [114]:
using KernelAbstractions.Extras: @unroll

In [115]:
# Copy `input` to `output` through a TILE_DIM × TILE_DIM local-memory tile where
# each work-item handles several elements: the workgroup is
# (TILE_DIM, BLOCK_ROWS) and every work-item steps through the second tile
# dimension in strides of BLOCK_ROWS.
# The optional `Val{BANK}` argument (default `Val(1)`) is the number of padding
# rows added to the first tile dimension; `Val(0)` disables the padding.
@kernel unsafe_indices=true function coalesced_copy_kernel!(
        output, @Const(input),
        ::Val{BANK} = Val(1),
    ) where {BANK}
    # workgroup coordinates (gi, gj) and coordinates inside the workgroup (i, j)
    gi, gj = @index(Group, NTuple)
    i, j = @index(Local, NTuple)

    # local names taken from the workgroup size, not the globals of the same name
    TILE_DIM = @uniform @groupsize()[1]
    BLOCK_ROWS = @uniform @groupsize()[2]

    # Pad the first tile dimension by BANK (+1 by default) to avoid bank
    # conflicts on shared memory
    tile = @localmem eltype(output) (TILE_DIM + BANK, TILE_DIM)

    # Can't use @index(Global), because we use a smaller ndrange
    I = (gi - 1) * TILE_DIM + i
    J = (gj - 1) * TILE_DIM + j

    # load: offsets k = 0, BLOCK_ROWS, 2*BLOCK_ROWS, ... below TILE_DIM
    @unroll for k in 0:BLOCK_ROWS:(TILE_DIM - 1)
        @inbounds tile[i, j + k] = input[I, J + k]
    end

    # barrier between filling the tile and reading it back
    @synchronize

    # store: same offsets, same positions
    @unroll for k in 0:BLOCK_ROWS:(TILE_DIM - 1)
        @inbounds output[I, J + k] = tile[i, j + k]
    end
end

In [116]:
# Transpose `input` into `output` through a TILE_DIM × TILE_DIM local-memory tile
# where each work-item handles several elements: the workgroup is
# (TILE_DIM, BLOCK_ROWS) and every work-item steps through the second tile
# dimension in strides of BLOCK_ROWS. The tile is written with indices
# [i, j + k] and read back with [j + k, i] into the mirrored tile of `output`.
# The optional `Val{BANK}` argument (default `Val(1)`) is the number of padding
# rows added to the first tile dimension; `Val(0)` disables the padding.
@kernel unsafe_indices = true function coalesced_transpose_kernel!(
        output, @Const(input),
        ::Val{BANK} = Val(1),
    ) where {BANK}
    # workgroup coordinates (gi, gj) and coordinates inside the workgroup (i, j)
    gi, gj = @index(Group, NTuple)
    i, j = @index(Local, NTuple)

    # local names taken from the workgroup size, not the globals of the same name
    TILE_DIM = @uniform @groupsize()[1]
    BLOCK_ROWS = @uniform @groupsize()[2]

    # Pad the first tile dimension by BANK (+1 by default) to avoid bank
    # conflicts on shared memory
    tile = @localmem eltype(output) (TILE_DIM + BANK, TILE_DIM)

    # Can't use @index(Global), because we use a smaller ndrange
    I = (gi - 1) * TILE_DIM + i
    J = (gj - 1) * TILE_DIM + j

    # load: offsets k = 0, BLOCK_ROWS, 2*BLOCK_ROWS, ... below TILE_DIM
    @unroll for k in 0:BLOCK_ROWS:(TILE_DIM - 1)
        @inbounds tile[i, j + k] = input[I, J + k]
    end

    # barrier between filling the tile and reading it back
    @synchronize

    # Transpose block offsets: the destination tile uses the swapped group
    # coordinates (gj, gi)
    I = (gj - 1) * TILE_DIM + i
    J = (gi - 1) * TILE_DIM + j

    # store: swapped tile indices perform the transpose within the tile
    @unroll for k in 0:BLOCK_ROWS:(TILE_DIM - 1)
        @inbounds output[I, J + k] = tile[j + k, i]
    end
end

### Benchmark harness

In [117]:
using NVTX, Random

In [118]:
# Backend object used by the benchmark cells below to instantiate kernels,
# allocate arrays and synchronize.
backend = CUDABackend()

CUDABackend(false, false)

In [119]:
# Profile the naive copy/transpose kernels for three workgroup shapes with the
# same number of work-items: a square tile and the two flat layouts.
CUDA.@profile for block_dims in (
        (TILE_DIM, TILE_DIM),
        (TILE_DIM * TILE_DIM, 1),
        (1, TILE_DIM * TILE_DIM),
    )
    candidates = (
        ("copy", simple_copy_kernel!(backend, block_dims)),
        ("transpose", simple_transpose_kernel!(backend, block_dims)),
    )
    for (name, kernel) in candidates
        NVTX.@range "Simple $name $block_dims" let
            input = rand!(allocate(backend, T, N, N))
            output = similar(input)
            dims = size(output)

            # first launch triggers compilation of the kernel
            kernel(output, input, ndrange = dims)
            for _ in 1:nreps
                kernel(output, input, ndrange = dims)
            end
            # wait for all queued launches before the range is closed
            KernelAbstractions.synchronize(backend)
        end
    end
end

Profiler ran for 453.64 ms, capturing 5492 events.

Host-side activity: calling CUDA APIs took 4.74 ms (1.04% of the trace)
┌──────────┬────────────┬───────┬─────────────────────────────────────┬─────────────────────────┐
│[1m Time (%) │[1m Total time │[1m Calls │[1m Time distribution                   │[1m Name                    │
├──────────┼────────────┼───────┼─────────────────────────────────────┼─────────────────────────┤
│    2.64% │[31m   11.96 ms │     6 │   1.99 ms ± 1.39   (  0.29 ‥ 4.05)  │[1m cuStreamSynchronize     │
│    0.11% │[33m  476.84 µs │     6 │  79.47 µs ± 11.75  ( 61.51 ‥ 92.74) │[1m cuModuleLoadDataEx      │
│    0.07% │  309.71 µs │    24 │   12.9 µs ± 10.49  (  4.29 ‥ 39.1)  │ cuLaunchKernel          │
│    0.06% │  264.64 µs │     6 │  44.11 µs ± 4.02   ( 36.95 ‥ 47.45) │ cuModuleGetFunction     │
│    0.04% │  164.03 µs │     6 │  27.34 µs ± 4.36   ( 22.17 ‥ 32.66) │ cudaLaunchKernel        │
│    0.03% │   147.1 µs │    12 │  12.26 µs ± 7.83   (

In [120]:
# Profile the local-memory copy/transpose kernels on square TILE_DIM workgroups,
# once with tile padding (bank = true -> Val(1)) and once without (Val(0)).
CUDA.@profile for (name, kernel) in (
        ("copy", lmem_copy_kernel!(backend, (TILE_DIM, TILE_DIM))),
        ("transpose", lmem_transpose_kernel!(backend, (TILE_DIM, TILE_DIM))),
    )
    for bank in (true, false)
        NVTX.@range "Localmem $name ($TILE_DIM, $TILE_DIM) bank=$bank" let
            input = rand!(allocate(backend, T, N, N))
            output = similar(input)
            dims = size(output)

            # first launch triggers compilation of the kernel
            kernel(output, input, Val(Int(bank)), ndrange = dims)
            for _ in 1:nreps
                kernel(output, input, Val(Int(bank)), ndrange = dims)
            end
            # wait for all queued launches before the range is closed
            KernelAbstractions.synchronize(backend)
        end
    end
end

Profiler ran for 444.98 ms, capturing 3662 events.

Host-side activity: calling CUDA APIs took 2.22 ms (0.50% of the trace)
┌──────────┬────────────┬───────┬───────────────────────────────────────┬─────────────────────────┐
│[1m Time (%) │[1m Total time │[1m Calls │[1m Time distribution                     │[1m Name                    │
├──────────┼────────────┼───────┼───────────────────────────────────────┼─────────────────────────┤
│    0.81% │[31m     3.6 ms │     4 │ 899.91 µs ± 408.36 (655.17 ‥ 1510.38) │[1m cuStreamSynchronize     │
│    0.14% │[33m  642.06 µs │     4 │ 160.52 µs ± 80.21  ( 89.41 ‥ 242.23)  │[1m cuModuleLoadDataEx      │
│    0.05% │  207.42 µs │    16 │  12.96 µs ± 11.96  (  4.29 ‥ 41.01)   │ cuLaunchKernel          │
│    0.05% │  204.32 µs │     4 │  51.08 µs ± 9.64   ( 44.11 ‥ 65.33)   │ cuModuleGetFunction     │
│    0.02% │   109.2 µs │     8 │  13.65 µs ± 8.01   (  3.81 ‥ 25.03)   │ cuMemAllocFromPoolAsync │
│    0.02% │  103.24 µs │     4 │  25.

In [121]:
# Profile the kernels that combine local memory with several elements per
# work-item, once with tile padding (bank = true -> Val(1)) and once without.
CUDA.@profile for (name, kernel) in (
        ("copy", coalesced_copy_kernel!(backend, (TILE_DIM, BLOCK_ROWS))),
        ("transpose", coalesced_transpose_kernel!(backend, (TILE_DIM, BLOCK_ROWS))),
    )
    for bank in (true, false)
        NVTX.@range "Localmem + multiple elements $name ($TILE_DIM, $BLOCK_ROWS) bank=$bank" let
            input = rand!(allocate(backend, T, N, N))
            output = similar(input)

            # We want as many blocks as a (TILE_DIM, TILE_DIM) launch would use,
            # but our blocks are only (TILE_DIM, BLOCK_ROWS). Shrink the second
            # ndrange dimension by that factor, otherwise we get too many blocks.
            block_factor = TILE_DIM ÷ BLOCK_ROWS
            ndrange = (N, N ÷ block_factor)

            # first launch triggers compilation of the kernel
            kernel(output, input, Val(Int(bank)), ndrange = ndrange)
            for _ in 1:nreps
                kernel(output, input, Val(Int(bank)), ndrange = ndrange)
            end
            # wait for all queued launches before the range is closed
            KernelAbstractions.synchronize(backend)
        end
    end
end

Profiler ran for 500.16 ms, capturing 3662 events.

Host-side activity: calling CUDA APIs took 2.48 ms (0.50% of the trace)
┌──────────┬────────────┬───────┬───────────────────────────────────────┬─────────────────────────┐
│[1m Time (%) │[1m Total time │[1m Calls │[1m Time distribution                     │[1m Name                    │
├──────────┼────────────┼───────┼───────────────────────────────────────┼─────────────────────────┤
│    0.26% │[31m    1.31 ms │     4 │ 326.34 µs ± 534.42 (  7.63 ‥ 1122.71) │[1m cuStreamSynchronize     │
│    0.19% │[33m  953.44 µs │     4 │ 238.36 µs ± 23.27  (215.05 ‥ 267.51)  │[1m cuModuleLoadDataEx      │
│    0.04% │  212.67 µs │     4 │  53.17 µs ± 11.1   ( 45.06 ‥ 69.14)   │ cuModuleGetFunction     │
│    0.04% │  183.11 µs │    16 │  11.44 µs ± 10.29  (  4.53 ‥ 30.04)   │ cuLaunchKernel          │
│    0.02% │    96.8 µs │     4 │   24.2 µs ± 4.73   ( 20.27 ‥ 30.99)   │ cudaLaunchKernel        │
│    0.02% │   91.79 µs │     8 │  11.

## Matrix multiply

In [129]:
# One work-item per output entry: output[row, col] is the dot product of
# row `row` of `a` with column `col` of `b`.
@kernel function naive_matmul_kernel!(output, a, b)
    row, col = @index(Global, NTuple)

    # accumulate in the element type of the output
    acc = zero(eltype(output))
    for k in 1:size(a, 2)
        acc += a[row, k] * b[k, col]
    end

    output[row, col] = acc
end

In [130]:
"""
    naive_matmul!(output, a, b)

Compute the matrix product `a * b` into `output` by launching
`naive_matmul_kernel!` on the backend that owns `a`, with one work-item per entry
of `output`. The launch is asynchronous; synchronize the backend before timing.

# Throws
- `DimensionMismatch` if `size(a, 2) != size(b, 1)`, or if `size(output)` is not
  `(size(a, 1), size(b, 2))`.

# Returns
`nothing`; the result is delivered by mutating `output`.
"""
function naive_matmul!(output, a, b)
    # Fail loudly instead of printing and leaving `output` silently untouched.
    size(a, 2) == size(b, 1) || throw(DimensionMismatch(
        "inner dimensions must match: a has size $(size(a)), b has size $(size(b))"
    ))
    # The kernel indexes `a` and `b` with the extents of `output`, so they must agree.
    size(output) == (size(a, 1), size(b, 2)) || throw(DimensionMismatch(
        "output has size $(size(output)), expected $((size(a, 1), size(b, 2)))"
    ))
    backend = KernelAbstractions.get_backend(a)
    kernel! = naive_matmul_kernel!(backend)
    kernel!(output, a, b, ndrange = size(output))
    return nothing
end

naive_matmul! (generic function with 1 method)

In [131]:
# Sanity check: the naive kernel must agree with the built-in product on a
# non-square problem (256×123 times 123×45).
let
    A = rand!(allocate(backend, Float32, 256, 123))
    B = rand!(allocate(backend, Float32, 123, 45))
    C = KernelAbstractions.zeros(backend, Float32, 256, 45)

    naive_matmul!(C, A, B)

    @assert C ≈ A * B
end

In [132]:
# Tiled matrix multiply: output = A * B.
# Each workgroup computes one TILE_DIM × TILE_DIM tile of `output`. It walks over
# the inner dimension in steps of TILE_DIM, staging one tile of A and one tile of
# B in local memory per step and accumulating the partial products in a private
# value. Out-of-range tile entries are filled with zero and the final store is
# guarded, so matrix sizes need not be multiples of TILE_DIM.
# The optional `Val{BANK}` argument (default `Val(1)`) is the number of padding
# rows added to the first dimension of both tiles; `Val(0)` disables the padding.
# NOTE(review): tiles are indexed [i, j] with both indices up to the workgroup
# extents but sized from `@groupsize()[1]` only -- presumably a square workgroup
# is expected; confirm.
@kernel unsafe_indices = true function coalesced_matmul_kernel!(
        output, @Const(A), @Const(B),
        ::Val{BANK} = Val(1),
    ) where {BANK}
    # workgroup coordinates (gi, gj) and coordinates inside the workgroup (i, j)
    gi, gj = @index(Group, NTuple)
    i, j = @index(Local, NTuple)

    TILE_DIM = @uniform @groupsize()[1]

    # Pad the first tile dimension by BANK (+1 by default) to avoid bank
    # conflicts on shared memory
    tile1 = @localmem eltype(output) (TILE_DIM + BANK, TILE_DIM)
    tile2 = @localmem eltype(output) (TILE_DIM + BANK, TILE_DIM)

    # private variable for tile output
    # NOTE(review): initialized with the negated zero of the element type
    # (-0.0 for floats); the reason for the sign is not visible here.
    outval = @private eltype(output) 1
    @inbounds outval[1] = -zero(eltype(output))

    # output is N × M, the inner (reduction) dimension has length R
    @uniform N = size(output, 1)
    @uniform M = size(output, 2)
    @uniform R = size(A, 2)
    # number of tiles depends on inner dimension
    # (ceiling division of R by TILE_DIM)
    @uniform NUM_TILES = div(R + TILE_DIM - 1, TILE_DIM)

    # loop over all tiles needed for this calculation
    for t in 0:(NUM_TILES - 1)
        # Can't use @index(Global), because we use a smaller ndrange
        I = (gi - 1) * TILE_DIM + i
        J = (gj - 1) * TILE_DIM + j

        # load inputs into tiles, with bounds checking for non-square matrices
        # (the 0.0 literal is converted to the tile element type on store)
        if I <= N && t * TILE_DIM + j <= R
            @inbounds tile1[i, j] = A[I, t * TILE_DIM + j]
        else
            @inbounds tile1[i, j] = 0.0
        end
        if t * TILE_DIM + i <= R && J <= M
            @inbounds tile2[i, j] = B[t * TILE_DIM + i, J]
        else
            @inbounds tile2[i, j] = 0.0
        end

        # wait for all tiles to be loaded
        @synchronize

        # get global values again
        I = (gi - 1) * TILE_DIM + i
        J = (gj - 1) * TILE_DIM + j

        # calculate value of spot in output, use temporary value to allow for vectorization
        out = zero(eltype(output))
        @simd for k in 1:TILE_DIM
            @inbounds out += tile1[i, k] * tile2[k, j]
        end
        outval[1] += out

        # barrier before the next iteration overwrites the tiles
        @synchronize
    end

    # get global indices again
    I = (gi - 1) * TILE_DIM + i
    J = (gj - 1) * TILE_DIM + j

    # save if inbounds
    if I <= N && J <= M
        @inbounds output[I, J] = outval[1]
    end
end

In [133]:
"""
    coalesced_matmul!(output, a, b)

Compute the matrix product `a * b` into `output` by launching
`coalesced_matmul_kernel!` on the backend that owns `a`, using
`(TILE_DIM, TILE_DIM)` workgroups (`TILE_DIM` is the notebook-level global) and
an ndrange equal to `size(output)`. The launch is asynchronous; synchronize the
backend before timing.

# Throws
- `DimensionMismatch` if `size(a, 2) != size(b, 1)`, or if `size(output)` is not
  `(size(a, 1), size(b, 2))`.

# Returns
`nothing`; the result is delivered by mutating `output`.
"""
function coalesced_matmul!(output, a, b)
    # Fail loudly instead of printing and leaving `output` silently untouched.
    size(a, 2) == size(b, 1) || throw(DimensionMismatch(
        "inner dimensions must match: a has size $(size(a)), b has size $(size(b))"
    ))
    # The kernel bounds-checks against the extents of `output` only and reads
    # `a` / `b` under `@inbounds`, so the shapes must agree.
    size(output) == (size(a, 1), size(b, 2)) || throw(DimensionMismatch(
        "output has size $(size(output)), expected $((size(a, 1), size(b, 2)))"
    ))
    backend = KernelAbstractions.get_backend(a)
    kernel! = coalesced_matmul_kernel!(backend, (TILE_DIM, TILE_DIM))
    kernel!(output, a, b, ndrange = size(output))
    return nothing
end

coalesced_matmul! (generic function with 1 method)

In [134]:
# Sanity check: the tiled kernel must agree with the built-in product on sizes
# that are not multiples of the tile size (256×123 times 123×45).
let
    A = rand!(allocate(backend, Float32, 256, 123))
    B = rand!(allocate(backend, Float32, 123, 45))
    C = KernelAbstractions.zeros(backend, Float32, 256, 45)

    coalesced_matmul!(C, A, B)

    @assert C ≈ A * B
end

In [135]:
import LinearAlgebra

### Exercise
- Vary N, R, M
- Vary T

In [136]:
# Profile the three matrix-multiply implementations on the same inputs:
# (N × R) * (R × M) -> (N × M).
let
    N, R, M = 1024, 512, 2048
    T = Float64
    A = rand!(allocate(backend, T, N, R))
    B = rand!(allocate(backend, T, R, M))
    C_naive = KernelAbstractions.zeros(backend, T, N, M)
    C_coalesced = KernelAbstractions.zeros(backend, T, N, M)
    C_mul = KernelAbstractions.zeros(backend, T, N, M)

    # Each implementation gets its own NVTX range and is synchronized inside it,
    # so a range covers the kernel execution and not only the launch.
    CUDA.@profile for _ in 1:nreps
        NVTX.@range "Naive Matmul" begin
            naive_matmul!(C_naive, A, B)
            KernelAbstractions.synchronize(backend)
        end

        NVTX.@range "Coalesced Matmul" begin
            coalesced_matmul!(C_coalesced, A, B)
            KernelAbstractions.synchronize(backend)
        end

        NVTX.@range "LinearAlgebra.mul!" begin
            LinearAlgebra.mul!(C_mul, A, B)
            KernelAbstractions.synchronize(backend)
        end
    end
end

Profiler ran for 508.6 ms, capturing 7434 events.

Host-side activity: calling CUDA APIs took 69.49 ms (13.66% of the trace)
┌──────────┬────────────┬───────┬──────────────────────────────────────┬─────────────────────────┐
│[1m Time (%) │[1m Total time │[1m Calls │[1m Time distribution                    │[1m Name                    │
├──────────┼────────────┼───────┼──────────────────────────────────────┼─────────────────────────┤
│   40.07% │[31m  203.78 ms │     9 │  22.64 ms ± 3.22   ( 18.84 ‥ 26.68)  │[1m cuStreamSynchronize     │
│    0.09% │[33m  446.32 µs │     2 │ 223.16 µs ± 14.84  (212.67 ‥ 233.65) │[1m cuModuleLoadDataEx      │
│    0.03% │[33m  170.23 µs │     6 │  28.37 µs ± 2.71   (  24.8 ‥ 31.95)  │[1m cuLaunchKernel          │
│    0.02% │  119.69 µs │     2 │  59.84 µs ± 13.15  ( 50.54 ‥ 69.14)  │ cuModuleGetFunction     │
│    0.02% │   89.88 µs │     6 │  14.98 µs ± 9.43   (  4.77 ‥ 24.32)  │ cuMemAllocFromPoolAsync │
│    0.02% │   86.07 µs │     6 │  1