# Custom Kernels

In [1]:
using CUDA, Random
using Test, BenchmarkTools
CUDA.allowscalar(false)

#include("perm_test.jl")
include("utils.jl")
#include("kernels/math.jl")
#include("kernels/statistics.jl")

Main.Utils

In [7]:
function square!(out)
    """ x .+= val """
    tidx = (blockIdx().x - 1) * blockDim().x + threadIdx().x  # thread index
    stride = blockDim().x * gridDim().x                       # num. threads per block
    for i = tidx:stride:length(out)
        @inbounds out[i] = out[i]^2
    end
    return
end

out = CUDA.rand(Float64, 100, 100)
target = out.^2
T, B = Utils.set_thread_block(length(out))
@cuda threads=T blocks=B square!(out)
@test isapprox(out, target)

[32m[1mTest Passed[22m[39m
  Expression: isapprox(out, target)
   Evaluated: isapprox([0.23465727557171628 0.2485496230384655 … 0.4122893253784012 0.2561438289354201; 0.8819117397407821 0.001518806806245213 … 0.24942242498733067 0.1122157156408838; … ; 0.0010706970945601906 0.21002051256851853 … 0.08826481020023773 0.45853883556029945; 0.00938097613007951 0.7228385415041979 … 0.0051217209305691 0.6483206945775573], [0.23465727557171628 0.2485496230384655 … 0.4122893253784012 0.2561438289354201; 0.8819117397407821 0.001518806806245213 … 0.24942242498733067 0.1122157156408838; … ; 0.0010706970945601906 0.21002051256851853 … 0.08826481020023773 0.45853883556029945; 0.00938097613007951 0.7228385415041979 … 0.0051217209305691 0.6483206945775573])

In [3]:
Threads.nthreads()

8

In [4]:
CUDA.memory_status()

Effective GPU memory usage: 0.73% (59.125 MiB/7.936 GiB)
No memory pool is in use.

In [5]:
N = 2^15                    # num. samples
T, B = set_thread_block(N)
nx, ny = 12, 8              # sample sizes for each group
x, y = CUDA.rand(Float64, N, nx), CUDA.rand(Float64, N, ny)
px, py = partition(nx, ny)
px, py = CuArray(px), CuArray(py)

([1 2 … 11 12; 1 2 … 11 13; … ; 8 10 … 19 20; 9 10 … 19 20], [20 19 … 14 13; 20 19 … 14 12; … ; 9 7 … 2 1; 8 7 … 2 1])

---

## Kernels

In [86]:
Matrix(reshape(collect(1:9), 3, 3))[1,:]

3-element Vector{Int64}:
 1
 4
 7

In [80]:
using Statistics
include("kernels/statistics.jl")
a = CUDA.rand(Float64, 100, 10)
target = var(a, dims=2)
out = PermTestCUDA.var(a)[1]
@test isapprox(target, out)



[32m[1mTest Passed[22m[39m
  Expression: isapprox(target, out)
   Evaluated: isapprox([0.031643789382944466; 0.055284453780260846; … ; 0.08277640566673716; 0.11871314429307622;;], Float32[0.031643815, 0.055284448, 0.091377124, 0.030687094, 0.08767226, 0.08518992, 0.04003726, 0.10078626, 0.0980054, 0.12109852  …  0.07021329, 0.10216088, 0.12837608, 0.13024433, 0.11270077, 0.085193425, 0.078948736, 0.06943999, 0.08277639, 0.11871317])

In [39]:
function square!(out)
    """ x .+= val """
    tidx = (blockIdx().x - 1) * blockDim().x + threadIdx().x  # thread index
    stride = blockDim().x * gridDim().x                       # num. threads per block
    for r = tidx:stride:size(out,1)
        for c = 1:size(out,2)
            @inbounds out[r, c] = out[r, c]^2
        end
    end
    return
end

square! (generic function with 1 method)

In [40]:
out = CUDA.rand(Float64, 100, 10)
target = out.^2
T, B = Utils.set_thread_block(size(out,1))
@cuda threads=T blocks=B square!(out)
@test isapprox(target, out)

[32m[1mTest Passed[22m[39m
  Expression: isapprox(target, out)
   Evaluated: isapprox([0.6830883349302 0.9140290064841784 … 0.17637343025728977 0.0036003048038238433; 0.23035979612363108 0.9108429008292139 … 0.25333633479481776 0.1679700503645073; … ; 0.7853554052148567 0.003943807447645813 … 0.31419032161152644 0.5503153213588036; 0.736625634419431 0.021934806109289547 … 0.6361498940168451 0.4656112816149098], [0.6830883349302 0.9140290064841784 … 0.17637343025728977 0.0036003048038238433; 0.23035979612363108 0.9108429008292139 … 0.25333633479481776 0.1679700503645073; … ; 0.7853554052148567 0.003943807447645813 … 0.31419032161152644 0.5503153213588036; 0.736625634419431 0.021934806109289547 … 0.6361498940168451 0.4656112816149098])

In [2]:
function row_sum!(out, x)
    """ x .+= val """
    tidx = (blockIdx().x - 1) * blockDim().x + threadIdx().x  # thread index
    stride = blockDim().x * gridDim().x                       # num. threads per block
    for r = tidx:stride:length(out)
        for c = 1:size(x,2)
            @inbounds out[r] += x[r,c]
        end
    end
    return
end

row_sum! (generic function with 1 method)

In [20]:
a = CUDA.rand(Float64, 100, 10)
target = sum(a, dims=2)
out = CUDA.zeros(Float64, size(a,1))
T, B = Utils.set_thread_block(size(a,1))
@cuda threads=T blocks=B row_sum!(out, a)
@test isapprox(target, out)

[32m[1mTest Passed[22m[39m
  Expression: isapprox(target, out)
   Evaluated: isapprox([4.513177650579503; 4.707070371097462; … ; 4.16275040476062; 5.027291084577776;;], [4.513177650579504, 4.707070371097462, 4.226460164619164, 3.3186319350571756, 3.6231268319745857, 4.887888647001304, 5.507584044270206, 3.8865381301405915, 5.644757372965104, 4.5393377960755235  …  4.065184384067538, 4.8395204141298676, 5.6013532283906935, 3.1759178872690543, 6.369728590588395, 3.8164773268197774, 3.927456633365984, 4.123006611411046, 4.16275040476062, 5.027291084577776])

In [9]:
out

100-element CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}:
 4.048030390850599
 5.200961905942131
 5.744431932054138
 6.508670961128315
 5.325939455564748
 4.696683050727755
 6.02965665417655
 5.032443353611986
 6.4608935730751345
 5.2153453976357635
 6.077999765163987
 4.541759223675693
 3.4836409128999586
 ⋮
 4.7296249234684895
 6.829989160851109
 4.504849508609454
 4.644734794130288
 3.198029939254319
 4.2352031453029815
 5.6458122959549115
 4.741192509799889
 4.25002196521156
 5.0967300772110375
 3.0285168049507147
 4.555555410215136

In [20]:
a = CUDA.zeros(10)
@cuda threads=256 blocks=1 add!(a, 1)
a
# all(a .== 1)

10-element CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}:
 1.0
 1.0
 1.0
 1.0
 1.0
 1.0
 1.0
 1.0
 1.0
 1.0

thread 1, block 256
thread 2, block 256
thread 3, block 256
thread 4, block 256
thread 5, block 256
thread 6, block 256
thread 7, block 256
thread 8, block 256
thread 9, block 256
thread 10, block 256
thread 11, block 256
thread 12, block 256
thread 13, block 256
thread 14, block 256
thread 15, block 256
thread 16, block 256
thread 17, block 256
thread 18, block 256
thread 19, block 256
thread 20, block 256
thread 21, block 256
thread 22, block 256
thread 23, block 256
thread 24, block 256
thread 25, block 256
thread 26, block 256
thread 27, block 256
thread 28, block 256
thread 29, block 256
thread 30, block 256
thread 31, block 256
thread 32, block 256
thread 65, block 256
thread 66, block 256
thread 67, block 256
thread 68, block 256
thread 69, block 256
thread 70, block 256
thread 71, block 256
thread 72, block 256
thread 73, block 256
thread 74, block 256
thread 75, block 256
thread 76, block 256
thread 77, block 256
thread 78, block 256
thread 79, block 256
thread 80, block 256
t

---

## Benchmarks

### t Test Statistic: `@cuda` vs. `@.` vectorized

In [51]:
@btime t_gpu($x, $y, pooled=$pooled)
@btime ttest_ind($x, $y, $pooled)

  248.894 μs (428 allocations: 22.92 KiB)
  706.026 μs (340 allocations: 19.33 KiB)


32768×1 CuArray{Float64, 2, CUDA.Mem.DeviceBuffer}:
 -0.23448901664480554
  0.05901404020274746
  0.38193147235097874
 -1.1025761619685284
 -0.37842271181044496
 -0.5640191022990872
  0.9204858513562348
 -0.5540337831883215
 -2.022682039847497
 -0.30371442721737457
  1.5446552677841405
 -0.3557875158409345
 -1.23487851962008
  ⋮
 -0.41275974205144184
 -1.993862076306121
 -0.7057823538090785
 -0.6766744521720237
  2.142521777498816
 -1.7249230335734638
  1.0848105576581777
  1.2222689568584562
 -0.5442251380071487
  0.11810933367877134
  0.5399342464957391
  1.2943106132842193

### Copying

In [23]:
a = CUDA.rand(N)
b = CUDA.zeros(N)

@btime copyto!($b, $a)
@btime @cuda threads=T blocks=B copy_arr_gpu!(b, a)
@test all(a .== b)

  2.525 μs (0 allocations: 0 bytes)
  6.128 μs (24 allocations: 1.28 KiB)


[32m[1mTest Passed[22m[39m
  Expression: all(a .== b)

### Initializing `CuArray`s

In [48]:
@btime CuArray{Float64, 1}(undef, 10_000)
@btime CUDA.zeros(Float64, 10_000)
@btime CuArray(zeros(10_000))

  4.660 μs (4 allocations: 128 bytes)
  12.167 μs (22 allocations: 1.11 KiB)
  26.057 μs (7 allocations: 78.31 KiB)


10000-element CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}:
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 ⋮
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0