In [12]:
# GPU arrays (CUDA) and RNG seeding (Random) for the fake data below.
using CUDA, Random
using Test, BenchmarkTools
# Disallow scalar indexing of GPU arrays so accidental element-wise access raises an error.
CUDA.allowscalar(false)

# Project-local sources. Presumably these define the PermTest, Utils and PermTestCUDA
# modules referenced in later cells — their contents are not visible here.
include("perm_test.jl")
include("utils.jl")
#include("kernels/math.jl")
include("kernels/statistics.jl")



Main.PermTestCUDA

In [2]:
# Number of Julia threads available to this session.
Threads.nthreads()

8

In [3]:
# Report the current GPU memory usage.
CUDA.memory_status()

Effective GPU memory usage: 0.73% (59.125 MiB/7.936 GiB)
No memory pool is in use.

---

In [4]:
# Problem size: N rows, with nx columns in x and ny columns in y.
N = 100
nx = 12
ny = 8

# Seed the global RNG so the fake data is reproducible. The matrices are drawn on the
# host (x first, then y) and uploaded to the GPU.
Random.seed!(1)
x, y = CuArray(rand(N, nx)), CuArray(rand(N, ny))

# Index matrices produced by Utils.partition (defined elsewhere), moved to the GPU.
px, py = Utils.partition(nx, ny)
px, py = CuArray(px), CuArray(py)

([1 2 … 11 12; 1 2 … 11 13; … ; 8 10 … 19 20; 9 10 … 19 20], [20 19 … 14 13; 20 19 … 14 12; … ; 9 7 … 2 1; 8 7 … 2 1])

In [6]:
# Host copies of the data; PermTest.tconf is called on CPU arrays here.
x_cpu = Array(x)
y_cpu = Array(y)

# tconf results at two alpha levels (a vector of Float64 pairs, per the cell output),
# uploaded to the GPU for use in later cells.
wide = CuArray(PermTest.tconf(x_cpu, y_cpu; pooled=false, alpha=0.01))
narrow = CuArray(PermTest.tconf(x_cpu, y_cpu; pooled=false, alpha=0.1))

100-element CuArray{Tuple{Float64, Float64}, 1, CUDA.Mem.DeviceBuffer}:
 (-0.20033566186130905, 0.27897686653156795)
 (-0.21092714231376697, 0.29001194478888614)
 (-0.478509716590083, 0.013872684894141757)
 (-0.008643089840340068, 0.4736660337116226)
 (-0.1738778364681788, 0.31429385634148366)
 (-0.1737027099561762, 0.3362213214785722)
 (-0.481418964359429, 0.09367501082054275)
 (-0.07405379502984116, 0.40440372666416197)
 (-0.1896253945861558, 0.23516390332487835)
 (-0.16725165087659033, 0.3417676319053423)
 (-0.3966413398907492, 0.16929304287733477)
 (-0.34992751805969635, 0.10506511526771004)
 (-0.04721959277298873, 0.43480398345735904)
 ⋮
 (-0.30315138724790025, 0.1726538504404105)
 (-0.21473653925959513, 0.1917261009848089)
 (-0.05116754487991704, 0.45782022384867016)
 (0.0010848296123596168, 0.5704888096241005)
 (-0.1166337686298452, 0.27000962068316625)
 (-0.24842310752681795, 0.25189770165251457)
 (-0.15606822020907735, 0.2420074589105337)
 (-0.2630251815849355, 0.2335431979957

In [7]:
"""
    solve!(out, x, y, px, py, delta_true, wide, narrow)

CUDA kernel body. For every index `i` of `out` assigned to the calling thread, store
the result of `PermTest.permInterval` for row `i` of `x` and `y` (together with
`px`, `py`, `delta_true`, `wide[i]` and `narrow[i]`) into `out[i]`.

Each thread starts at its own global index and advances by the total number of
launched threads, so any launch configuration covers all of `out`.

Rows are passed as `view`s rather than `x[i, :]` slices: slicing materialises a new
array, and device code cannot allocate one.

Intended to be launched with `@cuda threads=... blocks=... solve!(...)`.
Returns `nothing`.
"""
function solve!(out, x, y, px, py, delta_true, wide, narrow)
    tidx = (blockIdx().x - 1) * blockDim().x + threadIdx().x  # global 1-based thread index
    stride = blockDim().x * gridDim().x                       # total threads in the grid
    for i = tidx:stride:length(out)
        # @inbounds: i never exceeds length(out); assumes x, y, wide and narrow have at
        # least that many rows/elements (as in the original code).
        @inbounds out[i] = PermTest.permInterval(
            view(x, i, :), view(y, i, :), px, py, delta_true, wide[i], narrow[i]
        )
    end
    return
end

solve! (generic function with 1 method)

In [None]:
i = 1
# Re-enables scalar indexing of GPU arrays globally (the setup cell disabled it), which
# the wide[i] / narrow[i] reads below need. The setting persists for all later cells.
CUDA.allowscalar(true)
# Single-row call to the PermTestCUDA implementation, with 0 passed as the delta argument.
PermTestCUDA.permInterval(x[i,:], y[i,:], px, py, 0, wide[i], narrow[i])

In [10]:
i = 1
# Re-enables scalar indexing of GPU arrays globally (the setup cell disabled it), which
# the wide[i] / narrow[i] reads below need. The setting persists for all later cells.
CUDA.allowscalar(true)
# Single-row call to the PermTest implementation, with 0 passed as the delta argument.
PermTest.permInterval(x[i,:], y[i,:], px, py, 0, wide[i], narrow[i])

false

In [None]:
# Evaluate PermTestCUDA.permInterval one row at a time from the host and report the
# fraction of rows for which it returned true.
# NOTE(review): wide[row] / narrow[row] are scalar reads of GPU arrays, so this relies
# on scalar indexing having been enabled by an earlier cell.
out = zeros(Bool, N)
for row in eachindex(out)
    out[row] = PermTestCUDA.permInterval(
        x[row, :], y[row, :], px, py, 0, wide[row], narrow[row]
    )
end
count(out) / length(out)

In [None]:
# Run the per-row computation on the GPU, writing one Bool per row into `out`.
out = CUDA.zeros(Bool, N)
# Launch configuration (threads per block, number of blocks) from the project helper.
T, B = Utils.set_thread_block(N)
# `wide` and `narrow` are already CuArrays, so they are passed as-is. Wrapping them in
# CuArray(...) again only created an extra device copy on every launch.
@cuda threads=T blocks=B solve!(out, x, y, px, py, 0, wide, narrow)