In [None]:
using CUDA, Random
using Test, BenchmarkTools
CUDA.allowscalar(false)

include("perm_test.jl")
include("utils.jl")
#include("kernels/math.jl")
include("kernels/statistics.jl")

In [None]:
# Show how many Julia threads this session was started with.
Threads.nthreads()

In [None]:
# Print the current GPU memory usage as reported by CUDA.jl.
CUDA.memory_status()

---

In [None]:
# Problem size: `x` is N-by-nx and `y` is N-by-ny.
N = 100
nx = 12
ny = 8

# Fake data: seed the global RNG, draw the matrices on the host (x first,
# then y), and upload each one to the GPU.
Random.seed!(1)
x = rand(N, nx) |> CuArray
y = rand(N, ny) |> CuArray

# Output of `Utils.partition(nx, ny)`, uploaded to the GPU as `px` and `py`.
px_host, py_host = Utils.partition(nx, ny)
px = CuArray(px_host)
py = CuArray(py_host)

In [None]:
# Host copies of the GPU data, used as input to `PermTest.tconf`.
x_cpu = Array(x)
y_cpu = Array(y)

# Two `PermTest.tconf` results with `pooled=false`, differing only in `alpha`.
# NOTE(review): the names suggest alpha=0.01 gives the wider interval and
# alpha=0.1 the narrower one -- confirm against `tconf`.
wide = PermTest.tconf(x_cpu, y_cpu; alpha=0.01, pooled=false)
narrow = PermTest.tconf(x_cpu, y_cpu; alpha=0.1, pooled=false)

In [None]:
"""
    solve!(out, x, y, px, py, delta_true, wide, narrow)

CUDA kernel: for every `i` in `1:length(out)`, store the result of
`PermTestCUDA.permInterval` applied to row `i` of `x` and `y` in `out[i]`.

Work is distributed with a grid-stride loop, so any launch configuration
covers all of `out`. Intended to be launched with `@cuda`.

# Arguments
- `out`: device vector that receives one result per row.
- `x`, `y`: device matrices; row `i` of each is passed on as a view.
- `px`, `py`, `delta_true`: forwarded unchanged to `permInterval`.
- `wide`, `narrow`: device arrays indexed linearly by `i`.

Indexing is wrapped in `@inbounds`, so `x`, `y`, `wide` and `narrow` must each
provide at least `length(out)` rows / entries.
"""
function solve!(out, x, y, px, py, delta_true, wide, narrow)
    tidx = (blockIdx().x - 1) * blockDim().x + threadIdx().x  # global 1-based thread index
    stride = blockDim().x * gridDim().x                       # total number of threads in the grid
    for i in tidx:stride:length(out)
        # Pass row views: `x[i, :]` would materialise a new array, and device
        # code cannot allocate.
        @inbounds out[i] = PermTestCUDA.permInterval(
            view(x, i, :), view(y, i, :), px, py, delta_true, wide[i], narrow[i]
        )
    end
    return nothing
end

In [None]:
# Sanity check: evaluate a single row from the host with delta_true = 0.
i = 1
xi, yi = x[i, :], y[i, :]
PermTestCUDA.permInterval(xi, yi, px, py, 0, wide[i], narrow[i])

In [None]:
# Host-side reference: call `permInterval` once per row with delta_true = 0,
# then report the fraction of rows for which the result is `true`.
out = fill(false, N)
for row in eachindex(out)
    out[row] = PermTestCUDA.permInterval(
        x[row, :], y[row, :], px, py, 0, wide[row], narrow[row]
    )
end
count(out) / length(out)

In [None]:
# GPU run: allocate the result vector on the device, obtain the launch
# configuration from `Utils.set_thread_block`, upload the interval arrays,
# and launch `solve!` with delta_true = 0.
out = CUDA.zeros(Bool, N)
T, B = Utils.set_thread_block(N)
wide_gpu = CuArray(wide)
narrow_gpu = CuArray(narrow)
@cuda threads=T blocks=B solve!(out, x, y, px, py, 0, wide_gpu, narrow_gpu)