# Setup

In [1]:
# Report how many threads this Julia session has available.
Threads.nthreads()

8

In [2]:
using Distributed, Folds, FLoops
using Random, Distributions

In [3]:
include("perm_test.jl")
include("partition.jl")

partition (generic function with 1 method)

In [4]:
# Simulation setup: n replicates, each with n1 + n2 draws from d.
n, n1, n2 = 256, 12, 8
d = Normal()
delta_true = 0
# NOTE(review): other cells destructure partition's result into two values
# (parts1, parts2); here it is kept as a single value — confirm this is intended.
parts = partition(n1, n2)
# Seed the global RNG so the draws below are reproducible.
Random.seed!(123)
x1s = rand(d, (n, n1))
x2s = rand(d, (n, n2))
# One replicate per row: the first n1 columns come from x1s, the remaining n2 from x2s.
data = hcat(x1s, x2s)

256×20 Matrix{Float64}:
 -0.645731     1.34194    -0.19564   -1.05875    …  -1.35274    -0.370654
 -1.46325     -1.18862    -0.915458  -0.920206       1.14854    -0.921302
 -1.6236       1.18954    -0.167391   2.08291        0.297835   -0.483097
 -0.217665    -1.6712     -0.75522   -0.281233       0.425549    1.91035
  0.492246     0.190436   -1.00009    0.0599196     -0.260339   -1.41726
  0.98098     -1.27217    -0.498621   0.179808   …   1.58057    -0.460531
  0.0799568   -0.803196    0.802551  -0.437841      -0.936614   -0.243236
  1.54912      1.97076     0.146786   0.0807004     -1.3201      1.90304
 -1.34161      1.54823    -0.922078   1.58911       -0.114953    0.109298
  0.412162    -0.126723    1.22042    0.668593      -0.399888   -0.910424
  0.593197     0.174584   -0.223876   0.749044   …  -0.372004   -0.0788338
 -0.768409     0.267046   -0.966475  -0.163043      -1.16322    -0.253188
 -0.0761679    1.326      -0.950514   1.42283        1.16112     0.874681
  ⋮             

In [5]:
# Small fixed samples used to sanity-check the included helpers.
x1 = [0.69646919, 0.28613933, 0.22685145, 0.55131477, 0.71946897]
x2 = [0.42310646, 0.9807642 , 0.68482974, 0.4809319 , 0.39211752]
# NOTE(review): the reference values in the trailing comment differ from the value
# this cell displays — confirm which settings the reference corresponds to.
tconf(x1, x2)  # (-0.4422928901381373, 0.2496904486215163)

(-0.41835631489929964, 0.2257538708992997)

In [6]:
# t statistic for the two small samples. The third positional argument presumably
# selects pooled variance (t_gpu's `pooled` flag is checked against it) — confirm.
ttest_ind(x1, x2, true)

1-element Vector{Float64}:
 -0.6418391945147361

In [7]:
# Settings and index partitions for the small x1/x2 example.
# NOTE(review): pooled, alpha and alternative are not referenced by any later
# visible cell — confirm they are still needed.
pooled, alpha, alternative = true, 0.05, "two-sided"
parts1, parts2 = partition(length(x1), length(x2))

([1 2 … 4 5; 1 2 … 4 6; … ; 5 7 … 9 10; 6 7 … 9 10], [10 9 … 7 6; 10 9 … 7 5; … ; 6 4 … 2 1; 5 4 … 2 1])

In [13]:
# Run permInterval on the small example with the last argument set to 0
# (presumably the hypothesised difference being checked — confirm).
permInterval(x1, x2, parts1, parts2, 0)

true

In [63]:
# p_start = 0.023809523809523808, p_end= 0.09523809523809523
# p_start = 0.12698412698412698,  p_end= 0.007936507936507936

---

# CPU Programming

In [None]:
# Time a single replicate: split the first row of data into its two samples.
x = data[1,:]
x1, x2 = x[1:n1], x[n1+1:end]
# NOTE(review): other cells pass parts1 and parts2 as separate arguments; confirm
# that a permInterval method taking a single `parts` value exists.
@time permInterval(x1, x2, parts, delta_true)

In [None]:
# Sum permInterval's result over all n rows using Distributed's (+) reduction.
# NOTE(review): no addprocs call is visible in this notebook, so this presumably
# runs on the main process only — confirm.
@time @distributed (+) for i in 1:n
    permInterval(data[i, 1:n1], data[i, n1+1:end], parts, delta_true)
end

In [None]:
# Same per-row sum, written as a FLoops reduction that accumulates into `coverage`.
@time @floop for row in eachrow(data)
    @reduce(coverage += permInterval(row[1:n1], row[n1+1:end], parts, delta_true))
end

In [None]:
# Same per-row sum via Folds, applying permInterval to each row and adding the
# results in a single mapreduce call.
@time Folds.mapreduce(
    row -> permInterval(row[1:n1], row[n1+1:end], parts, delta_true), +, eachrow(data)
)

---

# GPU Programming

In [14]:
using CUDA, Test, Random

# Test data for the GPU experiments: N rows, with n1 and n2 columns per sample.
N = 2^10
n1, n2 = 12, 8
Random.seed!(123)
# Draw on the host with the seeded RNG, then wrap each matrix in a CuArray.
x1 = CuArray(rand(N, n1))
x2 = CuArray(rand(N, n2))

1024×8 CuArray{Float64, 2, CUDA.Mem.DeviceBuffer}:
 0.919181   0.292579   0.825846   …  0.0617123  0.741307   0.979005
 0.426019   0.149323   0.489151      0.976627   0.772621   0.414016
 0.746586   0.188196   0.699858      0.840765   0.314378   0.474786
 0.819201   0.566957   0.539838      0.120534   0.440371   0.926076
 0.954159   0.0496869  0.948309      0.999903   0.911686   0.574866
 0.845895   0.506965   0.0744135  …  0.635452   0.706518   0.824061
 0.586749   0.0930013  0.0117718     0.771802   0.915362   0.303661
 0.121813   0.997974   0.398319      0.674117   0.935987   0.963485
 0.789493   0.709287   0.143511      0.0295761  0.993146   0.265231
 0.619259   0.910902   0.241169      0.533718   0.874191   0.0167825
 0.477645   0.436953   0.220376   …  0.373497   0.109393   0.71381
 0.804193   0.673175   0.129352      0.875639   0.503859   0.0513694
 0.123538   0.6756     0.303669      0.694664   0.481859   0.681448
 ⋮                                ⋱  ⋮                     
 0.7

## Student T Distribution

Note that this only covers computing the [t-test statistic](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ttest_ind.html), not calculations involving the t distribution itself (e.g., cdf, pdf, quantiles).

In [74]:
"""
    mean_gpu(x, d)

Return the arithmetic mean of `x` along dimension `d`.

The reduced dimension is kept with length 1, as with `sum(x; dims=d)`.
"""
function mean_gpu(x, d)
    # Divide into a fresh array instead of in place: `./=` has to store the
    # quotient in the element type of `sum(x)`, which throws `InexactError`
    # for integer input whose mean is not an integer.
    return sum(x; dims=d) ./ size(x, d)
end

"""
    sum_sq_gpu(x, d)

Return the sum of the squared elements of `x` along dimension `d`.

The reduced dimension is kept with length 1.
"""
function sum_sq_gpu(x, d)
    squared = x .^ 2
    return sum(squared; dims=d)
end

"""
    var_gpu(x)

Return the variance of `x` along its last dimension.

The count, mean and sum of squares are computed along that dimension and
passed on to `var_gpu(n, mean, ss)`.
"""
function var_gpu(x)
    lastdim = ndims(x)
    count = size(x, lastdim)
    return var_gpu(count, mean_gpu(x, lastdim), sum_sq_gpu(x, lastdim))
end

"""
    var_gpu(n, mean, ss)

Return the variance computed elementwise from the count `n`, the mean `mean`
and the sum of squares `ss`, as `(ss - n * mean^2) / (n - 1)`.
"""
function var_gpu(n, mean, ss)
    return (ss .- n .* mean .^ 2) ./ (n - 1)
end

var_gpu (generic function with 4 methods)

In [60]:
using Statistics

# Compare with ≈ rather than exact ==: two different mean implementations need
# not agree to the last bit in floating point.
@test mean_gpu(x1, ndims(x1)) ≈ Statistics.mean(x1, dims=2)

[32m[1mTest Passed[22m[39m
  Expression: all(mean_gpu(x1, ndims(x1)) .== Statistics.mean(x1, dims = 2))

In [61]:
# Compare with ≈ rather than exact ==, so the check does not depend on both sides
# reducing in the same floating-point order.
@test sum_sq_gpu(x1, ndims(x1)) ≈ sum(x1.^2, dims=2)

[32m[1mTest Passed[22m[39m
  Expression: all(sum_sq_gpu(x1, ndims(x1)) .== sum(x1 .^ 2, dims = 2))

In [75]:
# Check the three-argument var_gpu against Statistics.var.
# The intermediates live in a `let` block so they do not overwrite the notebook
# globals `d` (the distribution) and `n` (the replicate count) from the setup cell.
let d = ndims(x1), n = size(x1, d)
    m = mean_gpu(x1, d)
    ss = sum_sq_gpu(x1, d)
    @test isapprox(var_gpu(n, m, ss), Statistics.var(x1, dims=2))
end

[32m[1mTest Passed[22m[39m
  Expression: isapprox(var_gpu(n, m, ss), Statistics.var(x1, dims = 2))
   Evaluated: isapprox([0.08112273805746789; 0.056394516192776356; … ; 0.08123599538480565; 0.08010810137533932;;], [0.08112273805746789; 0.05639451619277636; … ; 0.08123599538480553; 0.08010810137533936;;])

In [76]:
# Single-argument method: derives the count, mean and sum of squares from x1 itself.
@test isapprox(var_gpu(x1), Statistics.var(x1, dims=2))  # test overloading

[32m[1mTest Passed[22m[39m
  Expression: isapprox(var_gpu(x1), Statistics.var(x1, dims = 2))
   Evaluated: isapprox([0.08112273805746789; 0.056394516192776356; … ; 0.08123599538480565; 0.08010810137533932;;], [0.08112273805746789; 0.05639451619277636; … ; 0.08123599538480553; 0.08010810137533936;;])

In [79]:
"""
    t_gpu(x1, x2, pooled)

Compute the two-sample t statistic between `x1` and `x2`.

Both inputs are reduced along the last dimension of `x1`; the reduced
dimension is kept with length 1.

# Arguments
- `x1`, `x2`: sample arrays; their sizes along the reduced dimension may differ.
- `pooled`: if `true`, use the pooled (equal-variance) estimate with
  `n1 + n2 - 2` degrees of freedom; if `false`, use the separate-variance
  (Welch) standard error.
"""
function t_gpu(x1, x2, pooled)
    d = ndims(x1)
    n1, n2 = size(x1, d), size(x2, d)
    mean1, mean2 = mean_gpu(x1, d), mean_gpu(x2, d)
    var1 = var_gpu(n1, mean1, sum_sq_gpu(x1, d))
    var2 = var_gpu(n2, mean2, sum_sq_gpu(x2, d))
    if pooled
        # Weight each sample variance by its degrees of freedom.
        dof = n1 + n2 - 2
        return @. (mean1 - mean2) /
            sqrt(((n1 - 1) * var1 + (n2 - 1) * var2) / dof * (1 / n1 + 1 / n2))
    end
    return @. (mean1 - mean2) / sqrt(var1 / n1 + var2 / n2)
end

t_gpu (generic function with 1 method)

In [80]:
# Check the GPU t statistic against ttest_ind, for the pooled=false case only.
@test isapprox(t_gpu(x1, x2, false), ttest_ind(x1, x2, false))

[32m[1mTest Passed[22m[39m
  Expression: isapprox(t_gpu(x1, x2, false), ttest_ind(x1, x2, false))
   Evaluated: isapprox([-0.8083415417660851; -0.022399183457034788; … ; 0.2929699077440149; -0.7860528801237276;;], [-0.808341541766085; -0.022399183457034778; … ; 0.29296990774401493; -0.7860528801237273;;])

## Permutation Test p-value

In [85]:
# Build the two partition index matrices, then wrap both in CuArrays.
parts1, parts2 = partition(n1, n2)
parts1, parts2 = CuArray(parts1), CuArray(parts2)

([1 2 … 11 12; 1 2 … 11 13; … ; 8 10 … 19 20; 9 10 … 19 20], [20 19 … 14 13; 20 19 … 14 12; … ; 9 7 … 2 1; 8 7 … 2 1])

In [88]:
# Call pval on the first row of each sample with the CuArray partitions
# (presumably the permutation-test p-value, per the section heading — confirm).
pval(x1[1,:], x2[1,:], parts1, parts2)

0.4248551242359292

## Binary Search

In [89]:
# Host (Matrix) copies of the two samples, for running the same code on the CPU.
x1_cpu = Matrix(x1)
x2_cpu = Matrix(x2)

1024×8 Matrix{Float64}:
 0.919181   0.292579   0.825846   …  0.0617123  0.741307   0.979005
 0.426019   0.149323   0.489151      0.976627   0.772621   0.414016
 0.746586   0.188196   0.699858      0.840765   0.314378   0.474786
 0.819201   0.566957   0.539838      0.120534   0.440371   0.926076
 0.954159   0.0496869  0.948309      0.999903   0.911686   0.574866
 0.845895   0.506965   0.0744135  …  0.635452   0.706518   0.824061
 0.586749   0.0930013  0.0117718     0.771802   0.915362   0.303661
 0.121813   0.997974   0.398319      0.674117   0.935987   0.963485
 0.789493   0.709287   0.143511      0.0295761  0.993146   0.265231
 0.619259   0.910902   0.241169      0.533718   0.874191   0.0167825
 0.477645   0.436953   0.220376   …  0.373497   0.109393   0.71381
 0.804193   0.673175   0.129352      0.875639   0.503859   0.0513694
 0.123538   0.6756     0.303669      0.694664   0.481859   0.681448
 ⋮                                ⋱  ⋮                     
 0.722624   0.636694   0.459694

In [90]:
# Host (Matrix) copies of the two partition index matrices.
p1, p2 = Matrix(parts1), Matrix(parts2)

([1 2 … 11 12; 1 2 … 11 13; … ; 8 10 … 19 20; 9 10 … 19 20], [20 19 … 14 13; 20 19 … 14 12; … ; 9 7 … 2 1; 8 7 … 2 1])

In [93]:
# permInterval on the CuArray inputs (first row of each sample), with pooled=false.
permInterval(x1[1,:], x2[1,:], parts1, parts2, 0, pooled=false)

true

In [91]:
# The same call on the host copies of the samples and partitions.
permInterval(x1_cpu[1,:], x2_cpu[1,:], p1, p2, 0, pooled=false)

true

---

In [None]:
"""
    gpu_sum!(x, y)

CUDA kernel that adds every element of `x` into `y[1]`.

`y[1]` is accumulated into, not overwritten, so it must hold the desired
starting value (normally zero) before the kernel is launched.
"""
function gpu_sum!(x, y)
    index = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    stride = gridDim().x * blockDim().x
    # Each thread first sums its own strided share of x into a private value.
    partial = zero(eltype(y))
    for i in index:stride:length(x)
        @inbounds partial += x[i]
    end
    # Publish with a single atomic add: a plain `y[1] += ...` is an
    # unsynchronized read-modify-write, so concurrent threads lose updates.
    CUDA.@atomic y[1] += partial
    return nothing
end

In [None]:
# Launch gpu_sum! over x and compare the accumulated y[1] with a reference sum.
# NOTE(review): x and y are not defined in the visible cells — presumably device
# arrays, with y a one-element array zeroed before the launch; confirm.
numblocks = cld(length(x), 256)
CUDA.@sync @cuda threads=256 blocks=numblocks gpu_sum!(x, y)
# The kernel's result is the scalar in y[1], so compare it with sum(x) (approximately,
# since the summation order differs) rather than comparing x with y elementwise.
@test sum(x) ≈ Array(y)[1]

In [None]:
# Launch gpu_add3! on (y, x) with 256 threads per block and enough blocks to
# cover length(y), wrapped in CUDA.@sync so the call is synchronized.
# NOTE(review): gpu_add3! is not defined in the visible cells of this notebook —
# confirm it exists before running.
function bench_gpu3!(y, x)
    nthreads = 256
    nblocks = cld(length(y), nthreads)
    return CUDA.@sync @cuda threads=nthreads blocks=nblocks gpu_add3!(y, x)
end

In [None]:
using BenchmarkTools

# Benchmark the synchronized launch wrapper; the arguments are `$`-interpolated
# into the benchmark expression.
# NOTE(review): y_d and x_d are not defined in the visible cells — confirm.
@btime bench_gpu3!($y_d, $x_d)