# GPU: Monte Carlo computation of $\pi$

In [1]:
using CUDA
using BenchmarkTools

In [2]:
# number of random samples ("darts") passed to each of the π estimators below
const N = 2^23

8388608

## CPU: naive (for comparison)

In [3]:
"""
    compute_pi(N)

Estimate π on the CPU with a sequential Monte Carlo loop.

Draws `N` points `(x, y)` with `rand()` (uniform in the unit square), counts how
many lie at a distance below 1 from the origin, and returns `4 * hits / N`.
"""
function compute_pi(N)
    hits = 0 # darts that landed inside the quarter circle
    for _ in 1:N
        x = rand()
        y = rand()
        # a `Bool` adds as 0 or 1, so this counts the hits
        hits += sqrt(x^2 + y^2) < 1.0
    end
    return 4 * hits / N
end

compute_pi (generic function with 1 method)

In [4]:
@btime compute_pi($N) samples = 5 evals = 3;

  34.259 ms (0 allocations: 0 bytes)


## Array programming

In [5]:
"""
    compute_pi_cuda_array(N)

Estimate π with whole-array operations on samples drawn by `CUDA.rand`.

Draws `N` x- and `N` y-coordinates, flags every point whose distance from the
origin is below 1, sums the flags, and returns `4 * hits / N`.
"""
function compute_pi_cuda_array(N)
    x = CUDA.rand(N)
    y = CUDA.rand(N)
    inside = @. sqrt(x^2 + y^2) < 1 # one flag per sample
    hits = sum(inside)
    return 4 * hits / N
end

compute_pi_cuda_array (generic function with 1 method)

In [6]:
compute_pi_cuda_array(N)

3.141599655151367

In [7]:
@btime compute_pi_cuda_array($N) samples = 5 evals = 3;

  487.680 μs (189 allocations: 10.00 KiB)


In [18]:
"""
    compute_pi_cuda_array_alternative(N)

Estimate π with a single `mapreduce` over a length-`N` array from `CUDA.zeros`.

The mapped function ignores its argument and draws a fresh random point per
element; the resulting hit flags are added up and `4 * hits / N` is returned.
"""
function compute_pi_cuda_array_alternative(N)
    # The values of this array are never read: it only provides `N` elements of
    # the right array type for `mapreduce` to dispatch on (wasteful, of course).
    dummy = CUDA.zeros(N)
    hits = mapreduce(+, dummy) do _
        sqrt(rand()^2 + rand()^2) < 1
    end
    return 4 * hits / N
end

compute_pi_cuda_array_alternative (generic function with 1 method)

In [19]:
compute_pi_cuda_array_alternative(N)

3.1411476135253906

In [20]:
@btime compute_pi_cuda_array_alternative($N) samples = 5 evals = 3;

  312.180 μs (128 allocations: 6.14 KiB)


## Kernel: Naive

In [10]:
# Kernel: one sample per thread. A thread whose global index falls within
# `counts` draws one random point and stores 1 in its own slot when the point
# lies at a distance below 1 from the origin; other slots are left untouched.
function _mc_kernel_naive!(counts)
    tid = threadIdx().x + (blockIdx().x - 1) * blockDim().x
    # short-circuit: threads past the end of `counts` draw no random numbers
    if tid <= length(counts) && sqrt(rand()^2 + rand()^2) < 1
        @inbounds counts[tid] = 1
    end
    return nothing
end

"""
    compute_pi_kernel_naive(N)

Estimate π by launching `_mc_kernel_naive!` over an `Int32` array with one slot
per sample, then returning `4 * sum(slots) / N`.

The number of threads per block is taken from `launch_configuration` (capped at
`N`), and enough blocks are launched to cover all `N` slots.
"""
function compute_pi_kernel_naive(N)
    hits = CUDA.zeros(Int32, N) # one slot per sample, initially 0
    # build the kernel object without launching it, so that the occupancy API
    # can suggest a launch configuration
    kernel = @cuda launch = false _mc_kernel_naive!(hits)
    nthreads = min(N, launch_configuration(kernel.fun).threads)
    nblocks = cld(N, nthreads)
    # launch and wait for completion
    CUDA.@sync kernel(hits; threads=nthreads, blocks=nblocks)
    # reduce the hit flags and normalize
    return 4 * sum(hits) / N
end

compute_pi_kernel_naive (generic function with 1 method)

In [24]:
compute_pi_kernel_naive(N)

3.1414499282836914

In [11]:
@btime compute_pi_kernel_naive($N) samples = 5 evals = 3;

  406.087 μs (138 allocations: 6.59 KiB)


## Kernel: shared memory

In [27]:
# Default number of threads per block used when launching the shared-memory kernel.
const NUM_THREADS_PER_BLOCK = 32

# Kernel: every thread draws `iter_per_thread` random points and counts its own
# hits; the first thread of each block then adds up the per-thread counts of the
# block and accumulates the total into `counts[blockIdx().x]`.
#
# Must be launched with `shmem >= blockDim().x * sizeof(Int32)` bytes of dynamic
# shared memory, since the per-block buffer is sized by the actual block size.
function _mc_kernel_shared!(counts, iter_per_thread)
    tid = threadIdx().x
    nthreads = blockDim().x

    # shared memory to use within the block of threads: one slot per thread
    counter = CuDynamicSharedArray(Int32, nthreads)

    num_hits = zero(Int32) # same element type as `counter` and `counts`
    for _ in 1:iter_per_thread
        num_hits += sqrt(rand(Float32)^2 + rand(Float32)^2) < 1
    end
    @inbounds counter[tid] = num_hits

    # make sure that all threads in the block have stored their count
    sync_threads()
    if tid == 1 # first thread in the block
        # accumulate locally and touch `counts` only once per block
        block_hits = zero(Int32)
        for j in 1:nthreads
            @inbounds block_hits += counter[j]
        end
        @inbounds counts[blockIdx().x] += block_hits
    end
    return nothing
end

"""
    compute_pi_kernel_shared(N; threads=NUM_THREADS_PER_BLOCK, blocks=2^14)

Estimate π by launching `_mc_kernel_shared!` with `blocks` blocks of `threads`
threads, each thread drawing `N ÷ (threads * blocks)` random points.

The kernel writes one hit count per block; the result is
`4 * sum(counts) / N`.

# Throws
- `ArgumentError` if `N` is not an integer multiple of `threads * blocks`.
"""
function compute_pi_kernel_shared(N; threads=NUM_THREADS_PER_BLOCK, blocks=2^14)
    counts = CUDA.zeros(Int32, blocks)
    iter_per_thread, r = divrem(N, threads * blocks)
    if r > 0
        throw(ArgumentError("N / $(threads * blocks) must be an integer"))
    end

    CUDA.@sync @cuda(
        threads = threads,
        blocks = blocks,
        # one Int32 slot of dynamic shared memory per thread in the block
        shmem = threads * sizeof(Int32),
        _mc_kernel_shared!(counts, iter_per_thread)
    )
    return 4 * sum(counts) / N
end

compute_pi_kernel_shared (generic function with 1 method)

In [25]:
compute_pi_kernel_shared(N)

3.142136573791504

In [26]:
@btime compute_pi_kernel_shared($N) samples = 5 evals = 3;

  191.763 μs (149 allocations: 7.12 KiB)


## More Optimizations?

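For further ways to speed up the per-block reduction, see, e.g., NVIDIA's "Optimizing Parallel Reduction in CUDA" slides: https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf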