# GPU: Monte Carlo computation of $\pi$

In [2]:
using CUDA
using BenchmarkTools

In [6]:
# Number of random samples ("darts") passed to every estimator below; 2^23 = 8388608.
const N = 2^23

8388608

## CPU: naive (for comparison)

In [12]:
"""
    compute_pi(N)

Estimate π on the CPU with `N` Monte Carlo trials.

Each trial draws a point `(x, y)` from two `rand()` calls and checks whether its
distance from the origin is below one. The fraction of such hits, multiplied by
four, is returned.
"""
function compute_pi(N)
    # `count` tallies the trials for which the dart landed inside the circle.
    hits = count(1:N) do _
        sqrt(rand()^2 + rand()^2) < 1.0
    end
    return 4 * hits / N
end

compute_pi (generic function with 1 method)

In [13]:
@btime compute_pi($N) samples = 5 evals = 3;

  34.255 ms (0 allocations: 0 bytes)


## Array programming

In [7]:
"""
    compute_pi_cuda_array(N)

Estimate π from `N` Monte Carlo samples using array-style (broadcast) operations
on arrays created with `CUDA.rand`.

Two length-`N` coordinate arrays are generated, every point is tested against
the unit circle in one broadcast expression, and the resulting flags are summed.
"""
function compute_pi_cuda_array(N)
    xs = CUDA.rand(N)
    ys = CUDA.rand(N)
    # `@.` dots every call and operator, yielding one Bool per sampled point.
    inside = @. sqrt(xs^2 + ys^2) < 1
    hits = sum(inside)
    return 4 * hits / N
end

compute_pi_cuda_array (generic function with 1 method)

In [20]:
compute_pi_cuda_array(N)

3.1406960487365723

In [8]:
@btime compute_pi_cuda_array($N) samples = 5 evals = 3;

  490.830 μs (189 allocations: 10.00 KiB)


In [37]:
"""
    compute_pi_cuda_array_alternative(N)

Estimate π from `N` Monte Carlo samples with a single `mapreduce` over an array
of x coordinates created with `CUDA.rand`.

Only the x coordinates are materialized; the y coordinate of each point is drawn
inside the mapped function, so no second array is needed.
"""
function compute_pi_cuda_array_alternative(N)
    xs = CUDA.rand(N)
    # Draw y with the same element type as x. A bare `rand()` defaults to
    # Float64, which would promote the whole expression to double precision.
    in_circle(x) = sqrt(x^2 + rand(typeof(x))^2) < 1
    M = mapreduce(in_circle, +, xs)
    return 4 * M / N
end

compute_pi_cuda_array_alternative (generic function with 1 method)

In [38]:
compute_pi_cuda_array_alternative(N)

3.140444278717041

In [39]:
@btime compute_pi_cuda_array_alternative($N) samples = 5 evals = 3;

  324.227 μs (125 allocations: 6.08 KiB)


## Kernel: Naive

In [10]:
"""
    _mc_kernel_naive!(counts)

GPU kernel performing one Monte Carlo trial per thread.

The thread with global index `i` draws one random point and stores `1` in
`counts[i]` if the point lies inside the unit circle and `0` otherwise. Threads
whose index exceeds `length(counts)` do nothing.
"""
function _mc_kernel_naive!(counts)
    # Global (1-based) index of this thread across all blocks.
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    if i <= length(counts)
        # Sample in Float32, like `_mc_kernel_shared!`; a bare `rand()` defaults
        # to Float64 and would force double-precision arithmetic on the device.
        hit = sqrt(rand(Float32)^2 + rand(Float32)^2) < 1
        # Store the outcome unconditionally so the result does not depend on
        # `counts` having been zero-initialized by the caller.
        @inbounds counts[i] = hit
    end
    return nothing
end

"""
    compute_pi_kernel_naive(N)

Estimate π from `N` Monte Carlo samples using the hand-written kernel
`_mc_kernel_naive!`, which is launched with at least one thread per sample.

The per-sample hit flags are summed afterwards and normalized by `N`.
"""
function compute_pi_kernel_naive(N)
    hit_flags = CUDA.zeros(Int32, N)
    # Compile without launching so the occupancy API can suggest a block size.
    compiled = @cuda launch = false _mc_kernel_naive!(hit_flags)
    nthreads = min(N, launch_configuration(compiled.fun).threads)
    nblocks = cld(N, nthreads)
    # Launch and wait for completion.
    CUDA.@sync compiled(hit_flags; threads=nthreads, blocks=nblocks)
    # Reduce the flags to a single hit count, then normalize.
    total_hits = sum(hit_flags)
    return 4 * total_hits / N
end

compute_pi_kernel_naive (generic function with 1 method)

In [24]:
compute_pi_kernel_naive(N)

3.1414499282836914

In [11]:
@btime compute_pi_kernel_naive($N) samples = 5 evals = 3;

  406.087 μs (138 allocations: 6.59 KiB)


## Kernel: shared memory

In [14]:
"""
    _mc_kernel_shared!(counts, iter_per_thread)

GPU kernel in which every thread performs `iter_per_thread` Monte Carlo trials
and the per-thread hit counts of a block are combined through shared memory.

The first thread of each block adds the block's total to `counts[blockIdx().x]`.

The kernel must be launched with `shmem = threads * sizeof(Int32)` bytes of
dynamic shared memory (one `Int32` slot per thread in the block), as
`compute_pi_kernel_shared` does.
"""
function _mc_kernel_shared!(counts, iter_per_thread)
    tid = threadIdx().x
    nthreads = blockDim().x

    # One slot per thread of this block. The buffer is sized from the launch
    # configuration rather than a fixed constant so any block size is valid.
    counter = CuDynamicSharedArray(Int32, nthreads)

    # Accumulate in Int32 to match the element type of the shared buffer.
    # NOTE(review): assumes iter_per_thread * threads fits in Int32 — confirm for huge N.
    num_hits = zero(Int32)
    for _ in 1:iter_per_thread
        num_hits += sqrt(rand(Float32)^2 + rand(Float32)^2) < 1
    end
    @inbounds counter[tid] = num_hits

    # make sure that all threads in the block are done
    sync_threads()
    if tid == 1 # first thread in the block
        # Sum locally and touch `counts` once instead of once per slot.
        block_hits = zero(Int32)
        for j in 1:nthreads
            @inbounds block_hits += counter[j]
        end
        @inbounds counts[blockIdx().x] += block_hits
    end
    return nothing
end

"""
    compute_pi_kernel_shared(N; threads=32, blocks=2^14)

Estimate π from `N` Monte Carlo samples using `_mc_kernel_shared!`.

The `N` samples are split evenly over `threads * blocks` GPU threads; each block
contributes one partial hit count, and the partial counts are summed afterwards.

# Keywords
- `threads`: number of threads per block.
- `blocks`: number of blocks.

# Throws
- `ArgumentError` if `N` is not divisible by `threads * blocks`.
"""
function compute_pi_kernel_shared(N; threads=32, blocks=2^14)
    # Validate before allocating anything on the device.
    iter_per_thread, r = divrem(N, threads * blocks)
    if r > 0
        throw(ArgumentError("N / $(threads * blocks) must be an integer"))
    end
    counts = CUDA.zeros(Int32, blocks)

    # The kernel keeps one Int32 per thread in dynamic shared memory.
    shmem = threads * sizeof(Int32)
    CUDA.@sync @cuda(
        threads = threads,
        blocks = blocks,
        shmem = shmem,
        _mc_kernel_shared!(counts, iter_per_thread)
    )
    return 4 * sum(counts) / N
end

compute_pi_kernel_shared (generic function with 1 method)

In [25]:
compute_pi_kernel_shared(N)

3.141425132751465

In [16]:
@btime compute_pi_kernel_shared($N) samples = 5 evals = 3;

  208.413 μs (149 allocations: 7.12 KiB)


## More Optimizations?

See e.g. https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf