# A simple example on the CPU

In [1]:
# Problem size: 2^24 (16,777,216) elements per vector.
N = 2^24
# Two Float32 vectors of length N, filled with 1.0 and 2.0 respectively.
# The trailing `;` suppresses the cell output.
x = fill(1.0f0, N);
y = fill(2.0f0, N);

In [2]:
# Size of `x` in bytes divided by 10^6, i.e. its footprint in megabytes.
sizeof(x) / 1e6

67.108864

In [3]:
# In-place broadcast: add `x` to `y` element-wise and store the result back in `y`.
y .+= x

16777216-element Array{Float32,1}:
 3.0
 3.0
 3.0
 3.0
 3.0
 3.0
 3.0
 3.0
 3.0
 3.0
 3.0
 3.0
 3.0
 ⋮  
 3.0
 3.0
 3.0
 3.0
 3.0
 3.0
 3.0
 3.0
 3.0
 3.0
 3.0
 3.0

In [4]:
using Test
# `y` started at 2 and had `x` (all ones) added once, so every element must be exactly 3.
@test all(y .== 3.0f0)

[32m[1mTest Passed[22m[39m

# Parallelization on the CPU

In [5]:
"""
    sequential_add!(y, x)

Add every element of `x` to the matching element of `y`, in place, using a plain
sequential loop. Returns `nothing`.
"""
function sequential_add!(y, x)
    # `eachindex(y, x)` produces indices valid for both arrays (and errors if they do
    # not share the same indices), which is what justifies `@inbounds` below.
    shared = eachindex(y, x)
    @inbounds for k in shared
        y[k] = y[k] + x[k]
    end
    return nothing
end

# Reset `y` to 2 so the check does not depend on what earlier cells did to it.
fill!(y, 2)
sequential_add!(y, x)
@test all(y .== 3.0f0)

[32m[1mTest Passed[22m[39m

In [6]:
"""
    parallel_add!(y, x)

Add `x` to `y` element-wise, in place, with the loop iterations distributed over the
available Julia threads by `Threads.@threads`. Returns `nothing`.
"""
function parallel_add!(y, x)
    shared = eachindex(y, x)
    # Each iteration reads and writes only its own index `k`, so iterations are
    # independent of each other.
    Threads.@threads for k in shared
        @inbounds y[k] = y[k] + x[k]
    end
    return nothing
end

# Reset `y` to 2 so the check does not depend on what earlier cells did to it.
fill!(y, 2)
parallel_add!(y, x)
@test all(y .== 3.0f0)

[32m[1mTest Passed[22m[39m

## Simple Benchmarks

In [7]:
using BenchmarkTools

In [8]:
# `$y` and `$x` interpolate the global arrays into the benchmark expression.
# Every evaluation adds `x` to `y` again, so `y` is no longer all 3.0 afterwards.
@btime sequential_add!($y, $x)

  7.100 ms (0 allocations: 0 bytes)


In [9]:
# `$y` and `$x` interpolate the global arrays into the benchmark expression.
# Every evaluation adds `x` to `y` again, so `y` keeps growing across evaluations.
@btime parallel_add!($y, $x)

  5.857 ms (79 allocations: 8.27 KiB)


# GPU computation

In [10]:
using CuArrays

In [11]:
# GPU counterparts of `x` and `y` (same length N, same fill values), built with
# `CuArrays.fill`. The `_d` suffix presumably stands for "device".
x_d = CuArrays.fill(1.0f0, N);
y_d = CuArrays.fill(2.0f0, N);

In [12]:
# Same in-place broadcast as in the CPU section, now applied to the CuArrays.
y_d .+= x_d
# `Array(y_d)` converts the result to a regular `Array` before comparing.
@test all(Array(y_d) .== 3.0f0)

┌ Info: Building the CUDAnative run-time library for your sm_61 device, this might take a while...
└ @ CUDAnative /home/alex/.julia/packages/CUDAnative/Lr0yj/src/compiler/rtlib.jl:173


[32m[1mTest Passed[22m[39m

In [13]:
"""
    add_broadcast!(y, x)

Perform the in-place broadcast `y .+= x` wrapped in `CuArrays.@sync` and return
`nothing`.
"""
function add_broadcast!(y, x)
    # NOTE(review): `CuArrays.@sync` presumably blocks until the GPU work has finished,
    # so that a benchmark of this function times the whole operation — confirm.
    CuArrays.@sync begin
        y .+= x
    end
    return nothing
end

# Interpolate the non-const globals with `$`, as the CPU benchmarks do, so the
# measurement covers the call itself rather than untyped global-variable access.
@btime add_broadcast!($y_d, $x_d)

  1.025 ms (52 allocations: 2.06 KiB)


# Writing a GPU kernel

In [14]:
using CUDAnative

"""
    gpu_add1!(y, x)

Add `x` to `y` element-wise, in place, by walking indices `1` through `length(y)` in a
single loop. Returns `nothing`.

The loop does not use any thread or block index, so the whole array is processed by
whichever single execution of this function runs it.
"""
function gpu_add1!(y, x)
    n = length(y)
    for k = 1:n
        # Bounds checks are skipped; `x` must be at least as long as `y`.
        @inbounds y[k] = y[k] + x[k]
    end
    return nothing
end

# Reset `y_d` to 2 so the check does not depend on earlier cells.
fill!(y_d, 2)
# Launch the kernel; no `threads` or `blocks` options are given here.
@cuda gpu_add1!(y_d, x_d)
@test all(Array(y_d) .== 3.0f0)

[32m[1mTest Passed[22m[39m

In [15]:
"""
    bench_gpu1!(y, x)

Launch `gpu_add1!(y, x)` with `@cuda` (no `threads`/`blocks` options) inside a
`CuArrays.@sync` block and return whatever that block evaluates to.
"""
function bench_gpu1!(y, x)
    result = CuArrays.@sync begin
        @cuda gpu_add1!(y, x)
    end
    return result
end

# Interpolate the non-const globals with `$`, as the CPU benchmarks do, so the
# measurement covers the call itself rather than untyped global-variable access.
@btime bench_gpu1!($y_d, $x_d)

  2.279 s (22 allocations: 720 bytes)


In [16]:
# Number of threads to use; passed as `threads=nt` to `@cuda` in the following cells.
nt = 1024

1024

In [17]:
"""
    gpu_add2!(y, x)

Kernel that adds `x` to `y` element-wise, in place. Each thread starts at its own
`threadIdx().x` and advances by `blockDim().x`, so the threads of one block together
cover indices `1:length(y)`. Returns `nothing`.

The block index is not used, so every block launched would visit the same indices.
"""
function gpu_add2!(y, x)
    # The data is indexed linearly, so only the `x` component of the thread index and
    # block size is needed.
    first_k = threadIdx().x
    step_k = blockDim().x
    n = length(y)
    for k = first_k:step_k:n
        @inbounds y[k] = y[k] + x[k]
    end
    return nothing
end

# Reset `y_d` to 2 so the check does not depend on earlier cells.
fill!(y_d, 2)
# Launch with `nt` threads; no `blocks` option is given.
@cuda threads=nt gpu_add2!(y_d, x_d)
@test all(Array(y_d) .== 3.0f0)

[32m[1mTest Passed[22m[39m

In [18]:
"""
    bench_gpu2!(y, x)

Launch `gpu_add2!(y, x)` with `@cuda threads=nt` inside a `CuArrays.@sync` block and
return whatever that block evaluates to. The thread count is read from the global `nt`.
"""
function bench_gpu2!(y, x)
    result = CuArrays.@sync begin
        @cuda threads=nt gpu_add2!(y, x)
    end
    return result
end

# Interpolate the non-const globals with `$`, as the CPU benchmarks do, so the
# measurement covers the call itself rather than untyped global-variable access.
@btime bench_gpu2!($y_d, $x_d)

  6.584 ms (46 allocations: 1.38 KiB)


In [19]:
"""
    gpu_add3!(y, x)

Kernel that adds `x` to `y` element-wise, in place, using a grid-stride loop: each
thread starts at its global (1-based) index across all blocks and advances by the total
number of threads in the grid. Returns `nothing`.
"""
function gpu_add3!(y, x)
    threads_per_block = blockDim().x
    # `blockIdx().x - 1` full blocks precede this one; add this thread's position in
    # its own block to get the 1-based global index.
    first_k = (blockIdx().x - 1) * threads_per_block + threadIdx().x
    # Total number of threads launched across the whole grid.
    step_k = threads_per_block * gridDim().x
    n = length(y)
    for k = first_k:step_k:n
        @inbounds y[k] = y[k] + x[k]
    end
    return nothing
end

# Number of blocks of `nt` threads needed to cover all N elements, rounded up.
numblocks = ceil(Int, N/nt)

# Reset `y_d` to 2 so the check does not depend on earlier cells.
fill!(y_d, 2)
@cuda threads=nt blocks=numblocks gpu_add3!(y_d, x_d)
@test all(Array(y_d) .== 3.0f0)

[32m[1mTest Passed[22m[39m

In [20]:
"""
    bench_gpu3!(y, x)

Launch `gpu_add3!(y, x)` with `nt` threads per block and enough blocks to cover
`length(y)` elements, inside a `CuArrays.@sync` block. Returns whatever that block
evaluates to. The thread count is read from the global `nt`.
"""
function bench_gpu3!(y, x)
    # Round up so a partially filled last block is still launched.
    blocks_needed = ceil(Int, length(y) / nt)
    result = CuArrays.@sync begin
        @cuda threads=nt blocks=blocks_needed gpu_add3!(y, x)
    end
    return result
end

# Interpolate the non-const globals with `$`, as the CPU benchmarks do, so the
# measurement covers the call itself rather than untyped global-variable access.
@btime bench_gpu3!($y_d, $x_d)

  4.172 ms (54 allocations: 1.55 KiB)


In [22]:
# Drop the references to the GPU arrays.
# NOTE(review): presumably so their memory can be reclaimed by the garbage collector.
x_d = nothing
y_d = nothing