<a href="https://colab.research.google.com/github/carlogalli/colab-gpu/blob/main/cuda_structures_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installation

In [None]:
# Installation cell
%%capture
%%shell
# Download and unpack Julia 1.7.2 into /usr/local, but only when no `julia`
# executable is already on PATH. Only the exit status of the lookup matters,
# so both its stdout and stderr are discarded.
if ! command -v julia > /dev/null 2>&1
then
    wget -q 'https://julialang-s3.julialang.org/bin/linux/x64/1.7/julia-1.7.2-linux-x86_64.tar.gz' \
        -O /tmp/julia.tar.gz
    tar -x -f /tmp/julia.tar.gz -C /usr/local --strip-components 1
    rm /tmp/julia.tar.gz
fi
# Add and precompile the IJulia package.
julia -e 'using Pkg; pkg"add IJulia; precompile;"'
echo 'Done'

After you run the first cell (the cell directly above this text), go to Colab's menu bar, select **Edit**, and then select **Notebook settings** from the drop-down. Select *Julia 1.7* as the Runtime type. You can also select your preferred hardware acceleration (defaults to GPU).

In [None]:
# Show the version of the running Julia kernel (the cell evaluates to `VERSION`).
VERSION

v"1.7.2"

In [None]:
# Install the packages used by this notebook, then load them.
using Pkg
Pkg.add(["CUDA", "Random", "Printf", "BenchmarkTools", "Adapt"]);
# Set before `using CUDA` on the next line.
# NOTE(review): presumably this makes CUDA.jl use a locally installed CUDA toolkit
# instead of downloading one — confirm against the CUDA.jl version in use.
ENV["JULIA_CUDA_USE_BINARYBUILDER"] = false
using Random, CUDA, Printf, BenchmarkTools, Adapt;

"""
    print_gpu_properties()

Print a short report to standard output for every device yielded by `CUDA.devices()`:
name, compute capability, clock rate, whether copy overlap and the kernel execution
timeout are enabled, the multiprocessor count, the maximum blocks per multiprocessor
and the maximum threads per block. The report for each device ends with a row holding
the last three numbers followed by their product.
"""
function print_gpu_properties()
    for (idx, dev) in enumerate(CUDA.devices())
        # Shorthand for querying one attribute of the current device.
        query(code) = CUDA.attribute(dev, code)

        println("*** General properties for device $idx ***")
        println("Device name: $(CUDA.name(dev))")

        cc_major = query(CUDA.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR)
        cc_minor = query(CUDA.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR)
        println("Compute capabilities: $cc_major.$cc_minor")

        println("Clock rate: $(query(CUDA.CU_DEVICE_ATTRIBUTE_CLOCK_RATE))")

        overlap = query(CUDA.CU_DEVICE_ATTRIBUTE_GPU_OVERLAP)
        println("Device copy overlap: ", overlap > 0 ? "enabled" : "disabled")

        timeout = query(CUDA.CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT)
        println("Kernel execution timeout: ", timeout > 0 ? "enabled" : "disabled")

        sm_count = query(CUDA.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT)
        println("Number of multiprocessors: $sm_count")
        blocks_per_sm = query(CUDA.CU_DEVICE_ATTRIBUTE_MAX_BLOCKS_PER_MULTIPROCESSOR)
        println("Max blocks per MP: $blocks_per_sm")
        threads_per_block = query(CUDA.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK)
        println("Max threads per block: $threads_per_block")

        # Summary row: the three limits and their product.
        println([sm_count blocks_per_sm threads_per_block sm_count*blocks_per_sm*threads_per_block])
    end
end
print_gpu_properties()
# With ENV["JULIA_CUDA_USE_BINARYBUILDER"] = false, it takes 117.357304 seconds (35.94 M allocations: 2.301 GiB, 1.29% gc time, 11.71% compilation time)
# Without that ENV setting, it takes 124.465413 seconds (39.74 M allocations: 2.537 GiB, 1.27% gc time, 12.65% compilation time)

In [None]:
"""
    test1(vin, vout)

CUDA kernel: each thread writes `vin[i] * 10` into `vout[i]`, where `i` is the
thread's global 1-based index. Launch with `@cuda`.

# Arguments
- `vin`: device array that is read.
- `vout`: device array that is written in place.

Returns `nothing`.
"""
function test1(vin, vout)
    # Global index across all blocks, so a launch that spans several blocks touches
    # distinct elements; with a single block this equals `threadIdx().x`.
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    # Threads beyond the end of either array (launch larger than the data) do nothing.
    if i <= length(vin) && i <= length(vout)
        vout[i] = vin[i]*10
    end
    return nothing
end

test1 (generic function with 2 methods)

In [None]:
# Upload a random input and a zeroed output of length n, run `test1` with one
# thread per element, then show input and output side by side.
n = 10
xin = CuArray(rand(n))
xout = CuArray(zeros(n))
@cuda threads=n test1(xin,xout)
[xin xout]

In [None]:
"""
    test2(s)

CUDA kernel: each thread writes `s.vin[i] * 10` into `s.vout[i]`, where `i` is the
thread's global 1-based index. Launch with `@cuda`.

# Arguments
- `s`: an object with array fields `vin` (read) and `vout` (written in place).

Returns `nothing`.
"""
function test2(s)
    # Global index across all blocks, so a launch that spans several blocks touches
    # distinct elements; with a single block this equals `threadIdx().x`.
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    # Threads beyond the end of either array (launch larger than the data) do nothing.
    if i <= length(s.vin) && i <= length(s.vout)
        s.vout[i] = s.vin[i]*10
    end
    return nothing
end

test2 (generic function with 1 method)

In [None]:
# Container bundling a size with an input and an output array.
# `vin` and `vout` share the type parameter T; `n` has its own type parameter S.
struct Model3{T, S}
    n::S      # stored size (used as the thread count when launching the kernel)
    vin::T    # input array
    vout::T   # output array
end

import Adapt
# Adapt a `Model3` field by field and rebuild it from the adapted values.
function Adapt.adapt_structure(to, model::Model3)
    return Model3(
        Adapt.adapt_structure(to, model.n),
        Adapt.adapt_structure(to, model.vin),
        Adapt.adapt_structure(to, model.vout),
    )
end

In [None]:
# Build a Model3 from a random input and a zeroed output of length n (both uploaded
# with `CuArray`), then show the two arrays side by side before any kernel has run.
n = 10
yin = CuArray(rand(n))
yout = CuArray(zeros(n))
m3 = Model3(n, yin, yout)
[m3.vin m3.vout]

10×2 CuArray{Float64, 2, CUDA.Mem.DeviceBuffer}:
 0.727961  0.0
 0.672124  0.0
 0.454851  0.0
 0.154048  0.0
 0.217869  0.0
 0.625703  0.0
 0.807248  0.0
 0.16296   0.0
 0.669908  0.0
 0.612245  0.0

In [None]:
# Run `test2` on the struct with `m3.n` threads, then show input and output side by side.
@cuda threads=m3.n test2(m3)
[m3.vin m3.vout]

10×2 CuArray{Float64, 2, CUDA.Mem.DeviceBuffer}:
 0.727961  7.27961
 0.672124  6.72124
 0.454851  4.54851
 0.154048  1.54048
 0.217869  2.17869
 0.625703  6.25703
 0.807248  8.07248
 0.16296   1.6296
 0.669908  6.69908
 0.612245  6.12245

https://discourse.julialang.org/t/arrays-of-arrays-and-arrays-of-structures-in-cuda-kernels-cause-random-errors/69739

# Test structures with CUDA

In [None]:
"""
    TestModel1(n, vin, vout)

Container bundling a size `n` with an input array `vin` and an output array `vout`.
`vin` and `vout` share the type parameter `T`; `n` has its own type parameter `S`.

The struct is named `TestModel1` because that is the name the rest of the notebook
uses, both in the `Adapt.adapt_structure` method and when constructing the model.
"""
struct TestModel1{T, S}
    n::S
    vin::T
    vout::T
end

# Backward-compatible alias for the name this struct was previously declared under.
const TestModel2 = TestModel1
# Adapt a `TestModel1` field by field and rebuild it from the adapted values.
function Adapt.adapt_structure(to, model::TestModel1)
    return TestModel1(
        Adapt.adapt_structure(to, model.n),
        Adapt.adapt_structure(to, model.vin),
        Adapt.adapt_structure(to, model.vout),
    )
end

In [None]:
"""
    TestModel(n, vin, vout)

Container bundling a size `n` with an input array `vin` and an output array `vout`.

Each field has its own type parameter, so the field types are concrete for any given
instance. Fields declared without a type are `Any`, which prevents the struct from
ever being an isbits value; CUDA.jl requires kernel arguments to be isbits after
adaptation. The constructor call `TestModel(n, vin, vout)` is unchanged and still
accepts arguments of any types.
"""
struct TestModel{S, TI, TO}
    n::S
    vin::TI
    vout::TO
end

import Adapt
# Adapt a `TestModel` field by field and rebuild it from the adapted values.
function Adapt.adapt_structure(to, model::TestModel)
    return TestModel(
        Adapt.adapt_structure(to, model.n),
        Adapt.adapt_structure(to, model.vin),
        Adapt.adapt_structure(to, model.vout),
    )
end

In [None]:
"""
    test2(s)

CUDA kernel: each thread writes `s.vin[i] * 10` into `s.vout[i]`, where `i` is the
thread's global 1-based index. Launch with `@cuda`.

# Arguments
- `s`: an object with array fields `vin` (read) and `vout` (written in place).

Returns `nothing`.
"""
function test2(s)
    # Global index across all blocks, so a launch that spans several blocks touches
    # distinct elements; with a single block this equals `threadIdx().x`.
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    # Threads beyond the end of either array (launch larger than the data) do nothing.
    if i <= length(s.vin) && i <= length(s.vout)
        s.vout[i] = s.vin[i]*10
    end
    return nothing
end

"""
    build_model(n)

Return a `TestModel` of size `n` whose input is `rand(n)` and whose output is
`zeros(n)`, both uploaded with `CuArray`.
"""
function build_model(n)
    host_in = rand(n)
    host_out = zeros(n)
    return TestModel(n, CuArray(host_in), CuArray(host_out))
end

build_model (generic function with 1 method)

In [None]:
# Build a TestModel1 from a random input and a zeroed output of length n.
n = 10
yin = CuArray(rand(n))
yout = CuArray(zeros(n))
m1 = TestModel1(n, yin, yout)

# Print input and output side by side, with display truncation turned off.
show(IOContext(stdout, :limit=>false), "text/plain", [m1.vin m1.vout])

# Run the kernel on the struct with one thread per element.
@cuda threads=m1.n test2(m1)

# Print the arrays again after the kernel launch.
println("\n"); show(IOContext(stdout, :limit=>false), "text/plain", [m1.vin m1.vout])

10×2 CuArray{Float64, 2, CUDA.Mem.DeviceBuffer}:
 0.746335   0.0
 0.0776547  0.0
 0.0546077  0.0
 0.567868   0.0
 0.141722   0.0
 0.456572   0.0
 0.115121   0.0
 0.777527   0.0
 0.600982   0.0
 0.935452   0.0

10×2 CuArray{Float64, 2, CUDA.Mem.DeviceBuffer}:
 0.746335   7.46335
 0.0776547  0.776547
 0.0546077  0.546077
 0.567868   5.67868
 0.141722   1.41722
 0.456572   4.56572
 0.115121   1.15121
 0.777527   7.77527
 0.600982   6.00982
 0.935452   9.35452

In [None]:
# Same experiment as above, with the model produced by `build_model` (a `TestModel`).
m2 = build_model(n)

# Print input and output side by side, with display truncation turned off.
show(IOContext(stdout, :limit=>false), "text/plain", [m2.vin m2.vout])

# Run the kernel on the struct with one thread per element.
@cuda threads=m2.n test2(m2)

# Print the arrays again after the kernel launch.
println("\n"); show(IOContext(stdout, :limit=>false), "text/plain", [m2.vin m2.vout])

In [None]:
"""
    f1(x_cpu)

Convert `x_cpu` to a `CuArray` via `convert` and discard the result; returns `nothing`.
Used as a benchmark target.
"""
f1(x_cpu) = (convert(CuArray, x_cpu); nothing)

"""
    f2(x_cpu)

Build a `CuArray` from `x_cpu` via the constructor and discard the result; returns
`nothing`. Used as a benchmark target.
"""
f2(x_cpu) = (CuArray(x_cpu); nothing)
# Benchmark both conversion routes on the same 10^6-element host vector.
# `$x` interpolates the global into the benchmark expression.
x = rand(10^6)
@btime f1($x)
@btime f2($x)

  1.520 ms (6 allocations: 192 bytes)
  1.523 ms (6 allocations: 192 bytes)


In [None]:
"""
    f3(x, xind)

Overwrite `xind` element-wise with the result of `x > 0.5` (converted to the element
type of `xind`). Returns `nothing`.
"""
function f3(x,xind)
    @. xind = x > 0.5
    return nothing
end

"""
    f4(x, xind, y)

Overwrite `xind` element-wise with `x > 0.5`, then overwrite `y[k]` with the product
`xind[k+3] * xind[k+2] * xind[k+1] * xind[k]`, i.e. the product over every window of
four consecutive indicators. `y` must have `length(xind) - 3` elements.
Returns `nothing`.
"""
function f4(x, xind, y)
    xind .= x .> 0.5
    # `@views` turns the range slices into views, so the four shifted windows are
    # read in place instead of each being copied into a temporary array first.
    @views y .= xind[4:end] .* xind[3:end-1] .* xind[2:end-2] .* xind[1:end-3]
    return nothing
end

"""
    f5(x, xind, y)

Overwrite `xind` element-wise with `x > 0.5`, then overwrite `y[k]` with the maximum
of `xind[k+3]`, `xind[k+2]`, `xind[k+1]` and `xind[k]`, i.e. the maximum over every
window of four consecutive indicators. `y` must have `length(xind) - 3` elements.
Returns `nothing`.
"""
function f5(x, xind, y)
    xind .= x .> 0.5
    # `@views` turns the range slices into views, so the four shifted windows are
    # read in place instead of each being copied into a temporary array first.
    @views y .= max.(xind[4:end], xind[3:end-1], xind[2:end-2], xind[1:end-3])
    return nothing
end

"""
    f3_cpu(x, xind)

Explicit-loop counterpart of `f3`: for every index `i` of `x`, store the result of
`x[i] > 0.5` in `xind[i]`. Returns `nothing`.
"""
function f3_cpu(x,xind)
    # `eachindex` iterates the indices `x` actually has, rather than assuming they
    # start at 1.
    for i in eachindex(x)
        xind[i]=x[i]>0.5
    end
    return nothing
end

f3_cpu (generic function with 1 method)

In [None]:
n=10^6
a=CuArray(rand(n))
aind=CuArray(zeros(n))
b=CuArray(zeros(n-3))

# GPU work is launched asynchronously, so `CUDA.@sync` blocks until it has finished;
# without it `@time` would mostly measure the launch. These are first calls, so the
# reported times also include compilation.
@time CUDA.@sync f3(a,aind)
@time CUDA.@sync f4(a,aind,b)
@time CUDA.@sync f5(a,aind,b)

  0.038495 seconds (159.34 k allocations: 8.463 MiB, 99.71% compilation time)
  0.090074 seconds (267.92 k allocations: 14.347 MiB, 99.84% compilation time)
  0.091632 seconds (267.92 k allocations: 14.347 MiB, 99.84% compilation time)


In [None]:
# Synchronize inside the benchmark so the measurement covers the GPU computation
# itself and not only the asynchronous launch.
@btime CUDA.@sync f3($a, $aind)

  3.298 μs (11 allocations: 576 bytes)


In [None]:
# Copy the device arrays back to the host, keeping their element type, so the CPU
# baseline runs on the same data type as the GPU benchmark.
a_cpu=Array(a)
aind_cpu=Array(aind)

@btime f3_cpu($a_cpu, $aind_cpu)

  633.252 μs (0 allocations: 0 bytes)
