<a href="https://colab.research.google.com/github/carlogalli/colab-gpu/blob/main/cuda_structures_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installation

In [1]:
# Installation cell
%%capture
%%shell
# Abort on the first failing command so a broken download or extraction
# is reported instead of being followed by a misleading 'Done'.
set -e

# Install Julia 1.7.2 into /usr/local only when no `julia` binary is on PATH.
# Both stdout and stderr of the lookup are discarded.
if ! command -v julia > /dev/null 2>&1
then
    wget -q 'https://julialang-s3.julialang.org/bin/linux/x64/1.7/julia-1.7.2-linux-x86_64.tar.gz' \
        -O /tmp/julia.tar.gz
    # --strip-components 1 drops the archive's top-level directory so that
    # its contents land directly under /usr/local.
    tar -x -f /tmp/julia.tar.gz -C /usr/local --strip-components 1
    rm /tmp/julia.tar.gz
fi

# Add the IJulia package and precompile.
julia -e 'using Pkg; pkg"add IJulia; precompile;"'
echo 'Done'

After you run the first cell (the cell directly above this text), go to Colab's menu bar, select **Edit**, and then select **Notebook settings** from the drop-down menu. Select *Julia 1.7* as the Runtime type. You can also select your preferred hardware acceleration (defaults to GPU).

In [1]:
# Show the Julia version of the running kernel; the value is displayed as the
# result of the cell (e.g. v"1.7.2").
VERSION

v"1.7.2"

In [None]:
using Pkg
# Install every package that is loaded by the `using` line below.
Pkg.add(["CUDA", "Random", "Printf", "BenchmarkTools", "Adapt"]);
# Set before `using CUDA` so the value is already in the environment when the
# package is loaded.
# NOTE(review): presumably makes CUDA.jl use the CUDA toolkit already installed
# on the machine instead of downloading its own artifacts; confirm against the
# CUDA.jl documentation for the installed version.
ENV["JULIA_CUDA_USE_BINARYBUILDER"] = false
using Random, CUDA, Printf, BenchmarkTools, Adapt;

"""
    print_gpu_properties()

Print a short report to standard output for every device yielded by
`CUDA.devices()`: name, compute capability, clock rate, copy-overlap and
kernel-timeout flags, multiprocessor count, maximum blocks per multiprocessor
and maximum threads per block. The last line of each report is a 1×4 row with
the three counts followed by their product.

Returns `nothing`.
"""
function print_gpu_properties()
    for (idx, dev) in enumerate(CUDA.devices())
        # Local shorthands: query one attribute of the current device, and
        # turn a positive flag into a human-readable word.
        query(attr) = CUDA.attribute(dev, attr)
        onoff(flag) = flag > 0 ? "enabled" : "disabled"

        println("*** General properties for device $idx ***")
        println("Device name: ", CUDA.name(dev))

        cc_major = query(CUDA.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR)
        cc_minor = query(CUDA.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR)
        println("Compute capabilities: ", cc_major, ".", cc_minor)

        println("Clock rate: ", query(CUDA.CU_DEVICE_ATTRIBUTE_CLOCK_RATE))

        println(
            "Device copy overlap: ",
            onoff(query(CUDA.CU_DEVICE_ATTRIBUTE_GPU_OVERLAP)),
        )
        println(
            "Kernel execution timeout: ",
            onoff(query(CUDA.CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT)),
        )

        mp_count = query(CUDA.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT)
        println("Number of multiprocessors: ", mp_count)
        blocks_per_mp = query(CUDA.CU_DEVICE_ATTRIBUTE_MAX_BLOCKS_PER_MULTIPROCESSOR)
        println("Max blocks per MP: ", blocks_per_mp)
        threads_per_block = query(CUDA.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK)
        println("Max threads per block: ", threads_per_block)

        # Product of the three limits, printed next to them as a single row.
        product = mp_count * blocks_per_mp * threads_per_block
        println([mp_count blocks_per_mp threads_per_block product])
    end
end

print_gpu_properties()
# with ENV["JULIA_CUDA_USE_BINARYBUILDER"] = false it takes 117.357304 seconds (35.94 M allocations: 2.301 GiB, 1.29% gc time, 11.71% compilation time)
# without that ENV setting it takes 124.465413 seconds (39.74 M allocations: 2.537 GiB, 1.27% gc time, 12.65% compilation time)

In [15]:
"""
    test1(vin, vout)

CUDA kernel that stores `vin[i] * 10` in `vout[i]`, where `i` is the global
index of the calling thread. Meant to be launched with `@cuda`.

# Arguments
- `vin`: indexable input array readable from the device.
- `vout`: indexable output array writable from the device.

Threads whose index lies beyond the end of either array do nothing, so the
kernel may be launched with more threads than elements and with several blocks.

Returns `nothing`.
"""
function test1(vin, vout)
    # Global 1-based thread index; for a single-block launch this is equal to
    # threadIdx().x, so existing launches behave as before.
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    # Skip surplus threads instead of indexing out of bounds.
    if i <= length(vin) && i <= length(vout)
        vout[i] = vin[i] * 10
    end
    return nothing
end

test1 (generic function with 2 methods)

In [None]:
# Run `test1` with one thread per element and show input and output side by side.
n = 10
xin = CuArray(rand(n))            # random Float64 values drawn on the host, then copied to the GPU
xout = CUDA.zeros(Float64, n)     # zero-filled Float64 output buffer on the GPU
@cuda threads=n test1(xin, xout)
hcat(xin, xout)

In [29]:
"""
    test2(s)

CUDA kernel that stores `s.vin[i] * 10` in `s.vout[i]`, where `i` is the global
index of the calling thread. Meant to be launched with `@cuda`.

# Arguments
- `s`: any object with indexable fields `vin` (input) and `vout` (output).

Threads whose index lies beyond the end of either array do nothing, so the
kernel may be launched with more threads than elements and with several blocks.

Returns `nothing`.
"""
function test2(s)
    # Global 1-based thread index; for a single-block launch this is equal to
    # threadIdx().x, so existing launches behave as before.
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    # Skip surplus threads instead of indexing out of bounds.
    if i <= length(s.vin) && i <= length(s.vout)
        s.vout[i] = s.vin[i] * 10
    end
    return nothing
end

test2 (generic function with 1 method)

In [47]:
"""
    Model3{T, S}

Immutable container bundling a value `n` of type `S` with two fields `vin` and
`vout` that share the same type `T`.
"""
struct Model3{T, S}
    n::S
    vin::T
    vout::T
end

import Adapt

# Adapt.jl hook: adapt every field with the target `to` and rebuild the struct
# from the adapted fields.
# NOTE(review): presumably this is what allows a `Model3` holding `CuArray`s to
# be passed as a `@cuda` kernel argument; confirm against the CUDA.jl docs.
function Adapt.adapt_structure(to, model::Model3)
    return Model3(
        Adapt.adapt(to, model.n),
        Adapt.adapt(to, model.vin),
        Adapt.adapt(to, model.vout),
    )
end

In [50]:
# Build a Model3 around a random input vector and a zeroed output vector, both
# stored on the GPU, and show the two vectors side by side.
n = 10
yin = CuArray(rand(n))            # random Float64 values drawn on the host, then copied to the GPU
yout = CUDA.zeros(Float64, n)     # zero-filled Float64 output buffer on the GPU
m3 = Model3(n, yin, yout)
hcat(m3.vin, m3.vout)

10×2 CuArray{Float64, 2, CUDA.Mem.DeviceBuffer}:
 0.727961  0.0
 0.672124  0.0
 0.454851  0.0
 0.154048  0.0
 0.217869  0.0
 0.625703  0.0
 0.807248  0.0
 0.16296   0.0
 0.669908  0.0
 0.612245  0.0

In [51]:
# Launch `test2` on the struct itself, using `m3.n` threads, then show the input
# next to the freshly written output.
@cuda threads=m3.n test2(m3)
hcat(m3.vin, m3.vout)

10×2 CuArray{Float64, 2, CUDA.Mem.DeviceBuffer}:
 0.727961  7.27961
 0.672124  6.72124
 0.454851  4.54851
 0.154048  1.54048
 0.217869  2.17869
 0.625703  6.25703
 0.807248  8.07248
 0.16296   1.6296
 0.669908  6.69908
 0.612245  6.12245

https://discourse.julialang.org/t/arrays-of-arrays-and-arrays-of-structures-in-cuda-kernels-cause-random-errors/69739