<a href="https://colab.research.google.com/github/carlogalli/colab-gpu/blob/main/jmp_v4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installation

In [1]:
# Installation cell
%%capture
%%shell
if ! command -v julia 3>&1 > /dev/null
then
    wget -q 'https://julialang-s3.julialang.org/bin/linux/x64/1.7/julia-1.7.2-linux-x86_64.tar.gz' \
        -O /tmp/julia.tar.gz
    tar -x -f /tmp/julia.tar.gz -C /usr/local --strip-components 1
    rm /tmp/julia.tar.gz
fi
julia -e 'using Pkg; pkg"add IJulia; precompile;"'
echo 'Done'

After you run the first cell (the cell directly above this text), go to Colab's menu bar and select **Edit** and select **Notebook settings** from the drop down. Select *Julia 1.7* in Runtime type. You can also select your prefered harwdware acceleration (defaults to GPU).

In [1]:
VERSION   # print Julia version

v"1.7.2"

In [None]:
using Pkg
Pkg.add(["CUDA", "Random", "Distributions", "Printf", "BenchmarkTools", "PyPlot", "PrettyTables"]);
#ENV["JULIA_CUDA_USE_BINARYBUILDER"] = false
using Random, Distributions, CUDA, Printf, BenchmarkTools, PyPlot, PrettyTables;

function print_gpu_properties()

    for (i,device) in enumerate(CUDA.devices())
        println("*** General properties for device $i ***")
        name = CUDA.name(device)
        println("Device name: $name")
        major = CUDA.attribute(device, CUDA.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR)
        minor = CUDA.attribute(device, CUDA.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR)
        println("Compute capabilities: $major.$minor")
        clock_rate = CUDA.attribute(device, CUDA.CU_DEVICE_ATTRIBUTE_CLOCK_RATE)
        println("Clock rate: $clock_rate")
        device_overlap = CUDA.attribute(device, CUDA.CU_DEVICE_ATTRIBUTE_GPU_OVERLAP)
        print("Device copy overlap: ")
        println(device_overlap > 0 ? "enabled" : "disabled")
        kernel_exec_timeout = CUDA.attribute(device, CUDA.CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT)
        print("Kernel execution timeout: ")
        println(kernel_exec_timeout > 0 ? "enabled" : "disabled")
        # a = CUDA.attribute(device, CUDA.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X)
        # d = CUDA.attribute(device, CUDA.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X)       
        a = CUDA.attribute(device, CUDA.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT)
        println("Number of multiprocessors: $a")
        b = CUDA.attribute(device, CUDA.CU_DEVICE_ATTRIBUTE_MAX_BLOCKS_PER_MULTIPROCESSOR)
        println("Max blocks per MP: $b")
        c = CUDA.attribute(device, CUDA.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK)        
        println("Max threads per block: $c")
        
        println([a b c a*b*c])
    end
end
    
@time print_gpu_properties()
# with the falseENV option it takes 117.357304 seconds (35.94 M allocations: 2.301 GiB, 1.29% gc time, 11.71% compilation time)
# without the falseENV option it takes  124.465413 seconds (39.74 M allocations: 2.537 GiB, 1.27% gc time, 12.65% compilation time)


#Pkg.add("CpuId")
#using CpuId
#cpuinfo()

# JMP functions

plot_style (generic function with 1 method)

# Execute code

In [23]:
@time main_gpu(Model_GPU(nB=51, ny=21), max_iter=100, print_iter=5)

iter 5, error 3.83e-01
iter 10, error 1.41e-01
iter 15, error 6.09e-02
iter 20, error 2.27e-02
iter 25, error 8.68e-03
iter 30, error 3.36e-03
iter 35, error 1.31e-03
iter 40, error 5.16e-04
iter 45, error 2.03e-04
iter 50, error 8.01e-05
iter 55, error 3.19e-05
iter 60, error 1.29e-05
vfi ends with 62 iters, error 9.06e-06
  3.510334 seconds (2.53 M allocations: 117.772 MiB, 0.86% gc time, 66.95% compilation time)


In [None]:
@time main_gpu(Model_GPU(nB=51, ny=21, T_sim=10^6, T_discard_sim=100), max_iter=100, print_iter=0, simulate=true)

In [None]:
@time main_gpu_timed(Model_GPU(nB=51, ny=21, T_sim=10^6, T_discard_sim=100), max_iter=100, print_iter=0, simulate=true)

# Debug various things

Let's try to make the simulation faster.

In [None]:
# Find best way to take a mean. Mean > vector multiplication and sum  >> indexing!

function f4(A)
    inds = @. A>0.3
    sum(A.*inds)/sum(inds)
    return
end

function f5(A)  # This gives the wrong result!
    inds = @. A>0.3
    return mean(A.*inds)
end

function f6(A::Vector)
    inds = @. A>0.3
    mean(A[inds])
    return
end

x = rand(10^6)
y = CuArray(x)

f4(y), f6(x)

(0.20193858799072076, 0.6501436064712264, nothing, 0.4552552587081986, nothing)

In [None]:
@btime f4($y)
@btime f6($x)

  213.583 μs (112 allocations: 5.66 KiB)
  2.254 ms (6 allocations: 5.47 MiB)


In [None]:
function f1(A::Array)
    return std(A)
end

function f2(A::CuArray)
    mu = mean(A)
    return sqrt(sum(@. (A-mu)^2)/(length(A)-1))
end

f1(x)-f2(y)

-5.551115123125783e-17

In [None]:
@btime f1(x)
@btime f2(y)

  444.599 μs (1 allocation: 16 bytes)
  193.677 μs (99 allocations: 4.95 KiB)


0.2887386363105266

In [None]:
# This shows that indexing CuArrays is tragically slow and should be avoided at all costs
function test(X)
    t = 0.
    for i in 1:lastindex(X)
      t += X[i]
    end
    return t
end

A = rand(Float32, 10^6)
B = CuArray(A)
nothing

In [None]:
@btime test($A)
@btime test($B)

  1.259 ms (1 allocation: 16 bytes)
  10.105 s (3000001 allocations: 457.76 MiB)


499931.26923942566

In [None]:
# This shows that different methods to copy a CuArray into a new Array are identical
function test1(X::CuArray)
    Y = Vector{Float32}(X)
    return
end

function test2(X::CuArray)
    Y = convert(Vector{Float32}, X)
    return
end

function test3(X::CuArray)
    Y = Vector{Float32}(undef, length(X))
    copyto!(Y, X)
    return
end

A = CuArray(rand(Float32, 10^6))

@btime test1($A)
@btime test2($A)
@btime test3($A)

  3.356 ms (5 allocations: 11.44 MiB)
  3.333 ms (5 allocations: 11.44 MiB)
  3.359 ms (5 allocations: 11.44 MiB)


In [None]:
A = CuArray(rand(Float32, 10^6))
B = CuArray(rand(Float32, 10^6))
tup_gpu = A,B
typeof(tup_gpu)
isa(tup_gpu, Tuple{CuArray, CuArray})
tup_cpu = convert(Tuple{Array, Array}, tup_gpu)
[tup_cpu[1] tup_cpu[2]]

C, D = convert(Tuple{Array, Array}, tup_gpu)

typeof(tup_gpu)
isa(tup_gpu, NTuple{2, CuArray})

In [None]:
CUDA.allowscalar(false)

function cor_gpu(a, b, c)
    is = a .> 0.5
    return cov(b.*is, c.*is)/(std(b.*is)*std(c.*is))
end

function test_cpu(a, b, c)
    is = a .> 0.5
    T=sum(is)
    #return cor(b[is], c[is]), cor(b.*is, c.*is)
    return mean(b.*is), sum(b.*is)/T, mean(b[is])
end
function test_gpu(a, b, c)
    is = a .> 0.5
    T=sum(is)
    #return cor(b[is], c[is]), cor(b.*is, c.*is)
    return mean(b.*is), sum(b.*is)/T, mean(b[is])
end

test_gpu (generic function with 1 method)

In [None]:
x, y, z = [rand(100) for i in 1:3]
X=CuArray(x)
Y=CuArray(y)
Z=CuArray(z);

In [None]:
#test_cpu(x,y,z), test_gpu(X, Y, Z)
test_cpu(x,y,z), test_gpu(X,Y,Z)

((0.29944589350082085, 0.5545294324089275, 0.5545294324089275), (0.29944589350082085, 0.5545294324089275, 0.5545294324089275))